diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 000000000000..033b1cbfe576
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,88 @@
+# When making changes, verify the output of:
+#   clang-tidy -list-checks
+---
+Checks: "-*,\
+  bugprone-argument-comment,\
+  bugprone-dangling-handle,\
+  bugprone-fold-init-type,\
+  bugprone-forward-declaration-namespace,\
+  bugprone-forwarding-reference-overload,\
+  bugprone-shadow,\
+  bugprone-sizeof-*,\
+  bugprone-string-constructor,\
+  bugprone-undefined-memory-manipulation,\
+  bugprone-unused-return-value,\
+  bugprone-use-after-move,\
+  cert-env33-c,\
+  cert-err58-cpp,\
+  cert-msc30-c,\
+  cert-msc50-cpp,\
+  clang-analyzer-core.NullDereference,\
+  clang-analyzer-core.StackAddressEscape,\
+  clang-analyzer-deadcode.DeadStores,\
+  clang-diagnostic-*,\
+  -clang-diagnostic-missing-designated-field-initializers,\
+  concurrency-mt-unsafe,\
+  cppcoreguidelines-avoid-non-const-global-variables,\
+  cppcoreguidelines-missing-std-forward,\
+  cppcoreguidelines-pro-type-member-init,\
+  cppcoreguidelines-special-member-functions,\
+  cppcoreguidelines-virtual-class-destructor,\
+  google-build-using-namespace,\
+  google-explicit-constructor,\
+  google-readability-avoid-underscore-in-googletest-name,\
+  misc-definitions-in-headers,\
+  misc-redundant-expression,\
+  modernize-make-shared,\
+  modernize-use-emplace,\
+  modernize-use-noexcept,\
+  modernize-use-override,\
+  modernize-use-using,\
+  performance-faster-string-find,\
+  performance-for-range-copy,\
+  performance-implicit-conversion-in-loop,\
+  performance-inefficient-algorithm,\
+  performance-inefficient-string-concatenation,\
+  performance-inefficient-vector-operation,\
+  performance-move-const-arg,\
+  performance-move-constructor-init,\
+  performance-no-automatic-move,\
+  performance-no-int-to-ptr,\
+  performance-noexcept-move-constructor,\
+  performance-noexcept-swap,\
+  performance-trivially-destructible,\
+  performance-type-promotion-in-math-fn,\
+  performance-unnecessary-copy-initialization,\
+  performance-unnecessary-value-param,\
+  readability-braces-around-statements,\
+  readability-duplicate-include,\
+  readability-isolate-declaration,\
+  readability-operators-representation,\
+  readability-redundant-string-init"
+
+WarningsAsErrors: "bugprone-use-after-move"
+
+CheckOptions:
+- key: bugprone-easily-swappable-parameters.MinimumLength
+  value: 4
+- key: cppcoreguidelines-avoid-non-const-global-variables.AllowThreadLocal
+  value: true
+- key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor
+  value: true
+- key: cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove
+  value: true
+- key: modernize-use-using.IgnoreExternC
+  value: true
+- key: performance-move-const-arg.CheckTriviallyCopyableMove
+  value: false
+- key: performance-unnecessary-value-param.AllowedTypes
+  value: '[Pp]ointer$;[Pp]tr$;[Rr]ef(erence)?$'
+- key: performance-unnecessary-copy-initialization.AllowedTypes
+  value: '[Pp]ointer$;[Pp]tr$;[Rr]ef(erence)?$'
+- key: readability-operators-representation.BinaryOperators
+  value: '&&;&=;&;|;~;!;!=;||;|=;^;^='
+- key: readability-redundant-string-init.StringNames
+  value: '::std::basic_string'
+- key: readability-named-parameter.InsertPlainNamesInForwardDecls
+  value: true
+...
diff --git a/.github/actions/build-folly/action.yml b/.github/actions/build-folly/action.yml
index 70229199958b..84f99de18d25 100644
--- a/.github/actions/build-folly/action.yml
+++ b/.github/actions/build-folly/action.yml
@@ -1,7 +1,17 @@
 name: build-folly
+description: Build folly and dependencies (skipped if cache hit)
+inputs:
+  cache-hit:
+    description: Whether the folly cache was hit
+    required: true
 runs:
   using: composite
   steps:
   - name: Build folly and dependencies
+    if: ${{ inputs.cache-hit != 'true' }}
     run: make build_folly
     shell: bash
+  - name: Skip folly build (using cached version)
+    if: ${{ inputs.cache-hit == 'true' }}
+    run: echo "Folly build skipped - using cached version"
+    shell: bash
diff --git a/.github/actions/cache-folly/action.yml b/.github/actions/cache-folly/action.yml
new file mode 100644
index 000000000000..f54a5a9a5a2e
--- /dev/null
+++ b/.github/actions/cache-folly/action.yml
@@ -0,0 +1,33 @@
+name: cache-folly
+description: Cache folly build to speed up CI
+outputs:
+  cache-hit:
+    description: Whether the cache was hit
+    value: ${{ steps.cache-folly-build.outputs.cache-hit }}
+runs:
+  using: composite
+  steps:
+  - name: Extract FOLLY_MK_HASH
+    id: extract-folly-hash
+    shell: bash
+    run: |
+      FOLLY_MK_HASH=$(md5sum folly.mk | cut -d' ' -f1)
+      echo "hash=$FOLLY_MK_HASH" >> $GITHUB_OUTPUT
+  - name: Extract FOLLY_INSTALL_DIR
+    id: extract-folly-install-dir
+    shell: bash
+    run: |
+      FOLLY_INSTALL_DIR=$(cd third-party/folly && python3 build/fbcode_builder/getdeps.py show-inst-dir)
+      echo "dir=$(echo $FOLLY_INSTALL_DIR | sed 's|installed/folly|installed|')" >> $GITHUB_OUTPUT
+  - name: Cache folly build
+    id: cache-folly-build
+    uses: actions/cache@v4
+    with:
+      # Cache the folly build directory
+      path: ${{ steps.extract-folly-install-dir.outputs.dir }}
+      # Key is based on:
+      # - OS and architecture
+      # - The docker image, which may not always be specified/known
+      # - Hash of folly.mk, which includes the folly repository commit hash
+      # NOTE: this is still only intended for DEBUG folly builds
+      key: folly-build-${{ runner.os }}-${{ runner.arch }}-${{ github.job_container.image }}-${{ steps.extract-folly-hash.outputs.hash }}
diff --git a/.github/actions/cache-getdeps-downloads/action.yml b/.github/actions/cache-getdeps-downloads/action.yml
new file mode 100644
index 000000000000..ca871bf1c8cd
--- /dev/null
+++ b/.github/actions/cache-getdeps-downloads/action.yml
@@ -0,0 +1,21 @@
+name: cache-getdeps-downloads
+description: Cache getdeps downloads to avoid unreliable mirrors and speed up builds
+outputs:
+  cache-hit:
+    description: Whether the cache was hit
+    value: ${{ steps.cache-downloads.outputs.cache-hit }}
+runs:
+  using: composite
+  steps:
+  - name: Cache getdeps downloads
+    id: cache-downloads
+    uses: actions/cache@v4
+    with:
+      # Use a fixed path that we control - folly.mk will sync with getdeps downloads dir
+      path: /tmp/rocksdb-getdeps-cache
+      # Use a rolling cache key - the cache accumulates downloads over time
+      # The key includes a weekly timestamp to ensure periodic refresh
+      key: getdeps-downloads-${{ runner.os }}-${{ runner.arch }}-week-${{ github.run_id }}
+      restore-keys: |
+        getdeps-downloads-${{ runner.os }}-${{ runner.arch }}-week-
+        getdeps-downloads-${{ runner.os }}-${{ runner.arch }}-
diff --git a/.github/actions/install-maven/action.yml b/.github/actions/install-maven/action.yml
index 69a925272ac1..815ec751f2de 100644
--- a/.github/actions/install-maven/action.yml
+++ b/.github/actions/install-maven/action.yml
@@ -4,8 +4,8 @@ runs:
   steps:
   - name: Install Maven
     run: |
-      wget --no-check-certificate https://dlcdn.apache.org/maven/maven-3/3.9.6/binaries/apache-maven-3.9.6-bin.tar.gz
-      tar zxf apache-maven-3.9.6-bin.tar.gz
-      echo "export M2_HOME=$(pwd)/apache-maven-3.9.6" >> $GITHUB_ENV
-      echo "$(pwd)/apache-maven-3.9.6/bin" >> $GITHUB_PATH
+      wget --no-check-certificate https://archive.apache.org/dist/maven/maven-3/3.9.11/binaries/apache-maven-3.9.11-bin.tar.gz
+      tar zxf apache-maven-3.9.11-bin.tar.gz
+      echo "export M2_HOME=$(pwd)/apache-maven-3.9.11" >> $GITHUB_ENV
+      echo "$(pwd)/apache-maven-3.9.11/bin" >> $GITHUB_PATH
     shell: bash
diff --git a/.github/actions/setup-folly/action.yml b/.github/actions/setup-folly/action.yml
index 41cec847ce60..8702b92aa857 100644
--- a/.github/actions/setup-folly/action.yml
+++ b/.github/actions/setup-folly/action.yml
@@ -3,5 +3,9 @@ runs:
   using: composite
   steps:
   - name: Checkout folly sources
-    run: make checkout_folly
+    run: |
+      make checkout_folly
+    shell: bash
+  - name: Install patchelf and libaio
+    run: apt-get update -y && apt-get install -y patchelf libaio-dev
     shell: bash
diff --git a/.github/actions/windows-build-steps/action.yml b/.github/actions/windows-build-steps/action.yml
index 9213f2e828fc..699d4aa0e580 100644
--- a/.github/actions/windows-build-steps/action.yml
+++ b/.github/actions/windows-build-steps/action.yml
@@ -4,6 +4,16 @@ runs:
   steps:
   - name: Add msbuild to PATH
     uses: microsoft/setup-msbuild@v1.3.1
+  - name: Cache ccache directory
+    id: ccache-cache
+    uses: actions/cache@v4
+    with:
+      path: C:\a\rocksdb\rocksdb\.ccache
+      key: rocksdb-build-${{ runner.os }}-${{ runner.arch }}-ccache-${{ hashFiles('CMakeLists.txt', 'cmake/**/*.cmake') }}-v1
+  - name: ccache
+    uses: hendrikmuhs/ccache-action@v1.2
+    with:
+      max-size: "10GB"
   - name: Custom steps
     env:
       THIRDPARTY_HOME: ${{ github.workspace }}/thirdparty
@@ -11,9 +21,9 @@ runs:
       CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe
       CTEST_BIN: C:/Program Files/CMake/bin/ctest.exe
       JAVA_HOME: C:/Program Files/BellSoft/LibericaJDK-8
-      SNAPPY_HOME: ${{ github.workspace }}/thirdparty/snappy-1.1.8
-      SNAPPY_INCLUDE: ${{ github.workspace }}/thirdparty/snappy-1.1.8;${{ github.workspace }}/thirdparty/snappy-1.1.8/build
-      SNAPPY_LIB_DEBUG: ${{ github.workspace }}/thirdparty/snappy-1.1.8/build/Debug/snappy.lib
+      SNAPPY_HOME: ${{ github.workspace }}/thirdparty/snappy-1.2.2
+      SNAPPY_INCLUDE: ${{ github.workspace }}/thirdparty/snappy-1.2.2;${{ github.workspace }}/thirdparty/snappy-1.2.2/build
+      SNAPPY_LIB_DEBUG: ${{ github.workspace }}/thirdparty/snappy-1.2.2/build/Debug/snappy.lib
     run: |-
       # NOTE: if ... Exit $LASTEXITCODE lines needed to exit and report failure
       echo ===================== Install Dependencies =====================
@@ -22,14 +32,14 @@ runs:
       mkdir $Env:THIRDPARTY_HOME
       cd $Env:THIRDPARTY_HOME
       echo "Building Snappy dependency..."
-      curl -Lo snappy-1.1.8.zip https://github.com/google/snappy/archive/refs/tags/1.1.8.zip
+      curl -Lo snappy-1.2.2.zip https://github.com/google/snappy/archive/refs/tags/1.2.2.zip
       if(!$?) { Exit $LASTEXITCODE }
-      unzip -q snappy-1.1.8.zip
+      unzip -q snappy-1.2.2.zip
       if(!$?) { Exit $LASTEXITCODE }
-      cd snappy-1.1.8
+      cd snappy-1.2.2
       mkdir build
       cd build
-      & cmake -G "$Env:CMAKE_GENERATOR" ..
+      & cmake -G "$Env:CMAKE_GENERATOR" .. -DSNAPPY_BUILD_TESTS=OFF -DSNAPPY_BUILD_BENCHMARKS=OFF
       if(!$?) { Exit $LASTEXITCODE }
       msbuild Snappy.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64
       if(!$?) { Exit $LASTEXITCODE }
@@ -38,11 +48,12 @@ runs:
       $env:Path = $env:JAVA_HOME + ";" + $env:Path
       mkdir build
       cd build
-      & cmake -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE="$Env:CMAKE_PORTABLE" -DSNAPPY=1 -DJNI=1 ..
+      & cmake -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DWIN_CI=1 -DPORTABLE="$Env:CMAKE_PORTABLE" -DSNAPPY=1 -DXPRESS=1 -DJNI=1 ..
       if(!$?) { Exit $LASTEXITCODE }
       cd ..
       echo "Building with VS version: $Env:CMAKE_GENERATOR"
-      msbuild build/rocksdb.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64
+      # use more parallel processes than the number of processes available, as most of the compile command would be cache hit
+      msbuild build/rocksdb.sln /m:32 /p:LinkIncremental=false -property:Configuration=Debug -property:Platform=x64
       if(!$?) { Exit $LASTEXITCODE }
       echo ========================= Test RocksDB =========================
       build_tools\run_ci_db_test.ps1 -SuiteRun arena_test,db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16
@@ -52,3 +63,7 @@ runs:
       & ctest -C Debug -j 16
       if(!$?) { Exit $LASTEXITCODE }
     shell: pwsh
+  - name: Show ccache stats
+    shell: pwsh
+    run: |
+      ccache --show-stats -v
diff --git a/.github/workflows/clang-tidy-comment.yml b/.github/workflows/clang-tidy-comment.yml
new file mode 100644
index 000000000000..9615c890f85f
--- /dev/null
+++ b/.github/workflows/clang-tidy-comment.yml
@@ -0,0 +1,105 @@
+name: clang-tidy
+on:
+  push:
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+
+permissions:
+  pull-requests: write
+
+jobs:
+  clang-tidy:
+    if: github.repository_owner == 'facebook'
+    runs-on:
+      labels: 4-core-ubuntu
+    container:
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
+    steps:
+    - uses: actions/checkout@v4.1.0
+      with:
+        ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.sha || github.sha }}
+    - name: Mark workspace as safe for git
+      run: git config --global --add safe.directory $GITHUB_WORKSPACE
+    - name: Determine diff base
+      id: diff-base
+      run: |
+        if [ "${{ github.event_name }}" = "pull_request_target" ]; then
+          BASE="${{ github.event.pull_request.base.sha }}"
+        else
+          BASE="${{ github.event.before }}"
+        fi
+        if [ -z "$BASE" ] || echo "$BASE" | grep -q '^0\{40\}$'; then
+          echo "skip=true" >> "$GITHUB_OUTPUT"
+          echo "No valid diff base; skipping clang-tidy."
+        else
+          git fetch --depth=1 origin "$BASE"
+          echo "ref=$BASE" >> "$GITHUB_OUTPUT"
+          echo "skip=false" >> "$GITHUB_OUTPUT"
+        fi
+    - name: Install clang-tidy
+      if: steps.diff-base.outputs.skip != 'true'
+      run: apt-get update && apt-get install -y clang-tidy
+    - name: Generate compile_commands.json
+      if: steps.diff-base.outputs.skip != 'true'
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+              -DCMAKE_C_COMPILER=clang-18 \
+              -DCMAKE_CXX_COMPILER=clang++-18 ..
+        cd ..
+        ln -sf build/compile_commands.json compile_commands.json
+    - name: Run clang-tidy on changed files
+      id: clang-tidy
+      if: steps.diff-base.outputs.skip != 'true'
+      run: |
+        python3 tools/run_clang_tidy.py \
+          -j 4 \
+          --diff-base ${{ steps.diff-base.outputs.ref }} \
+          --github-annotations \
+          --github-step-summary \
+          --comment-output clang-tidy-comment.md
+      continue-on-error: true
+    - name: Post clang-tidy results to PR
+      if: github.event_name == 'pull_request_target' && always() && steps.diff-base.outputs.skip != 'true'
+      uses: actions/github-script@v7
+      with:
+        script: |
+          const fs = require('fs');
+          const commentPath = 'clang-tidy-comment.md';
+          if (!fs.existsSync(commentPath)) {
+            core.info('No comment file generated; skipping PR comment.');
+            return;
+          }
+          const body = fs.readFileSync(commentPath, 'utf8');
+          const marker = '<!-- clang-tidy-bot -->';
+          const prNumber = context.payload.pull_request.number;
+          try {
+            const { data: comments } = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: prNumber,
+            });
+            const existing = comments.find(c => c.body.includes(marker));
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existing.id,
+                body,
+              });
+              core.info(`Updated existing comment ${existing.id}`);
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                body,
+              });
+              core.info('Created new PR comment');
+            }
+          } catch (err) {
+            core.warning(`Could not post PR comment: ${err.message}`);
+          }
+    - name: Fail if clang-tidy found issues
+      if: steps.clang-tidy.outcome == 'failure'
+      run: exit 1
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 1370a5460402..e10a95ecd0a0 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -27,60 +27,72 @@ jobs:
         git config --global --add safe.directory /__w/rocksdb/rocksdb
         tools/check_format_compatible.sh
     - uses: "./.github/actions/post-steps"
-  build-linux-run-microbench:
+  build-linux-non-shm:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
+    env:
+      TEST_TMPDIR: "/tmp/rocksdb_test_tmp"
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: DEBUG_LEVEL=0 make -j32 run_microbench
+    - run: make V=1 -j32 check
     - uses: "./.github/actions/post-steps"
-  build-linux-non-shm:
+  build-linux-clang-18-asan-ubsan-with-folly:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     env:
-      TEST_TMPDIR: "/tmp/rocksdb_test_tmp"
+      CC: clang-18
+      CXX: clang++-18
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: make V=1 -j32 check
+    - uses: "./.github/actions/cache-getdeps-downloads"
+    - uses: "./.github/actions/setup-folly"
+    - uses: "./.github/actions/build-folly"
+    - run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
     - uses: "./.github/actions/post-steps"
-  build-linux-clang-13-asan-ubsan-with-folly:
+  build-linux-cmake-with-folly:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
-    env:
-      CC: clang-13
-      CXX: clang++-13
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
     - uses: "./.github/actions/setup-folly"
+    - uses: "./.github/actions/cache-folly"
+      id: cache-folly
     - uses: "./.github/actions/build-folly"
-    - run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
+      with:
+        cache-hit: ${{ steps.cache-folly.outputs.cache-hit }}
+    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make VERBOSE=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
-  build-linux-valgrind:
+  build-linux-release-with-folly:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: make V=1 -j32 valgrind_test
+    - uses: "./.github/actions/cache-getdeps-downloads"
+    - uses: "./.github/actions/setup-folly"
+    - run: "DEBUG_LEVEL=0 make -j20 build_folly"
+    - run: "USE_FOLLY=1 LIB_MODE=static DEBUG_LEVEL=0 V=1 make -j20 release"
+    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 -DCMAKE_BUILD_TYPE=Release .. && make VERBOSE=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
   build-windows-vs2022-avx2:
     if: ${{ github.repository_owner == 'facebook' }}
@@ -91,15 +103,6 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/windows-build-steps"
-  build-windows-vs2022:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: windows-2022
-    env:
-      CMAKE_GENERATOR: Visual Studio 17 2022
-      CMAKE_PORTABLE: 1
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/windows-build-steps"
   build-linux-arm-test-full:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
@@ -110,3 +113,59 @@ jobs:
       - run: sudo apt-get update && sudo apt-get install -y build-essential libgflags-dev
       - run: make V=1 J=4 -j4 check
       - uses: "./.github/actions/post-steps"
+  build-linux-arm-crashtest:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 4-core-ubuntu-arm
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - run: sudo apt-get update && sudo apt-get install -y build-essential libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev
+    - run: sudo mount -o remount,size=16G /dev/shm
+    - run: sudo dd bs=1048576 count=4096 if=/dev/zero of=/swapfile && sudo chmod 600 /swapfile && sudo mkswap /swapfile && sudo swapon /swapfile
+    - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=1800 --max_key=2500000' blackbox_crash_test_with_atomic_flush
+    - run: rm -rf /dev/shm/rocksdb.*
+    - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=1800 --max_key=2500000' blackbox_crash_test_with_multiops_wc_txn
+    - uses: "./.github/actions/post-steps"
+  build-examples:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 4-core-ubuntu
+    container:
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
+      options: --shm-size=16gb
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - name: Build examples
+      run: make V=1 -j4 static_lib && cd examples && make V=1 -j4
+    - uses: "./.github/actions/post-steps"
+  build-fuzzers:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 4-core-ubuntu
+    container:
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
+      options: --shm-size=16gb
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - name: Build rocksdb lib
+      run: CC=clang-18 CXX=clang++-18 USE_CLANG=1 make -j4 static_lib
+    - name: Build fuzzers
+      run: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer
+    - uses: "./.github/actions/post-steps"
+  build-linux-cmake-with-folly-lite:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 16-core-ubuntu
+    container:
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
+      options: --shm-size=16gb
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
+    - uses: "./.github/actions/setup-folly"
+    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 -DCMAKE_CXX_FLAGS=-DGLOG_USE_GLOG_EXPORT .. && make VERBOSE=1 -j20 && ctest -j20)"
+    - uses: "./.github/actions/post-steps"
diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index 7faaff6637a7..0b5ea4b81d23 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -1,7 +1,18 @@
 name: facebook/rocksdb/pr-jobs
 on: [push, pull_request]
 permissions: {}
+env:
+  # Set to a job name to run only that job (on any repo), or leave empty for
+  # normal behavior (all jobs on facebook repo only).
+  ONLY_JOB: ''
 jobs:
+  config:
+    runs-on: ubuntu-latest
+    outputs:
+      only_job: ${{ steps.set.outputs.only_job }}
+    steps:
+    - id: set
+      run: echo "only_job=$ONLY_JOB" >> "$GITHUB_OUTPUT"
   # NOTE: multiple workflows would be recommended, but the current GHA UI in
   # PRs doesn't make it clear when there's an overall error with a workflow,
   # making it easy to overlook something broken. Grouping everything into one
@@ -19,6 +30,10 @@ jobs:
   # increasing the risk of misconfiguration, especially on forks that might
   # want to run with this GHA setup.
   #
+  # SELECTIVE JOB EXECUTION: Set the ONLY_JOB env var at the top of this file
+  # to a job name (e.g. "build-linux-clang-tidy") to run only that job,
+  # bypassing the repository owner check. Leave it empty for normal behavior.
+  #
   # DEBUGGING WITH SSH: Temporarily add this as a job step, either before the
   # step of interest without the "if:" line or after the failing step with the
   # "if:" line. Then use ssh command printed in CI output.
@@ -30,7 +45,8 @@ jobs:
 
   # ======================== Fast Initial Checks ====================== #
   check-format-and-targets:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'check-format-and-targets' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on: ubuntu-24.04
     steps:
     - uses: actions/checkout@v4.1.0
@@ -44,6 +60,10 @@ jobs:
       run: python -m pip install --upgrade pip
     - name: Install argparse
       run: pip install argparse
+    - name: Install clang-format
+      run: |
+        pip install https://files.pythonhosted.org/packages/fb/ac/3c04772acc0257f5730e83adb542b2603c1a62d1315010ab593a980af404/clang_format-21.1.2-py2.py3-none-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
+        clang-format --version
     - name: Download clang-format-diff.py
       run: wget https://rocksdb-deps.s3.us-west-2.amazonaws.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py
     - name: Check format
@@ -52,13 +72,22 @@ jobs:
       run: make check-buck-targets
     - name: Simple source code checks
       run: make check-sources
+    - name: Sanity check check_format_compatible.sh
+      run: |-
+        export TEST_TMPDIR=/dev/shm/rocksdb
+        rm -rf /dev/shm/rocksdb
+        mkdir /dev/shm/rocksdb
+        git reset --hard
+        git config --global --add safe.directory /__w/rocksdb/rocksdb
+        SANITY_CHECK=1 LONG_TEST=1 tools/check_format_compatible.sh
   # ========================= Linux With Tests ======================== #
   build-linux:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -66,11 +95,12 @@ jobs:
     - run: make V=1 J=32 -j32 check
     - uses: "./.github/actions/post-steps"
   build-linux-cmake-mingw:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-cmake-mingw' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -84,255 +114,175 @@ jobs:
         which javac && javac -version
         mkdir build && cd build && cmake -DJNI=1 -DWITH_GFLAGS=OFF .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni
     - uses: "./.github/actions/post-steps"
-  build-linux-cmake-with-folly:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    env:
-      CC: gcc-10
-      CXX: g++-10
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - uses: "./.github/actions/setup-folly"
-    - uses: "./.github/actions/build-folly"
-    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)"
-    - uses: "./.github/actions/post-steps"
-  build-linux-cmake-with-folly-lite-no-test:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    env:
-      CC: gcc-10
-      CXX: g++-10
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - uses: "./.github/actions/setup-folly"
-    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 .. && make V=1 -j20)"
-    - uses: "./.github/actions/post-steps"
   build-linux-make-with-folly:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-make-with-folly' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
-    env:
-      CC: gcc-10
-      CXX: g++-10
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
     - uses: "./.github/actions/setup-folly"
+    - uses: "./.github/actions/cache-folly"
+      id: cache-folly
     - uses: "./.github/actions/build-folly"
+      with:
+        cache-hit: ${{ steps.cache-folly.outputs.cache-hit }}
     - run: USE_FOLLY=1 LIB_MODE=static V=1 make -j32 check
     - uses: "./.github/actions/post-steps"
   build-linux-make-with-folly-lite-no-test:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-make-with-folly-lite-no-test' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
-    env:
-      CC: gcc-10
-      CXX: g++-10
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
     - uses: "./.github/actions/setup-folly"
-    - run: USE_FOLLY_LITE=1 V=1 make -j32 all
+    - run: USE_FOLLY_LITE=1 EXTRA_CXXFLAGS=-DGLOG_USE_GLOG_EXPORT V=1 make -j32 all
     - uses: "./.github/actions/post-steps"
   build-linux-cmake-with-folly-coroutines:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-cmake-with-folly-coroutines' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
-    env:
-      CC: gcc-10
-      CXX: g++-10
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
     - uses: "./.github/actions/setup-folly"
+    - uses: "./.github/actions/cache-folly"
+      id: cache-folly
     - uses: "./.github/actions/build-folly"
-    - run: "(mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)"
+      with:
+        cache-hit: ${{ steps.cache-folly.outputs.cache-hit }}
+    - run: "(mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make VERBOSE=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
-  build-linux-cmake-with-benchmark:
-    if: ${{ github.repository_owner == 'facebook' }}
+  build-linux-cmake-with-benchmark-no-thread-status:
+    if: needs.config.outputs.only_job == 'build-linux-cmake-with-benchmark-no-thread-status' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 .. && make V=1 -j20 && ctest -j20
+    - run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 -DCMAKE_CXX_FLAGS=-DNROCKSDB_THREAD_STATUS .. && make VERBOSE=1 -j20 && ctest -j20
     - uses: "./.github/actions/post-steps"
   build-linux-encrypted_env-no_compression:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-encrypted_env-no_compression' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
     - run: ENCRYPTED_ENV=1 ROCKSDB_DISABLE_SNAPPY=1 ROCKSDB_DISABLE_ZLIB=1 ROCKSDB_DISABLE_BZIP=1 ROCKSDB_DISABLE_LZ4=1 ROCKSDB_DISABLE_ZSTD=1 make V=1 J=32 -j32 check
-    - run: "./sst_dump --help | grep -E -q 'Supported compression types: kNoCompression$' # Verify no compiled in compression\n"
+    - run: "./sst_dump --help | grep -E -q 'Supported built-in compression types: kNoCompression$' # Verify no compiled in compression\n"
     - uses: "./.github/actions/post-steps"
   # ======================== Linux No Test Runs ======================= #
   build-linux-release:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-release' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - run: make V=1 -j32 LIB_MODE=shared release
     - run: ls librocksdb.so
-    - run: "./db_stress --version"
+    - run: "./trace_analyzer --version" # A tool dependent on gflags that can run in release build
     - run: make clean
-    - run: make V=1 -j32 release
+    - run: USE_RTTI=1 make V=1 -j32 release
     - run: ls librocksdb.a
-    - run: "./db_stress --version"
+    - run: "./trace_analyzer --version"
     - run: make clean
     - run: apt-get remove -y libgflags-dev
     - run: make V=1 -j32 LIB_MODE=shared release
     - run: ls librocksdb.so
-    - run: if ./db_stress --version; then false; else true; fi
+    - run: if ./trace_analyzer --version; then false; else true; fi
     - run: make clean
-    - run: make V=1 -j32 release
+    - run: USE_RTTI=1 make V=1 -j32 release
     - run: ls librocksdb.a
-    - run: if ./db_stress --version; then false; else true; fi
-    - uses: "./.github/actions/post-steps"
-  build-linux-release-rtti:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 8-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
-    - run: "./db_stress --version"
-    - run: make clean
-    - run: apt-get remove -y libgflags-dev
-    - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
-    - run: if ./db_stress --version; then false; else true; fi
-  build-examples:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 4-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - name: Build examples
-      run: make V=1 -j4 static_lib && cd examples && make V=1 -j4
-    - uses: "./.github/actions/post-steps"
-  build-fuzzers:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 4-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - name: Build rocksdb lib
-      run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j4 static_lib
-    - name: Build fuzzers
-      run: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer
-    - uses: "./.github/actions/post-steps"
-  build-linux-clang-no_test_run:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 8-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - run: CC=clang CXX=clang++ USE_CLANG=1 PORTABLE=1 make V=1 -j16 all
+    - run: if ./trace_analyzer --version; then false; else true; fi
     - uses: "./.github/actions/post-steps"
   build-linux-clang-13-no_test_run:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-clang-13-no_test_run' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j32 all microbench
-    - uses: "./.github/actions/post-steps"
-  build-linux-gcc-8-no_test_run:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
+      labels: 8-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: CC=gcc-8 CXX=g++-8 V=1 make -j32 all
+    # FIXME: get back to "all microbench" targets
+    - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 EXTRA_CXXFLAGS=-stdlib=libc++ EXTRA_LDFLAGS=-stdlib=libc++ make -j32 shared_lib
+    - run: make clean
+    # FIXME: get back to "release" target
+    - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 EXTRA_CXXFLAGS=-stdlib=libc++ EXTRA_LDFLAGS=-stdlib=libc++ DEBUG_LEVEL=0 make -j32 shared_lib
     - uses: "./.github/actions/post-steps"
-  build-linux-gcc-10-cxx20-no_test_run:
-    if: ${{ github.repository_owner == 'facebook' }}
+  build-linux-clang-18-no_test_run:
+    if: needs.config.outputs.only_job == 'build-linux-clang-18-no_test_run' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: CC=gcc-10 CXX=g++-10 V=1 ROCKSDB_CXX_STANDARD=c++20 make -j32 all
+    - run: CC=clang-18 CXX=clang++-18 USE_CLANG=1 make -j32 all microbench
+    - run: make clean
+    - run: CC=clang-18 CXX=clang++-18 USE_CLANG=1 DEBUG_LEVEL=0 make -j32 release
     - uses: "./.github/actions/post-steps"
-  build-linux-gcc-11-no_test_run:
-    if: ${{ github.repository_owner == 'facebook' }}
+  build-linux-gcc-14-no_test_run:
+    if: needs.config.outputs.only_job == 'build-linux-gcc-14-no_test_run' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: LIB_MODE=static CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench
+    - run: CC=gcc-14 CXX=g++-14 V=1 make -j32 all microbench
     - uses: "./.github/actions/post-steps"
+
   # ======================== Linux Other Checks ======================= #
-  build-linux-clang10-clang-analyze:
-    if: ${{ github.repository_owner == 'facebook' }}
+  build-linux-clang18-clang-analyze:
+    if: needs.config.outputs.only_job == 'build-linux-clang18-clang-analyze' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-10" CLANG_SCAN_BUILD=scan-build-10 USE_CLANG=1 make V=1 -j32 analyze
+    - run: CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-18" CLANG_SCAN_BUILD=scan-build-18 USE_CLANG=1 make V=1 -j32 analyze
     - uses: "./.github/actions/post-steps"
     - name: compress test report
       run: tar -cvzf scan_build_report.tar.gz scan_build_report
@@ -341,8 +291,10 @@ jobs:
       with:
         name: scan-build-report
         path: scan_build_report.tar.gz
+
   build-linux-unity-and-headers:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-unity-and-headers' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
@@ -356,11 +308,12 @@ jobs:
     - run: make V=1 -j8 -k check-headers
     - uses: "./.github/actions/post-steps"
   build-linux-mini-crashtest:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-mini-crashtest' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -368,119 +321,122 @@ jobs:
     - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000' blackbox_crash_test_with_atomic_flush
     - uses: "./.github/actions/post-steps"
   # ======================= Linux with Sanitizers ===================== #
-  build-linux-clang10-asan:
-    if: ${{ github.repository_owner == 'facebook' }}
+  build-linux-clang18-asan-ubsan:
+    if: needs.config.outputs.only_job == 'build-linux-clang18-asan-ubsan' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 32-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - run: COMPILE_WITH_ASAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check
-    - uses: "./.github/actions/post-steps"
-  build-linux-clang10-ubsan:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: COMPILE_WITH_UBSAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 ubsan_check
+    - run: COMPILE_WITH_ASAN=1 COMPILE_WITH_UBSAN=1 CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j40 check
     - uses: "./.github/actions/post-steps"
-  build-linux-clang13-mini-tsan:
-    if: ${{ github.repository_owner == 'facebook' }}
+  build-linux-clang18-mini-tsan:
+    if: needs.config.outputs.only_job == 'build-linux-clang18-mini-tsan' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 32-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: COMPILE_WITH_TSAN=1 CC=clang-13 CXX=clang++-13 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check
+    - run: COMPILE_WITH_TSAN=1 CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check
     - uses: "./.github/actions/post-steps"
   build-linux-static_lib-alt_namespace-status_checked:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-static_lib-alt_namespace-status_checked' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check
+    - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_USE_STD_SEMAPHORES -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check
     - uses: "./.github/actions/post-steps"
   # ========================= MacOS build only ======================== #
   build-macos:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: macos-13
+    if: needs.config.outputs.only_job == 'build-macos' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
+    runs-on: macos-15-xlarge
     env:
       ROCKSDB_DISABLE_JEMALLOC: 1
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: maxim-lobanov/setup-xcode@v1.6.0
       with:
-        xcode-version: 14.3.1
+        xcode-version: 16.4.0
     - uses: "./.github/actions/increase-max-open-files-on-macos"
     - uses: "./.github/actions/install-gflags-on-macos"
     - uses: "./.github/actions/pre-steps-macos"
     - name: Build
-      run: ulimit -S -n `ulimit -H -n` && make V=1 J=16 -j16 all
+      run: ulimit -S -n `ulimit -H -n` && make V=1 J=16 -j8 all
     - uses: "./.github/actions/post-steps"
   # ========================= MacOS with Tests ======================== #
   build-macos-cmake:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: macos-13
+    if: needs.config.outputs.only_job == 'build-macos-cmake' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
+    runs-on: macos-15-xlarge
     strategy:
       matrix:
-        run_even_tests: [true, false]
+        run_sharded_tests: [0, 1, 2, 3]
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: maxim-lobanov/setup-xcode@v1.6.0
       with:
-        xcode-version: 14.3.1
+        xcode-version: 16.4.0
     - uses: "./.github/actions/increase-max-open-files-on-macos"
     - uses: "./.github/actions/install-gflags-on-macos"
     - uses: "./.github/actions/pre-steps-macos"
     - name: cmake generate project file
       run: ulimit -S -n `ulimit -H -n` && mkdir build && cd build && cmake -DWITH_GFLAGS=1 ..
     - name: Build tests
-      run: cd build && make V=1 -j16
-    - name: Run even tests
-      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j16 -I 0,,2
-      if: ${{ matrix.run_even_tests }}
-    - name: Run odd tests
-      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j16 -I 1,,2
-      if: ${{ ! matrix.run_even_tests  }}
+      run: cd build && make VERBOSE=1 -j8
+    - name: Run shard 0 out of 4 test shards
+      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 0,,4
+      if: ${{ matrix.run_sharded_tests == 0 }}
+    - name: Run shard 1 out of 4 test shards
+      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 1,,4
+      if: ${{ matrix.run_sharded_tests == 1 }}
+    - name: Run shard 2 out of 4 test shards
+      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 2,,4
+      if: ${{ matrix.run_sharded_tests == 2 }}
+    - name: Run shard 3 out of 4 test shards
+      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 3,,4
+      if: ${{ matrix.run_sharded_tests == 3 }}
     - uses: "./.github/actions/post-steps"
   # ======================== Windows with Tests ======================= #
   # NOTE: some windows jobs are in "nightly" to save resources
-  build-windows-vs2019:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: windows-2019
+  build-windows-vs2022:
+    if: needs.config.outputs.only_job == 'build-windows-vs2022' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
+    runs-on: windows-8-core
     env:
-      CMAKE_GENERATOR: Visual Studio 16 2019
+      CMAKE_GENERATOR: Visual Studio 17 2022
       CMAKE_PORTABLE: 1
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/windows-build-steps"
   # ============================ Java Jobs ============================ #
   build-linux-java:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-java' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: evolvedbinary/rocksjava:centos6_x64-be
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     # The docker image is intentionally based on an OS that has an older GLIBC version.
     # That GLIBC is incompatibile with GitHub's actions/checkout. Thus we implement a manual checkout step.
+    # NOTE: replaced evolvedbinary/rocksjava:centos7_x64-be with ghcr.io/facebook/rocksdb_ubuntu:22.1
+    # until a more appropriate docker image with C++20 support is made.
     - name: Checkout
       env:
         GH_TOKEN: ${{ github.token }}
@@ -497,18 +453,22 @@ jobs:
         which java && java -version
         which javac && javac -version
     - name: Test RocksDBJava
-      run: scl enable devtoolset-7 'make V=1 J=8 -j8 jtest'
-    # NOTE: post-steps skipped because of compatibility issues with docker image
+    # NOTE: replaced scl enable devtoolset-7 'make V=1 J=8 -j8 jtest'
+      run: make V=1 J=8 -j8 jtest
+    # post-steps skipped because of compatibility issues with docker image
   build-linux-java-static:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-java-static' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: evolvedbinary/rocksjava:centos6_x64-be
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     # The docker image is intentionally based on an OS that has an older GLIBC version.
     # That GLIBC is incompatibile with GitHub's actions/checkout. Thus we implement a manual checkout step.
+    # NOTE: replaced evolvedbinary/rocksjava:centos7_x64-be with ghcr.io/facebook/rocksdb_ubuntu:22.1
+    # until a more appropriate docker image with C++20 support is made.
     - name: Checkout
       env:
         GH_TOKEN: ${{ github.token }}
@@ -525,11 +485,13 @@ jobs:
         which java && java -version
         which javac && javac -version
     - name: Build RocksDBJava Static Library
-      run: scl enable devtoolset-7 'make V=1 J=8 -j8 rocksdbjavastatic'
-    # NOTE: post-steps skipped because of compatibility issues with docker image
+    # NOTE: replaced scl enable devtoolset-7 'make V=1 J=8 -j8 rocksdbjavastatic'
+      run: make V=1 J=8 -j8 rocksdbjavastatic
+    # post-steps skipped because of compatibility issues with docker image
   build-macos-java:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: macos-13
+    if: needs.config.outputs.only_job == 'build-macos-java' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
+    runs-on: macos-15-xlarge
     env:
       JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home"
       ROCKSDB_DISABLE_JEMALLOC: 1
@@ -537,7 +499,7 @@ jobs:
     - uses: actions/checkout@v4.1.0
     - uses: maxim-lobanov/setup-xcode@v1.6.0
       with:
-        xcode-version: 14.3.1
+        xcode-version: 16.4.0
     - uses: "./.github/actions/increase-max-open-files-on-macos"
     - uses: "./.github/actions/install-gflags-on-macos"
     - uses: "./.github/actions/install-jdk8-on-macos"
@@ -551,15 +513,16 @@ jobs:
       run: make V=1 J=16 -j16 jtest
     - uses: "./.github/actions/post-steps"
   build-macos-java-static:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: macos-13
+    if: needs.config.outputs.only_job == 'build-macos-java-static' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
+    runs-on: macos-15-xlarge
     env:
       JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home"
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: maxim-lobanov/setup-xcode@v1.6.0
       with:
-        xcode-version: 14.3.1
+        xcode-version: 16.4.0
     - uses: "./.github/actions/increase-max-open-files-on-macos"
     - uses: "./.github/actions/install-gflags-on-macos"
     - uses: "./.github/actions/install-jdk8-on-macos"
@@ -573,15 +536,16 @@ jobs:
       run: make V=1 J=16 -j16 rocksdbjavastaticosx
     - uses: "./.github/actions/post-steps"
   build-macos-java-static-universal:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: macos-13
+    if: needs.config.outputs.only_job == 'build-macos-java-static-universal' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
+    runs-on: macos-15-xlarge
     env:
       JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home"
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: maxim-lobanov/setup-xcode@v1.6.0
       with:
-        xcode-version: 14.3.1
+        xcode-version: 16.4.0
     - uses: "./.github/actions/increase-max-open-files-on-macos"
     - uses: "./.github/actions/install-gflags-on-macos"
     - uses: "./.github/actions/install-jdk8-on-macos"
@@ -595,11 +559,12 @@ jobs:
       run: make V=1 J=16 -j16 rocksdbjavastaticosx_ub
     - uses: "./.github/actions/post-steps"
   build-linux-java-pmd:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-java-pmd' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: evolvedbinary/rocksjava:rockylinux8_x64-be
+      image: evolvedbinary/rocksjava:alpine3_x64-be
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -621,7 +586,8 @@ jobs:
         name: maven-site
         path: "${{ github.workspace }}/java/target/site"
   build-linux-arm:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-arm' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu-arm
     steps:
diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml
new file mode 100644
index 000000000000..37d36513a783
--- /dev/null
+++ b/.github/workflows/weekly.yml
@@ -0,0 +1,20 @@
+name: facebook/rocksdb/weekly
+on:
+  schedule:
+  - cron: 0 9 * * 0
+  workflow_dispatch:
+permissions: {}
+jobs:
+  build-linux-valgrind:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 16-core-ubuntu
+    timeout-minutes: 840
+    container:
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
+      options: --shm-size=16gb
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - run: make V=1 -j20 valgrind_test
+    - uses: "./.github/actions/post-steps"
diff --git a/BUCK b/BUCK
index bffed60e4add..c05b7bb33d3a 100644
--- a/BUCK
+++ b/BUCK
@@ -1,12 +1,14 @@
 # This file @generated by:
 #$ python3 buckifier/buckify_rocksdb.py
 # --> DO NOT EDIT MANUALLY <--
-# This file is a Facebook-specific integration for buck builds, so can
-# only be validated by Facebook employees.
+# This file is a Meta-specific integration for buck builds, so can
+# only be validated by Meta employees.
 load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrapper","cpp_binary_wrapper","cpp_unittest_wrapper","fancy_bench_wrapper","add_c_test_wrapper")
 load("@fbcode_macros//build_defs:export_files.bzl", "export_file")
 
 
+oncall("rocksdb_point_of_contact")
+
 cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "cache/cache.cc",
         "cache/cache_entry_roles.cc",
@@ -88,6 +90,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "db/memtable_list.cc",
         "db/merge_helper.cc",
         "db/merge_operator.cc",
+        "db/multi_scan.cc",
         "db/output_validator.cc",
         "db/periodic_task_scheduler.cc",
         "db/range_del_aggregator.cc",
@@ -113,6 +116,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "db/write_controller.cc",
         "db/write_stall_stats.cc",
         "db/write_thread.cc",
+        "db_stress_tool/db_stress_compression_manager.cc",
         "env/composite_env.cc",
         "env/env.cc",
         "env/env_chroot.cc",
@@ -214,7 +218,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "table/cuckoo/cuckoo_table_builder.cc",
         "table/cuckoo/cuckoo_table_factory.cc",
         "table/cuckoo/cuckoo_table_reader.cc",
-        "table/external_table_reader.cc",
+        "table/external_table.cc",
         "table/format.cc",
         "table/get_context.cc",
         "table/iterator.cc",
@@ -249,6 +253,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "trace_replay/trace_record_result.cc",
         "trace_replay/trace_replay.cc",
         "util/async_file_reader.cc",
+        "util/auto_tune_compressor.cc",
         "util/build_version.cc",
         "util/cleanable.cc",
         "util/coding.cc",
@@ -263,10 +268,12 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "util/dynamic_bloom.cc",
         "util/file_checksum_helper.cc",
         "util/hash.cc",
+        "util/io_dispatcher_imp.cc",
         "util/murmurhash.cc",
         "util/random.cc",
         "util/rate_limiter.cc",
         "util/ribbon_config.cc",
+        "util/simple_mixed_compressor.cc",
         "util/slice.cc",
         "util/status.cc",
         "util/stderr_logger.cc",
@@ -415,16 +422,19 @@ cpp_library_wrapper(name="rocksdb_tools_lib", srcs=[
 
 cpp_library_wrapper(name="rocksdb_cache_bench_tools_lib", srcs=["cache/cache_bench_tool.cc"], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=False)
 
+cpp_library_wrapper(name="rocksdb_point_lock_bench_tools_lib", srcs=["utilities/transactions/lock/point/point_lock_bench_tool.cc"], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=False)
+
 rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
         "db_stress_tool/batched_ops_stress.cc",
         "db_stress_tool/cf_consistency_stress.cc",
         "db_stress_tool/db_stress_common.cc",
+        "db_stress_tool/db_stress_compaction_service.cc",
+        "db_stress_tool/db_stress_compression_manager.cc",
         "db_stress_tool/db_stress_driver.cc",
         "db_stress_tool/db_stress_filters.cc",
         "db_stress_tool/db_stress_gflags.cc",
         "db_stress_tool/db_stress_listener.cc",
         "db_stress_tool/db_stress_shared_state.cc",
-        "db_stress_tool/db_stress_stat.cc",
         "db_stress_tool/db_stress_test_base.cc",
         "db_stress_tool/db_stress_tool.cc",
         "db_stress_tool/db_stress_wide_merge_operator.cc",
@@ -446,6 +456,8 @@ cpp_binary_wrapper(name="db_bench", srcs=["tools/db_bench.cc"], deps=[":rocksdb_
 
 cpp_binary_wrapper(name="cache_bench", srcs=["cache/cache_bench.cc"], deps=[":rocksdb_cache_bench_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
 
+cpp_binary_wrapper(name="point_lock_bench", srcs=["utilities/transactions/lock/point/point_lock_bench.cc"], deps=[":rocksdb_point_lock_bench_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
+
 cpp_binary_wrapper(name="ribbon_bench", srcs=["microbench/ribbon_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True)
 
 cpp_binary_wrapper(name="db_basic_bench", srcs=["microbench/db_basic_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True)
@@ -4709,6 +4721,12 @@ cpp_unittest_wrapper(name="compressed_secondary_cache_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="compression_test",
+            srcs=["util/compression_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="configurable_test",
             srcs=["options/configurable_test.cc"],
             deps=[":rocksdb_test_lib"],
@@ -4805,6 +4823,12 @@ cpp_unittest_wrapper(name="db_clip_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="db_compaction_abort_test",
+            srcs=["db/db_compaction_abort_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="db_compaction_filter_test",
             srcs=["db/db_compaction_filter_test.cc"],
             deps=[":rocksdb_test_lib"],
@@ -4829,6 +4853,12 @@ cpp_unittest_wrapper(name="db_encryption_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="db_etc3_test",
+            srcs=["db/db_etc3_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="db_flush_test",
             srcs=["db/db_flush_test.cc"],
             deps=[":rocksdb_test_lib"],
@@ -5185,6 +5215,18 @@ cpp_unittest_wrapper(name="inlineskiplist_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="interval_test",
+            srcs=["util/interval_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
+cpp_unittest_wrapper(name="io_dispatcher_test",
+            srcs=["util/io_dispatcher_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="io_posix_test",
             srcs=["env/io_posix_test.cc"],
             deps=[":rocksdb_test_lib"],
@@ -5365,6 +5407,12 @@ cpp_unittest_wrapper(name="plain_table_db_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="point_lock_manager_stress_test",
+            srcs=["utilities/transactions/lock/point/point_lock_manager_stress_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="point_lock_manager_test",
             srcs=["utilities/transactions/lock/point/point_lock_manager_test.cc"],
             deps=[":rocksdb_test_lib"],
@@ -5683,6 +5731,12 @@ cpp_unittest_wrapper(name="write_prepared_transaction_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="write_prepared_transaction_test_seqno",
+            srcs=["utilities/transactions/write_prepared_transaction_test_seqno.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="write_unprepared_transaction_test",
             srcs=["utilities/transactions/write_unprepared_transaction_test.cc"],
             deps=[":rocksdb_test_lib"],
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 000000000000..39ef7dbc380d
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,312 @@
+# RocksDB Code Generation and Review Guidance
+
+This document provides guidance for generating and reviewing code in the RocksDB project, derived from analysis of code review feedback across hundreds of complex merged Pull Requests. Use this as a reference when writing code with AI assistants or conducting code reviews.
+
+---
+
+## General Best Practices
+
+### Code Quality and Maintainability
+
+**Clarity and Readability:** Write clear, self-documenting code. Use meaningful variable names, add comments for complex logic, and structure code to minimize cognitive load. Avoid clever tricks that sacrifice readability for marginal performance gains unless absolutely necessary.
+
+**Consistent Style:** Follow existing code style conventions. RocksDB uses `.clang-format` for formatting, specific naming conventions, and structural patterns. Deviations from these patterns are frequently flagged in reviews.
+
+**Error Handling:** Ensure robust error handling throughout the codebase. Use RocksDB's `Status` type consistently, propagate errors appropriately, and avoid silently ignoring failures. Reviewers pay close attention to edge cases and failure modes.
+
+### Testing Philosophy
+
+**Comprehensive Coverage:** Every change should include appropriate test coverage. This includes unit tests for isolated functionality, integration tests for component interactions, and stress tests for concurrency and performance validation. Reviewers will ask for additional tests if coverage is insufficient.
+
+**Edge Cases and Failure Modes:** Tests should explicitly cover edge cases, boundary conditions, and potential failure scenarios. This is especially important for changes affecting core database operations, compaction, or recovery logic.
+
+**Platform-Specific Testing:** RocksDB supports multiple platforms (Linux, Windows, macOS) and compilers (GCC, Clang, MSVC). Changes should be tested across relevant platforms, particularly when touching platform-specific code or using compiler-specific features.
+
+### Performance Considerations
+
+**⚠️ PERFORMANCE IS CRITICAL:** RocksDB is a high-performance storage engine where every CPU cycle and memory access matters. When writing code, always evaluate from a performance perspective. This is not optional—performance-aware coding is a fundamental requirement for all contributions.
+
+**Benchmarking and Profiling:** Performance claims should be backed by empirical evidence. Use RocksDB's benchmarking tools (e.g., `db_bench`) to validate improvements. Reviewers will request benchmark results for changes that could impact performance.
+
+**Memory Allocation:** Minimize dynamic memory allocations, especially in hot paths. Prefer stack allocation over heap allocation. Reuse buffers when possible. Consider using arena allocators or memory pools for frequent small allocations. Every `new`, `malloc`, or container resize has a cost.
+
+**Memory Copy:** Avoid unnecessary memory copies. Use move semantics, `std::string_view`, `Slice`, and pass-by-reference where appropriate. Be aware of implicit copies in STL containers and function returns. Prefer in-place operations over copy-and-modify patterns.
+
+**CPU Cache Efficiency:** Design data structures and access patterns to be cache-friendly. Keep frequently accessed data together (data locality). Prefer sequential memory access over random access. Be mindful of cache line sizes (typically 64 bytes) and avoid false sharing in concurrent code. Consider struct packing and field ordering to improve cache utilization.
+
+**Loop Optimization:** Look for opportunities to collapse nested loops, reduce loop overhead, and minimize branch mispredictions. Hoist invariant computations out of loops. Consider loop unrolling for tight inner loops. Batch operations when possible to amortize per-operation overhead.
+
+**SIMD and Vectorization:** Leverage SIMD instructions (SSE, AVX) for data-parallel operations when appropriate. Structure data to enable auto-vectorization by the compiler. Consider explicit SIMD intrinsics for critical hot paths like checksum computation, encoding/decoding, and bulk data processing.
+
+**Branch Prediction:** Minimize unpredictable branches in hot paths. Use `LIKELY`/`UNLIKELY` macros to hint branch prediction. Consider branchless alternatives for simple conditionals. Order switch cases and if-else chains by frequency.
+
+**Memory and Resource Management:** Be mindful of memory allocations, especially in hot paths. Use RAII patterns, smart pointers, and RocksDB's memory management utilities appropriately.
+
+**Hot Path Analysis:** When deciding how aggressively to optimize code, consider whether it's on a hot path:
+- **Hot path** (executed thousands+ times, e.g., data access, iteration, compaction loops): Performance is paramount. Apply all optimization techniques—loop collapsing, SIMD, cache optimization, pre-allocation, etc. The cost of each operation is multiplied by execution frequency.
+- **Cold path** (executed rarely, e.g., DB open, configuration parsing, error handling): Maintainability and clarity are more important. Prefer readable code over micro-optimizations. Complex optimizations here add maintenance burden with negligible performance benefit.
+- **Warm path** (moderate frequency): Balance both concerns. Use profiling data to guide optimization decisions.
+
+**Avoid Premature Optimization:** While performance is critical, focus on correctness first, then optimize based on profiling data. However, be performance-aware from the start—choosing the right algorithm and data structure upfront is not premature optimization. Use the hot path analysis above to decide how much optimization effort is warranted.
+
+### API Design and Compatibility
+
+**Backwards Compatibility:** RocksDB maintains strong backwards compatibility guarantees. Breaking changes are rare and require extensive justification. When deprecating features, follow the project's deprecation policy (typically spanning multiple releases).
+
+**API Consistency:** New APIs should be consistent with existing patterns. Use similar naming conventions, parameter ordering, and return types. Reviewers will suggest changes to improve consistency with the broader codebase.
+
+**Documentation:** Public APIs must be thoroughly documented. Include usage examples, parameter descriptions, and notes on thread safety, performance characteristics, and compatibility considerations.
+
+---
+
+## Component-Specific Guidance
+
+### Database Core (`db`)
+
+The database core handles write-ahead logging (WAL), memtables, compaction, and recovery. This component receives the most scrutiny in code reviews.
+
+**Concurrency and Thread Safety:** Database operations are highly concurrent. Reviewers carefully examine locking strategies, atomic operations, and memory ordering. Document synchronization assumptions clearly. Use appropriate memory ordering semantics (`acquire`/`release` vs. `seq_cst`).
+
+**Compaction Logic:** Changes to compaction are complex and high-risk. Ensure that compaction logic respects configured parameters, handles edge cases (empty databases, single-file compactions), and maintains correctness under concurrent operations.
+
+**Error Propagation:** Database operations can fail in many ways (I/O errors, corruption, resource exhaustion). Ensure that errors are properly propagated, logged, and handled. Avoid assertions in production code paths.
+
+**Testing:** Database core changes require extensive testing, including unit tests, integration tests, and stress tests. Test with various configurations, compaction styles, and concurrent workloads.
+
+### Public Headers (`include`)
+
+Public headers define RocksDB's API surface. Changes here have the highest compatibility impact.
+
+**API Design:** New APIs should be intuitive, consistent with existing patterns, and well-documented. Consider how the API will be used in practice and avoid adding unnecessary complexity.
+
+**Backwards Compatibility:** Breaking changes to public APIs require extensive justification and a deprecation plan. Maintain ABI compatibility for bug fixes and patch releases.
+
+**Documentation:** Every public API must be thoroughly documented with usage examples, parameter descriptions, and notes on thread safety and performance characteristics.
+
+**Deprecation:** When deprecating APIs, follow the project's policy. Mark deprecated APIs clearly, provide migration guidance, and maintain support for at least one major release.
+
+### Internal Utilities (`util`)
+
+Internal utilities provide common functionality used throughout the codebase.
+
+**Code Reuse:** Utilities should be general-purpose and reusable. Avoid duplicating functionality that already exists elsewhere in the codebase.
+
+**Error Handling:** Utility functions should handle errors robustly and propagate them appropriately. Consider edge cases like overflow, underflow, and invalid inputs.
+
+**Testing:** Utility functions should have comprehensive test coverage, including edge cases and failure modes. Consider adding death tests for assertions.
+
+**Performance:** Utilities are often used in hot paths. Ensure that implementations are efficient and avoid unnecessary allocations or copies.
+
+### Table Management (`table`)
+
+Table management handles SST file format, block-based tables, and table readers/writers.
+
+**Block Format and Checksums:** Changes to block format require extreme care. Ensure that checksums are computed and verified correctly. Test with various compression algorithms and block sizes.
+
+**Iterator Correctness:** Table iterators are used throughout the codebase. Ensure that iterator semantics (Seek, Next, Prev) are correct, especially at boundaries and with deletions.
+
+**Caching and Prefetching:** Table readers interact with the block cache and prefetching logic. Ensure that cache keys are unique and that prefetching respects configured limits.
+
+**Performance:** Table operations are performance-critical. Benchmark changes that could impact read or write performance.
+
+### Utilities (`utilities`)
+
+Utilities include optional features like transactions, backup engine, and checkpoint.
+
+**Feature Isolation:** Utilities should be self-contained and not introduce unnecessary dependencies on core database internals.
+
+**Deprecation and Cleanup:** Legacy features are being phased out. When removing deprecated code, ensure that migration paths are documented and that users have sufficient warning.
+
+**Cross-Platform Compatibility:** Utilities often interact with OS-specific APIs. Ensure that code works on all supported platforms.
+
+### Options and Configuration (`options`)
+
+Options define RocksDB's configuration system.
+
+**Type Safety:** Use appropriate types for options (e.g., `uint32_t` for flags, scoped enums for enumerated values).
+
+**Deprecation Policy:** When deprecating options, follow the project's policy. Document the deprecation, provide migration guidance, and maintain support for at least one major release.
+
+**Dynamic Configuration:** Some options can be changed dynamically. Ensure that dynamic changes are thread-safe and take effect correctly.
+
+**Validation:** Validate option values and provide clear error messages for invalid configurations.
+
+### Cache (`cache`)
+
+Cache management is critical for RocksDB's performance.
+
+**Concurrency:** Cache operations are highly concurrent. Ensure that implementations are thread-safe and use appropriate synchronization primitives.
+
+**Performance:** Cache operations are in the hot path. Optimize for low latency and high throughput. Benchmark changes carefully.
+
+**Memory Management:** Cache implementations must manage memory carefully to avoid leaks and excessive allocations.
+
+**Eviction Policies:** Changes to eviction policies should be well-tested and benchmarked to ensure they improve overall performance.
+
+---
+
+## Code Review Checklist
+
+When reviewing RocksDB code (or preparing code for review), use this checklist:
+
+### Correctness
+- [ ] Does the change preserve database semantics (e.g., snapshot isolation, key ordering)?
+- [ ] Are all error cases handled appropriately?
+- [ ] Is the change thread-safe? Are synchronization primitives used correctly?
+- [ ] Are there any potential data races or deadlocks?
+
+### Testing
+- [ ] Does the change include appropriate test coverage?
+- [ ] Are edge cases and failure modes tested?
+- [ ] Have the tests been run on all supported platforms?
+- [ ] Are stress tests passing?
+
+### Performance
+- [ ] Are there benchmark results for performance-sensitive changes?
+- [ ] Does the change avoid unnecessary allocations or copies?
+- [ ] Are hot paths optimized appropriately?
+
+### API and Compatibility
+- [ ] Is the change backwards compatible?
+- [ ] Are new APIs consistent with existing patterns?
+- [ ] Is the public API documented?
+- [ ] Are deprecated features handled according to policy?
+
+### Code Quality
+- [ ] Does the code follow RocksDB's style conventions?
+- [ ] Is the code clear and maintainable?
+- [ ] Are comments and documentation sufficient?
+- [ ] Are there any code smells or anti-patterns?
+
+---
+
+## Common Review Feedback Patterns
+
+The following patterns emerged as frequent sources of review feedback:
+
+1. **Test Coverage:** Reviewers frequently request additional tests for edge cases, platform-specific behavior, and failure modes. Complex changes require comprehensive test coverage including unit tests, integration tests, and stress tests.
+
+2. **Error Handling:** Ensure proper error propagation using RocksDB's `Status` type. Avoid silent failures and provide clear error messages that include context about what failed and why.
+
+3. **API Design:** New APIs should be consistent with existing patterns. Use descriptive names that follow established conventions. Avoid breaking changes without strong justification and a clear deprecation plan.
+
+4. **Documentation:** Public APIs must be documented with usage examples and notes on thread safety, performance characteristics, and compatibility considerations. Complex internal logic should also be well-commented.
+
+5. **Performance:** Performance-sensitive changes require benchmark results to validate improvements. Use `db_bench` and other profiling tools to measure impact. Avoid premature optimization that adds complexity without measurable benefit.
+
+6. **Concurrency:** Thread safety is critical in RocksDB. Document synchronization assumptions clearly. Use appropriate memory ordering semantics. Consider potential race conditions and deadlocks.
+
+7. **Code Style:** Follow existing conventions for naming, formatting, and structure. Use `.clang-format` for consistent formatting. Prefer scoped enums (`enum class`) over unscoped enums.
+
+8. **Backwards Compatibility:** RocksDB maintains strong compatibility guarantees. Breaking changes require extensive justification. When deprecating features, provide migration guidance and maintain support across multiple releases.
+
+9. **Refactoring:** Reviewers appreciate refactoring that improves code readability and maintainability. Look for opportunities to deduplicate code and simplify complex logic.
+
+10. **Platform Compatibility:** Ensure changes work correctly on all supported platforms (Linux, Windows, macOS) and with all supported compilers (GCC, Clang, MSVC).
+
+---
+
+## Important tips
+
+### Build system
+* There are 3 build system. Make, CMake, BUCK(meta internal).
+* When a new .cc file is added, update Makefile, CMakeLists.txt, src.mk, BUCK.
+* Don't manually edit BUCK file, after updating src.mk, run
+    /usr/local/bin/python3 buckifier/buckify_rocksdb.py to update it
+* Use make to build and run the test. CMake and BUCK are not used locally.
+* Use `make dbg` command to build all of the unit test in debug mode.
+* For -j in make command, use the number of CPU cores to decide it.
+
+### Unit Test
+* After all of the unit tests are added, review them and try to extract common
+    reusable utility functions to reduce code duplication due to copy past between
+    unit tests. This should be done every time unit test is updated.
+* Don't use sleep to wait for certain events to happen. This will cause test to
+    be flaky. Instead, use sync point to synchronize thread progress.
+* Cap unit test execution with 60 seconds timeout.
+* When there are multiple unit tests need to be executed, try to use
+    gtest_parallel.py if available. E.g.
+    python3 ${GTEST_PARALLEL}/gtest_parallel.py ./table_test
+
+### Unit test dedup guidelines
+* Extract helper functions for repeated patterns such as object
+    construction, round-trip (encode → decode → verify), and common
+    assertion sequences.
+* Use table-driven tests (struct array + loop) when multiple test cases
+    share the same logic but differ only in input/expected data.
+* Prefer randomized tests over exhaustive parameter permutations. Use
+    `Random` from `util/random.h` (not `std::mt19937`). Use a time-based
+    seed with `SCOPED_TRACE("seed=" + std::to_string(seed))` so failures
+    are reproducible.
+* Keep deterministic edge-case tests separate from randomized tests
+    (error paths, boundary conditions, format verification).
+* Methods only used in tests should be private with `friend class` +
+    `TEST_F` fixture wrappers. In wrappers, always fully qualify the
+    target method to avoid infinite recursion.
+
+### Adding new public API
+    Refer to claude_md/add_public_api.md
+
+### Adding new option
+    Refer to claude_md/add_option.md
+
+### Metrics
+* When adding a new feature, evaluate whether there is opportunity to add
+    metrics. Try to avoid causing performance regression on hot path when adding
+    metrics.
+
+### Stress test
+* When adding a new feature, make sure stress test covers the new option.
+
+### DB bench update
+* When adding a performance related feature, support it in db_bench
+
+### Adding release note
+* Release note should be kept short at high level for external user consumption.
+
+### Blog posts (docs/_posts)
+* Blog post authors must be defined in `docs/_data/authors.yml` to be displayed
+
+### Final verification of the change
+* Execute make clean to clean all of the changes.
+* Execute make check to build all of the changes and execute all of the tests.
+    Note that executing all of the tests could take multiple minutes.
+
+### Monitoring make check progress
+* Use `make check-progress` to get machine-parseable JSON progress while
+    `make check` is running. This is useful for Claude Code to monitor long
+    builds without timeout issues.
+* Run `make check` in background, then poll progress:
+    ```bash
+    make check &
+    # Poll periodically:
+    make check-progress
+    ```
+* The output shows current phase and progress:
+    ```json
+    {"status":"running","phase":"compiling","completed":300,"total":919,...}
+    {"status":"running","phase":"testing","completed":1500,"total":29962,"failed":0,"percent":5,...}
+    {"status":"completed","phase":"testing","completed":29962,"total":29962,"failed":0,"percent":100,...}
+    ```
+* Phases: `compiling` -> `linking` -> `generating` -> `testing` -> `completed`
+* Key fields: `status`, `phase`, `completed`, `total`, `failed`, `percent`
+* When tests fail, `failed_tests` array shows details (up to 10 failures):
+    ```json
+    {"status":"running",...,"failed":3,"failed_tests":[
+      {"test":"cache_test-CacheTest.Usage","exit_code":1,"signal":0,"output":"...test log..."},
+      {"test":"env_test-EnvTest.Open","exit_code":0,"signal":11,"output":"...Segmentation fault..."}
+    ]}
+    ```
+* `exit_code`: non-zero means test assertion failed
+* `signal`: non-zero means test was killed (e.g., 9=SIGKILL, 6=SIGABRT, 11=SIGSEGV)
+* `output`: last 50 lines of test log including error messages and stack traces
+
+### Executing benchmark using db_bench
+* Since the goal is to measure performance, we need to build a release binary
+    using `make clean && DEBUG_LEVEL=0 make db_bench`. If there is an engine
+    crash due to bug, we need to switch back to debug build. Make sure to run
+    `make clean` before running `make dbg`.
+
+### Formatting code
+* After making change, use `make format-auto` to auto-apply formatting without
+    interactive prompts (Claude Code friendly).
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cce07d70fec7..f0e79d9306e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@
 #
 # Linux:
 #
-# 1. Install a recent toolchain if you're on a older distro. C++17 required (GCC >= 7, Clang >= 5)
+# 1. Install a recent toolchain if you're on a older distro. C++20 required (GCC >= 11, Clang >= 10)
 # 2. mkdir build; cd build
 # 3. cmake ..
 # 4. make -j
@@ -80,6 +80,7 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING
     "Default BUILD_TYPE is ${default_build_type}" FORCE)
 endif()
+message(STATUS "CMAKE_BUILD_TYPE is set to ${CMAKE_BUILD_TYPE}")
 
 find_program(CCACHE_FOUND ccache)
 if(CCACHE_FOUND)
@@ -100,7 +101,7 @@ endif()
 option(ROCKSDB_BUILD_SHARED "Build shared versions of the RocksDB libraries" ON)
 
 if( NOT DEFINED CMAKE_CXX_STANDARD )
-  set(CMAKE_CXX_STANDARD 17)
+  set(CMAKE_CXX_STANDARD 20)
 endif()
 
 include(CMakeDependentOption)
@@ -132,7 +133,9 @@ else()
     option(WITH_GFLAGS "build with GFlags" ON)
   endif()
   set(GFLAGS_LIB)
-  if(WITH_GFLAGS)
+  # Skip all gflags detection and setup when USE_FOLLY or USE_COROUTINES is enabled
+  # since Folly provides its own gflags (USE_COROUTINES automatically sets USE_FOLLY)
+  if(WITH_GFLAGS AND NOT USE_FOLLY AND NOT USE_COROUTINES)
     # Config with namespace available since gflags 2.2.2
     option(GFLAGS_USE_TARGET_NAMESPACE "Use gflags import target with namespace." ON)
     find_package(gflags CONFIG)
@@ -151,6 +154,9 @@ else()
     include_directories(${GFLAGS_INCLUDE_DIR})
     list(APPEND THIRDPARTY_LIBS ${GFLAGS_LIB})
     add_definitions(-DGFLAGS=1)
+  elseif(WITH_GFLAGS AND (USE_FOLLY OR USE_COROUTINES))
+    # Still set the DGFLAGS=1 define when using Folly since Folly provides gflags
+    add_definitions(-DGFLAGS=1)
   endif()
 
   if(WITH_SNAPPY)
@@ -203,9 +209,20 @@ if(WIN32 AND MSVC)
   endif()
 endif()
 
+option(WIN_CI "Accelerate build speed and reduce build artifect size for github CI with MSVC" OFF)
+
 if(MSVC)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4996 /wd4100 /wd4324")
+  if(WIN_CI)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /nologo /EHsc /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /W4 /wd4127 /wd4996 /wd4100 /wd4324 /wd4702")
+  else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4996 /wd4100 /wd4324")
+  endif()
+  if(CMAKE_BUILD_TYPE STREQUAL "Release")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DNDEBUG")
+    message(STATUS "Setting /DNDEBUG as CMAKE_BUILD_TYPE is set to ${CMAKE_BUILD_TYPE}")
+  endif()
 else()
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall -pthread")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing -Wno-invalid-offsetof")
@@ -313,8 +330,7 @@ if(NOT MSVC)
 endif()
 
 # Check if -latomic is required or not
-if (NOT MSVC)
-  set(CMAKE_REQUIRED_FLAGS "--std=c++17")
+if (NOT MSVC AND NOT APPLE)
   CHECK_CXX_SOURCE_COMPILES("
 #include <atomic>
 std::atomic<uint64_t> x(0);
@@ -451,24 +467,33 @@ else()
   endif()
 endif()
 
-# Used to run CI build and tests so we can run faster
+# Used to run optimized debug build and tests so we can run faster
 option(OPTDBG "Build optimized debug build with MSVC" OFF)
 option(WITH_RUNTIME_DEBUG "build with debug version of runtime library" ON)
 if(MSVC)
-  if(OPTDBG)
+  if (WIN_CI)
     message(STATUS "Debug optimization is enabled")
     set(CMAKE_CXX_FLAGS_DEBUG "/Oxt")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG:FASTLINK")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG:FASTLINK")
   else()
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1")
-
-    # Minimal Build is deprecated after MSVC 2015
-    if( MSVC_VERSION GREATER 1900 )
-      set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm-")
+    if(OPTDBG)
+      message(STATUS "Debug optimization is enabled")
+      set(CMAKE_CXX_FLAGS_DEBUG "/Oxt")
     else()
-      set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm")
-    endif()
+      set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1")
 
+      # Minimal Build is deprecated after MSVC 2015
+      if( MSVC_VERSION GREATER 1900 )
+        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm-")
+      else()
+        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm")
+      endif()
+    endif()
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG")
   endif()
+
   if(WITH_RUNTIME_DEBUG)
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}d")
   else()
@@ -476,8 +501,6 @@ if(MSVC)
   endif()
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oxt /Zp8 /Gm- /Gy /${RUNTIME_LIBRARY}")
 
-  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG")
-  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG")
 endif()
 
 if(CMAKE_COMPILER_IS_GNUCXX)
@@ -629,6 +652,12 @@ if(USE_FOLLY)
     ${FOLLY_INST_PATH}/lib/cmake/folly/folly-targets.cmake)
 
     include(${FOLLY_INST_PATH}/lib/cmake/folly/folly-config.cmake)
+
+    # Fix gflags library name for debug builds
+    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+      set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath=${GFLAGS_INST_PATH}/lib")
+      set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${GFLAGS_INST_PATH}/lib/libgflags_debug.so.2.2")
+    endif()
   endif()
 
   add_compile_definitions(USE_FOLLY FOLLY_NO_CONFIG HAVE_CXX11_ATOMIC)
@@ -721,6 +750,7 @@ set(SOURCES
         db/memtable_list.cc
         db/merge_helper.cc
         db/merge_operator.cc
+        db/multi_scan.cc
         db/output_validator.cc
         db/periodic_task_scheduler.cc
         db/range_del_aggregator.cc
@@ -746,6 +776,7 @@ set(SOURCES
         db/write_controller.cc
         db/write_stall_stats.cc
         db/write_thread.cc
+        db_stress_tool/db_stress_compression_manager.cc
         env/composite_env.cc
         env/env.cc
         env/env_chroot.cc
@@ -835,7 +866,7 @@ set(SOURCES
         table/cuckoo/cuckoo_table_builder.cc
         table/cuckoo/cuckoo_table_factory.cc
         table/cuckoo/cuckoo_table_reader.cc
-        table/external_table_reader.cc
+        table/external_table.cc
         table/format.cc
         table/get_context.cc
         table/iterator.cc
@@ -874,17 +905,20 @@ set(SOURCES
         trace_replay/trace_record.cc
         trace_replay/trace_replay.cc
         util/async_file_reader.cc
+        util/auto_tune_compressor.cc
         util/cleanable.cc
         util/coding.cc
         util/compaction_job_stats_impl.cc
         util/comparator.cc
         util/compression.cc
+        util/simple_mixed_compressor.cc
         util/compression_context_cache.cc
         util/concurrent_task_limiter_impl.cc
         util/crc32c.cc
         util/data_structure.cc
         util/dynamic_bloom.cc
         util/hash.cc
+        util/io_dispatcher_imp.cc
         util/murmurhash.cc
         util/random.cc
         util/rate_limiter.cc
@@ -1065,12 +1099,21 @@ if(USE_FOLLY_LITE)
     third-party/folly/folly/synchronization/DistributedMutex.cpp
     third-party/folly/folly/synchronization/ParkingLot.cpp)
   include_directories(${PROJECT_SOURCE_DIR}/third-party/folly)
+  # Add boost to the include path
   exec_program(python3 ${PROJECT_SOURCE_DIR}/third-party/folly ARGS
   build/fbcode_builder/getdeps.py show-source-dir boost OUTPUT_VARIABLE
   BOOST_SOURCE_PATH)
   exec_program(ls ARGS -d ${BOOST_SOURCE_PATH}/boost* OUTPUT_VARIABLE
   BOOST_INCLUDE_DIR)
   include_directories(${BOOST_INCLUDE_DIR})
+  # Add fmt to the include path
+  exec_program(python3 ${PROJECT_SOURCE_DIR}/third-party/folly ARGS
+  build/fbcode_builder/getdeps.py show-source-dir fmt OUTPUT_VARIABLE
+  FMT_SOURCE_PATH)
+  exec_program(ls ARGS -d ${FMT_SOURCE_PATH}/fmt*/include OUTPUT_VARIABLE
+  FMT_INCLUDE_DIR)
+  include_directories(${FMT_INCLUDE_DIR})
+
   add_definitions(-DUSE_FOLLY -DFOLLY_NO_CONFIG)
   list(APPEND THIRDPARTY_LIBS glog)
 endif()
@@ -1339,9 +1382,11 @@ if(WITH_TESTS)
         db/db_bloom_filter_test.cc
         db/db_compaction_filter_test.cc
         db/db_compaction_test.cc
+        db/db_compaction_abort_test.cc
         db/db_clip_test.cc
         db/db_dynamic_level_test.cc
         db/db_encryption_test.cc
+        db/db_etc3_test.cc
         db/db_flush_test.cc
         db/db_inplace_update_test.cc
         db/db_io_failure_test.cc
@@ -1456,6 +1501,7 @@ if(WITH_TESTS)
         util/autovector_test.cc
         util/bloom_test.cc
         util/coding_test.cc
+        util/compression_test.cc
         util/crc32c_test.cc
         util/defer_test.cc
         util/dynamic_bloom_test.cc
@@ -1499,8 +1545,10 @@ if(WITH_TESTS)
         utilities/transactions/optimistic_transaction_test.cc
         utilities/transactions/transaction_test.cc
         utilities/transactions/lock/point/point_lock_manager_test.cc
+        utilities/transactions/lock/point/point_lock_manager_stress_test.cc
         utilities/transactions/write_committed_transaction_ts_test.cc
         utilities/transactions/write_prepared_transaction_test.cc
+        utilities/transactions/write_prepared_transaction_test_seqno.cc
         utilities/transactions/write_unprepared_transaction_test.cc
         utilities/transactions/lock/range/range_locking_test.cc
         utilities/transactions/timestamped_snapshot_test.cc
@@ -1609,6 +1657,12 @@ if(WITH_BENCHMARK_TOOLS)
     utilities/persistent_cache/hash_table_bench.cc)
   target_link_libraries(hash_table_bench${ARTIFACT_SUFFIX}
     ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS})
+
+  add_executable(point_lock_bench${ARTIFACT_SUFFIX}
+    utilities/transactions/lock/point/point_lock_bench.cc
+    utilities/transactions/lock/point/point_lock_bench_tool.cc)
+  target_link_libraries(point_lock_bench${ARTIFACT_SUFFIX}
+    ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS})
 endif()
 
 option(WITH_TRACE_TOOLS "build with trace tools" ON)
diff --git a/Directory.Build.props b/Directory.Build.props
new file mode 100644
index 000000000000..5862fb2c2f45
--- /dev/null
+++ b/Directory.Build.props
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project>
+    <PropertyGroup>
+        <CLToolExe>ccache_msvc_compiler.bat</CLToolExe>
+        <CLToolPath>$(MSBuildThisFileDirectory)</CLToolPath>
+        <UseMultiToolTask>true</UseMultiToolTask>
+        <EnforceProcessCountAcrossBuilds>true</EnforceProcessCountAcrossBuilds>
+    </PropertyGroup>
+</Project>
diff --git a/HISTORY.md b/HISTORY.md
index ab8466abd1ce..277ade360676 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,219 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.11.0 (01/23/2026)
+### Public API Changes
+* New SetOptions API that allows setting options for multiple CFs, avoiding the need to reserialize OPTIONS file for each CF
+* Remove remaining pieces of Lua integration
+
+### Behavior Changes
+* The new default for `BlockBasedTableOptions::format_version` is 7, which has been supported since RocksDB 10.4.0 and is required in order to use CompressionManagers supporting custom compression types.
+
+### Bug Fixes
+* Fixed a small performance bug with `format_version=7` when decompressing formats other than Snappy and ZSTD.
+* Fixed an infinite compaction loop bug with User-Defined Timestamps (UDT) where bottommost files were repeatedly marked for compaction even though their timestamp could not be collapsed.
+* Bugfix for persisted UDT record sequence number zeroing logic.
+
+## 10.10.0 (12/16/2025)
+### Bug Fixes
+* Fixed a bug in best-efforts recovery that causes use-after-free crashes when accessing SST files that were cached during the recovery.
+* Fix resumable compaction incorrectly allowing resumption from a truncated range deletion that is not well handled currently.
+* Fixed a bug in `PosixRandomFileAccess` IO uring submission queue ownership & management. Fix eliminates the false positive 'Bad cqe data' IO errors in `PosixRandomFileAccess::MultiRead` when interleaved with `PosixRandomFileAccess::ReadAsync` on the same thread.
+
+## 10.9.0 (11/21/2025)
+### New Features
+* Added an auto-tuning feature for DB manifest file size that also (by default) improves the safety of existing configurations in case `max_manifest_file_size` is repeatedly exceeded. The new recommendation is to set `max_manifest_file_size` to something small like 1MB and tune `max_manifest_space_amp_pct` as needed to balance write amp and space amp in the manifest. Refer to comments on those options in `DBOptions` for details. Both options are (now) mutable.
+* Added a new API to support option migration for multiple column families
+* Added new option target_file_size_is_upper_bound  that makes most compaction output SST files come close to the target file size without exceeding it, rather than commonly exceeding it by some fraction (current behavior). For now the new behavior is off by default, but we expect to enable it by default in the future.
+* Add a new option allow_trivial_move in CompactionOptions to allow CompactFiles to perform trivial move if possible. By default the flag of allow_trivial_move is false, so it preserve the original behavior.
+
+### Public API Changes
+* To reduce risk of ODR violations or similar, `ROCKSDB_USING_THREAD_STATUS` has been removed from public headers and replaced with static `const bool ThreadStatus::kEnabled`. Some other uses of conditional compilation have been removed from public API headers to reduce risk of ODR violations or other issues.
+
+### Behavior Changes
+* PosixWritableFile now repositions the seek pointer to the new end of file after a call to Truncate.
+* Updated standalone range deletion L0 file compaction behavior to avoid compacting with any newer L0 files (which is expensive and not useful).
+
+### Bug Fixes
+* Fix a bug where compaction with range deletion can persist kTypeMaxValid in MANIFEST as file metadata. kTypeMaxValid is not supposed to be persisted and can change as new value types are introduced. This can cause a forward compatibility issue where older versions of RocksDB don't recognize kTypeMaxValid from newer versions. A new placeholder value type kTypeTruncatedRangeDeletionSentinel is also introduced to replace kTypeMaxValid when reading existing SST files' metadata from MANIFEST. This allows us to strengthen some checks to avoid using kTypeMaxValid in the future.
+* Fixed a bug where `DB::GetSortedWalFiles()` could hang when waiting for a purge operation that found nothing to do (potentially triggered by iterator release, flush, compaction, etc.).
+* Fixed a bug in MultiScan where `max_sequential_skip_in_iterations` could cause the iterator to seek backward to already-unpinned blocks when the same user key spans multiple data blocks, leading to assertion failures or seg fault.
+* Fixed a bug for `WAL_ttl_seconds > 0` use cases where the newest archived WAL files could be incorrectly deleted when the system clock moved backwards.
+
+### Performance Improvements
+* Added optimization that allowed for the asynchronous prefetching of all data outlined in a multiscan iterator. This optimization was applied to the level iterator, which prefetches all data through each of the block-based iterators.
+
+## 10.8.0 (10/21/2025)
+### New Features
+* Add kFSPrefetch to FSSupportedOps enum to allow file systems to indicate prefetch support capability, avoiding unnecessary prefetch system calls on file systems that don't support them.
+* Added experimental support `OpenAndCompactOptions::allow_resumption` for resumable compaction that persists progress during `OpenAndCompact()`, allowing interrupted compactions to resume from the last progress persitence. The default behavior is to not persist progress.
+
+### Public API Changes
+* Allow specifying output temperature in CompactionOptions
+* Added `DB::FlushWAL(const FlushWALOptions&)` as an alternative to `DB::FlushWAL(bool sync)`, where `FlushWALOptions` includes a new `rate_limiter_priority` field (default `Env::IO_TOTAL`) that allows rate limiting and priority passing of manual WAL flush's IO operations.
+* The MultiScan API contract is updated. After a multi scan range got prepared with Prepare API call, the following seeks must seek the start of each prepared scan range in order. In addition, when limit is set, upper bound must be set to the same value of limit before each seek
+
+### Behavior Changes
+* `kChangeTemperature` FIFO compaction will now honor `compaction_target_temp` to all levels regardless of `cf_options::last_level_temperature`
+* Allow UDIs with a non BytewiseComparator
+
+### Bug Fixes
+* Fix incorrect MultiScan seek error status due to bugs in handling range limit falling between adjacent SST files key range.
+* Fix a bug in Page unpinning in MultiScan
+
+### Performance Improvements
+* Fixed a performance regression in LZ4 compression that started in version 10.6.0
+
+## 10.7.0 (09/19/2025)
+### New Features
+* Add the fail_if_no_udi_on_open flag in BlockBasedTableOption to control whether a missing user defined index block in a SST is a hard error or not.
+* A new flag memtable_verify_per_key_checksum_on_seek is added to AdvancedColumnFamilyOptions. When it is enabled, it will validate key checksum along the binary search path on skiplist based memtable during seek operation.
+* Introduce option MultiScanArgs::use_async_io to enable asynchronous I/O during MultiScan, instead of waiting for I/O to be done in Prepare().
+* Add new option `MultiScanArgs::max_prefetch_size` that limits the memory usage of per file pinning of prefetched blocks.
+* Improved `sst_dump` by allowing standalone file and directory arguments without `--file=`. Also added new options and better output for `sst_dump --command=recompress`. See `sst_dump --help`
+
+### Public API Changes
+* HyperClockCache with no `estimated_entry_charge` is now production-ready and is the preferred block cache implementation vs. LRUCache. Please consider updating your code to minimize the risk of hitting performance bottlenecks or anomalies from LRUCache. See cache.h for more detail.
+* RocksDB now requires a C++20 compatible compiler (GCC >= 11, Clang >= 10, Visual Studio >= 2019), including for any code using RocksDB headers.
+* MultiScanArgs used to have a default constructor with default parameter of BytewiseComparator. Now it always requires Comparator in its constructor.
+
+### Behavior Changes
+* The default provided block cache implementation is now HyperClockCache instead of LRUCache, when `block_cache` is nullptr (default) and `no_block_cache==false` (default). We recommend explicitly creating a HyperClockCache block cache based on memory budget and sharing it across all column families and even DB instances. This change could expose previously hidden memory or resource leaks.
+
+### Bug Fixes
+* Reported numbers for compaction and flush CPU usage now include time spent by parallel compression worker threads. This now means compaction/flush CPU usage could exceed the wall clock time.
+* Fix a race condition in FIFO size-based compaction where concurrent threads could select the same non-L0 file, causing assertion failures in debug builds or "Cannot delete table file from LSM tree" errors in release builds.
+* Fix a bug in RocksDB MultiScan with UDI when one of the scan ranges is determined to be empty by the UDI, which causes incorrect results.
+
+### Performance Improvements
+* Add a new table property "rocksdb.key.smallest.seqno" which records the smallest sequence number of all keys in file. It makes ingesting DB generated files faster by
+avoiding scanning the whole file to find the smallest sequence number.
+* Add a new experimental PerKeyPointLockManager to improve efficiency under high lock contention. PointLockManager was not efficient when there is high write contention on same key, as it uses a single conditional variable per lock stripe. PerKeyPointLockManager uses per thread conditional variable supporting fifo order. Although this is an experimental feature. By default, it is disabled. A new boolean flag TransactionDBOptions::use_per_key_point_lock_mgr is added to optionally enable it. Search the flag in code for more info.
+Together, a new configuration TransactionOptions::deadlock_timeout_us is added, which allows the transaction to wait for a short period before perform deadlock detection. When the workload has low lock contention, the deadlock_timeout_us can be configured to be slightly higher than average transaction execution time, so that transaction would likely be able to take the lock before deadlock detection is performed when it is waiting for a lock. This allows transaction to reduce CPU cost on performing deadlock detection, which could be expensive in CPU time. When the workload has high lock contention, the deadlock_timeout_us can be configured to 0, so that transaction would perform deadlock detection immediately. By default the value is 0 to keep the behavior same as before.
+* Majorly improved CPU efficiency and scalability of parallel compression (`CompressionOptions::parallel_threads` > 1), though this efficiency improvement makes parallel compression currently incompatible with UserDefinedIndex and with old setting of `decouple_partitioned_filters=false`. Parallel compression is now considered a production-ready feature. Maximum performance is available with `-DROCKSDB_USE_STD_SEMAPHORES` at compile time, but this is not currently recommended because of reported bugs in implementations of `std::counting_semaphore`/`binary_semaphore`.
+
+## 10.6.0 (08/22/2025)
+### New Features
+* Introduce column family option `cf_allow_ingest_behind`. This option aims to replace `DBOptions::allow_ingest_behind` to enable ingest behind at the per-CF level. `DBOptions::allow_ingest_behind` is deprecated.
+* Introduce `MultiScanArgs::io_coalesce_threshold` to allow a configurable IO coalescing threshold.
+
+### Public API Changes
+* `IngestExternalFileOptions::allow_db_generated_files` now allows files ingestion of any DB generated SST file, instead of only the ones with all keys having sequence number 0.
+* `decouple_partitioned_filters = true` is now the default in BlockBasedTableOptions.
+* GetTtl() API is now available in TTL DB
+* Minimum supported version of LZ4 library is now 1.7.0 (r129 from 2015)
+* Some changes to experimental Compressor and CompressionManager APIs
+* A new Filesystem::SyncFile function is added for syncing a file that was already written, such as on file ingestion. The default implementation matches previous RocksDB behavior: re-open the file for read-write, sync it, and close it. We recommend overriding for FileSystems that do not require syncing for crash recovery or do not handle (well) re-opening for writes.
+
+### Behavior Changes
+* When `allow_ingest_behind` is enabled, compaction will no longer drop tombstones based on the absence of underlying data. Tombstones will be preserved to apply to ingested files.
+
+### Bug Fixes
+* Files in dropped column family won't be returned to the caller upon successful, offline MANIFEST iteration in `GetFileChecksumsFromCurrentManifest`.
+* Fix a bug in MultiScan that causes it to fall back to a normal scan when dictionary compression is enabled.
+* Fix a crash in iterator Prepare() when fill_cache=false
+* Fix a bug in MultiScan where incorrect results can be returned when a Scan's range is across multiple files.
+* Fixed a bug in remote compaction that may mistakenly delete live SST file(s) during the cleanup phase when no keys survive the compaction (all expired)
+* Allow a user defined index to be configured from a string.
+* Make the User Defined Index interface consistently use the user key format, fixing the previous mixed usage of internal and user key.
+
+### Performance Improvements
+* Small improvement to CPU efficiency of compression using built-in algorithms, and a dramatic efficiency improvement for LZ4HC, based on reusing data structures between invocations.
+
+## 10.5.0 (07/18/2025)
+### Public API Changes
+* DB option skip_checking_sst_file_sizes_on_db_open is deprecated, in favor of validating file size in parallel in a thread pool, when db is opened. When DB is opened, with paranoid check enabled, a file with the wrong size would fail the DB open. With paranoid check disabled, the DB open would succeed, the column family with the corrupted file would not be read or write, while the other healthy column families could be read and write normally. When max_open_files option is not set to -1, only a subset of the files will be opened and checked. The rest of the files will be opened and checked when they are accessed.
+
+### Behavior Changes
+* PessimisticTransaction::GetWaitingTxns now returns waiting transaction information even if the current transaction has timed out. This allows the information to be surfaced to users for debugging purposes once it is known that the timeout has occurred.
+* A new API GetFileSize is added to FSRandomAccessFile interface class. It uses fstat vs stat on the posix implementation which is more efficient. Caller could use it to get file size faster. This function might be required in the future for FileSystem implementation outside of the RocksDB code base.
+* RocksDB now triggers eligible compactions every 12 hours when periodic compaction is configured. This solves a limitation of the compaction trigger mechanism, which would only trigger compaction after specific events like flush, compaction, or SetOptions.
+
+### Bug Fixes
+* Fix a bug in BackupEngine that can crash backup due to a null FSWritableFile passed to WritableFileWriter.
+* Fix DB::NewMultiScan iterator to respect the scan upper bound specified in ScanOptions
+
+### Performance Improvements
+* Optimized MultiScan using BlockBasedTable to coalesce I/Os and prefetch all data blocks.
+
+## 10.4.0 (06/20/2025)
+### New Features
+* Add a new CF option `memtable_avg_op_scan_flush_trigger` that supports triggering memtable flush when an iterator scans through an expensive range of keys, with the average number of skipped keys from the active memtable exceeding the threshold.
+* Vector based memtable now supports concurrent writers (DBOptions::allow_concurrent_memtable_write) #13675.
+* Add new experimental `TransactionOptions::large_txn_commit_optimize_byte_threshold` to enable optimizations for large transaction commit by transaction batch data size.
+* Add a new option `CompactionOptionsUniversal::reduce_file_locking` and if it's true, auto universal compaction picking will adjust to minimize locking of input files when bottom priority compactions are waiting to run. This can increase the likelihood of existing L0s being selected for compaction, thereby improving write stall and reducing read regression.
+* Add new `format_version=7` to aid experimental support of custom compression algorithms with CompressionManager and block-based table. This format version includes changing the format of `TableProperties::compression_name`.
+
+### Public API Changes
+* Change NewExternalTableFactory to return a unique_ptr instead of shared_ptr.
+* Add an optional min file size requirement for deletion triggered compaction. It can be specified when creating `CompactOnDeletionCollectorFactory`.
+
+### Behavior Changes
+* `TransactionOptions::large_txn_commit_optimize_threshold` now has default value 0 for disabled. `TransactionDBOptions::txn_commit_bypass_memtable_threshold` now has no effect on transactions.
+
+### Bug Fixes
+* Fix a bug where CreateColumnFamilyWithImport() could miss the SST file for the memtable flush it triggered. The exported CF then may not contain the updates in the memtable when CreateColumnFamilyWithImport() is called.
+* Fix iterator operations returning NotImplemented status if disallow_memtable_writes and paranoid_memory_checks CF options are both set.
+* Fixed handling of file checksums in IngestExternalFile() to allow providing checksums using recognized but not necessarily the DB's preferred checksum function, to ease migration between checksum functions.
+
+## 10.3.0 (05/17/2025)
+### New Features
+* Add new experimental `CompactionOptionsFIFO::allow_trivial_copy_when_change_temperature` along with `CompactionOptionsFIFO::trivial_copy_buffer_size` to allow optimizing FIFO compactions with tiering when kChangeTemperature to move files from source tier FileSystem to another tier FileSystem via trivial and direct copying raw sst file instead of reading thru the content of the SST file then rebuilding the table files.
+* Add a new field to Compaction Stats in LOG files for the pre-compression size written to each level.
+* Add new experimental `TransactionOptions::large_txn_commit_optimize_threshold` to enable optimizations for large transaction commit with per transaction threshold. `TransactionDBOptions::txn_commit_bypass_memtable_threshold` is deprecated in favor of this transaction option.
+* [internal team use only] Allow an application-defined `request_id` to be passed to RocksDB and propagated to the filesystem via IODebugContext
+
+### Bug Fixes
+* Fix a bug where transaction lock upgrade can incorrectly fail with a Deadlock status. This happens when a transaction has a non-zero timeout and tries to upgrade a shared lock that is also held by another transaction.
+* Pass wrapped WritableFileWriter pointer to ExternalTableBuilder so that the file checksum can be correctly calculated and returned by SstFileWriter for external table files.
+* Fix an infinite-loop bug in transaction locking. This can happen if a transaction reaches lock limit and its time out expires before it attempts to wait for it.
+* Fixed a potential data race with `CompressionOptions::parallel_threads > 1` and a `TablePropertiesCollector` overriding `BlockAdd()`.
+
+## 10.2.0 (04/21/2025)
+### New Features
+* Provide histogram stats `COMPACTION_PREFETCH_BYTES` to measure number of bytes for RocksDB's prefetching (as opposed to file
+system's prefetch) on SST file during compaction read
+* A new API DB::GetNewestUserDefinedTimestamp is added to return the newest user defined timestamp seen in a column family
+* Introduce API `IngestWriteBatchWithIndex()` for ingesting updates into DB while bypassing memtable writes. This improves performance when writing a large write batch to the DB.
+* Add a new CF option `memtable_op_scan_flush_trigger` that triggers a flush of the memtable if an iterator's Seek()/Next() scans over a certain number of invisible entries from the memtable.
+
+### Public API Changes
+* AdvancedColumnFamilyOptions.max_write_buffer_number_to_maintain is deleted. It's deprecated since introduction of a better option max_write_buffer_size_to_maintain since RocksDB 6.5.0.
+* Deprecated API `DB::MaxMemCompactionLevel()`.
+* Deprecated `ReadOptions::ignore_range_deletions`.
+* Deprecated API `experimental::PromoteL0()`.
+* Added arbitrary string map for additional options to be overridden for remote compactions
+* The fail_if_options_file_error option in DBOptions has been removed. The behavior now is to always return failure in any API that fails to persist the OPTIONS file.
+
+### Behavior Changes
+* Make stats `PREFETCH_BYTES_USEFUL`, `PREFETCH_HITS`, `PREFETCH_BYTES` only account for prefetching during user initiated scan
+
+### Bug Fixes
+* Fix a bug in Posix file system that the FSWritableFile created via `FileSystem::ReopenWritableFile` internally does not track the correct file size.
+* Fix a bug where tail size of remote compaction output is not persisted in primary db's manifest
+
+## 10.1.0 (03/24/2025)
+### New Features
+* Added a new `DBOptions.calculate_sst_write_lifetime_hint_set` setting that allows to customize which compaction styles SST write lifetime hint calculation is allowed on. Today RocksDB supports only two modes `kCompactionStyleLevel` and `kCompactionStyleUniversal`.
+* Add a new field `num_l0_files` in `CompactionJobInfo` about the number of L0 files in the CF right before and after the compaction
+* Added per-key-placement feature in Remote Compaction
+* Implemented API DB::GetPropertiesOfTablesByLevel that retrieves table properties for files in each LSM tree level
+
+### Public API Changes
+* `GetAllKeyVersions()` now interprets empty slices literally, as valid keys, and uses new `OptSlice` type default value for extreme upper and lower range limits.
+* `DeleteFilesInRanges()` now takes `RangeOpt` which is based on `OptSlice`. The overload taking `RangePtr` is deprecated.
+* Add an unordered map of name/value pairs, ReadOptions::property_bag, to pass opaque options through to an external table when creating an Iterator.
+* Introduced CompactionServiceJobStatus::kAborted to allow handling aborted scenario in Schedule(), Wait() or OnInstallation() APIs in Remote Compactions.
+* format\_version < 2 in BlockBasedTableOptions is no longer supported for writing new files. Support for reading such files is deprecated and might be removed in the future. `CompressedSecondaryCacheOptions::compress_format_version == 1` is also deprecated.
+
+### Behavior Changes
+* `ldb` now returns an error if the specified `--compression_type` is not supported in the build.
+* MultiGet with snapshot and ReadOptions::read_tier = kPersistedTier will now read a consistent view across CFs (instead of potentially reading some CF before and some CF after a flush).
+* CreateColumnFamily() is no longer allowed on a read-only DB (OpenForReadOnly())
+
+### Bug Fixes
+* Fixed stats for Tiered Storage with preclude_last_level feature
+
 ## 10.0.0 (02/21/2025)
 ### New Features
 * Introduced new `auto_refresh_iterator_with_snapshot` opt-in knob that (when enabled) will periodically release obsolete memory and storage resources for as long as the iterator is making progress and its supplied `read_options.snapshot` was initialized with non-nullptr value.
@@ -119,7 +332,7 @@
 * In FIFO compaction, compactions for changing file temperature (configured by option `file_temperature_age_thresholds`) will compact one file at a time, instead of merging multiple eligible file together (#13018).
 * Support ingesting db generated files using hard link, i.e. IngestExternalFileOptions::move_files/link_files and IngestExternalFileOptions::allow_db_generated_files.
 * Add a new file ingestion option `IngestExternalFileOptions::link_files` to hard link input files and preserve original files links after ingestion.
-* DB::Close now untracks files in SstFileManager, making avaialble any space used
+* DB::Close now untracks files in SstFileManager, making available any space used
 by them. Prior to this change they would be orphaned until the DB is re-opened.
 
 ### Bug Fixes
@@ -315,7 +528,7 @@ MultiGetBenchmarks.multiGetList10 no_column_family 10000 16 100 1024 thrpt 25 76
 * Removed deprecated option `ColumnFamilyOptions::check_flush_compaction_key_order`
 * Remove the default `WritableFile::GetFileSize` and `FSWritableFile::GetFileSize` implementation that returns 0 and make it pure virtual, so that subclasses are enforced to explicitly provide an implementation.
 * Removed deprecated option `ColumnFamilyOptions::level_compaction_dynamic_file_size`
-* Removed tickers with typos "rocksdb.error.handler.bg.errro.count", "rocksdb.error.handler.bg.io.errro.count", "rocksdb.error.handler.bg.retryable.io.errro.count".
+* Removed tickers with typos "rocksdb.error.handler.bg.error.count", "rocksdb.error.handler.bg.io.error.count", "rocksdb.error.handler.bg.retryable.io.error.count".
 * Remove the force mode for `EnableFileDeletions` API because it is unsafe with no known legitimate use.
 * Removed deprecated option `ColumnFamilyOptions::ignore_max_compaction_bytes_for_input`
 * `sst_dump --command=check` now compares the number of records in a table with `num_entries` in table property, and reports corruption if there is a mismatch. API `SstFileDumper::ReadSequential()` is updated to optionally do this verification. (#12322)
@@ -342,7 +555,7 @@ MultiGetBenchmarks.multiGetList10 no_column_family 10000 16 100 1024 thrpt 25 76
 * Exposed options ttl via c api.
 
 ### Behavior Changes
-* `rocksdb.blobdb.blob.file.write.micros` expands to also measure time writing the header and footer. Therefore the COUNT may be higher and values may be smaller than before. For stacked BlobDB, it no longer measures the time of explictly flushing blob file.
+* `rocksdb.blobdb.blob.file.write.micros` expands to also measure time writing the header and footer. Therefore the COUNT may be higher and values may be smaller than before. For stacked BlobDB, it no longer measures the time of explicitly flushing blob file.
 * Files will be compacted to the next level if the data age exceeds periodic_compaction_seconds except for the last level.
 * Reduced the compaction debt ratio trigger for scheduling parallel compactions
 * For leveled compaction with default compaction pri (kMinOverlappingRatio), files marked for compaction will be prioritized over files not marked when picking a file from a level for compaction.
@@ -407,7 +620,7 @@ want to continue to use force enabling, they need to explicitly pass a `true` to
 
 ### Behavior Changes
 * During off-peak hours defined by `daily_offpeak_time_utc`, the compaction picker will select a larger number of files for periodic compaction. This selection will include files that are projected to expire by the next off-peak start time, ensuring that these files are not chosen for periodic compaction outside of off-peak hours.
-* If an error occurs when writing to a trace file after `DB::StartTrace()`, the subsequent trace writes are skipped to avoid writing to a file that has previously seen error. In this case, `DB::EndTrace()` will also return a non-ok status with info about the error occured previously in its status message.
+* If an error occurs when writing to a trace file after `DB::StartTrace()`, the subsequent trace writes are skipped to avoid writing to a file that has previously seen error. In this case, `DB::EndTrace()` will also return a non-ok status with info about the error occurred previously in its status message.
 * Deleting stale files upon recovery are delegated to SstFileManger if available so they can be rate limited.
 * Make RocksDB only call `TablePropertiesCollector::Finish()` once.
 * When `WAL_ttl_seconds > 0`, we now process archived WALs for deletion at least every `WAL_ttl_seconds / 2` seconds. Previously it could be less frequent in case of small `WAL_ttl_seconds` values when size-based expiration (`WAL_size_limit_MB > 0 `) was simultaneously enabled.
@@ -1195,7 +1408,7 @@ Note: The next release will be major release 7.0. See https://github.com/faceboo
 ### Public API change
 * Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes.
 * Introduce a new EventListener callback that will be called upon the end of automatic error recovery.
-* Add IncreaseFullHistoryTsLow API so users can advance each column family's full_history_ts_low seperately.
+* Add IncreaseFullHistoryTsLow API so users can advance each column family's full_history_ts_low separately.
 * Add GetFullHistoryTsLow API so users can query current full_history_low value of specified column family.
 
 ### Performance Improvements
diff --git a/INSTALL.md b/INSTALL.md
index 5bc5bd7b297e..1e739d485d02 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -6,7 +6,7 @@ than release mode.
 
 RocksDB's library should be able to compile without any dependency installed,
 although we recommend installing some compression libraries (see below).
-We do depend on newer gcc/clang with C++17 support (GCC >= 7, Clang >= 5).
+We do depend on newer gcc/clang with C++20 support (GCC >= 11, Clang >= 10).
 
 There are few options when compiling RocksDB:
 
@@ -60,7 +60,7 @@ most processors made since roughly 2013.
 ## Supported platforms
 
 * **Linux - Ubuntu**
-    * Upgrade your gcc to version at least 7 to get C++17 support.
+    * Upgrade your gcc to version at least 11 to get C++20 support.
     * Install gflags. First, try: `sudo apt-get install libgflags-dev`
       If this doesn't work and you're using Ubuntu, here's a nice tutorial:
       (http://askubuntu.com/questions/312173/installing-gflags-12-04)
@@ -72,7 +72,7 @@ most processors made since roughly 2013.
     * Install zstandard: `sudo apt-get install libzstd-dev`.
 
 * **Linux - CentOS / RHEL**
-    * Upgrade your gcc to version at least 7 to get C++17 support
+    * Upgrade your gcc to version at least 11 to get C++20 support
     * Install gflags:
 
               git clone https://github.com/gflags/gflags.git
@@ -122,7 +122,7 @@ most processors made since roughly 2013.
               make && sudo make install
 
 * **OS X**:
-    * Install latest C++ compiler that supports C++ 17:
+    * Install latest C++ compiler that supports C++20:
         * Update XCode:  run `xcode-select --install` (or install it from XCode App's settting).
         * Install via [homebrew](http://brew.sh/).
             * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line.
@@ -213,7 +213,7 @@ most processors made since roughly 2013.
              export PATH=/opt/freeware/bin:$PATH
 
 * **Solaris Sparc**
-    * Install GCC 7 and higher.
+    * Install GCC 11 and higher.
     * Use these environment variables:
 
              export CC=gcc
diff --git a/Makefile b/Makefile
index 4b1d0414ae3c..40d7437c2f6e 100644
--- a/Makefile
+++ b/Makefile
@@ -148,10 +148,8 @@ ifeq ($(USE_COROUTINES), 1)
 	USE_FOLLY = 1
 	# glog/logging.h requires HAVE_CXX11_ATOMIC
 	OPT += -DUSE_COROUTINES -DHAVE_CXX11_ATOMIC
-	ROCKSDB_CXX_STANDARD = c++2a
 	USE_RTTI = 1
 ifneq ($(USE_CLANG), 1)
-	ROCKSDB_CXX_STANDARD = c++20
 	PLATFORM_CXXFLAGS += -fcoroutines
 endif
 endif
@@ -298,6 +296,28 @@ $(info $(shell $(CC) --version))
 $(info $(shell $(CXX) --version))
 endif
 
+# ccache support
+# Set USE_CCACHE=1 to enable ccache, or let it auto-detect
+ifndef USE_CCACHE
+  CCACHE := $(shell which ccache 2>/dev/null)
+  ifneq ($(CCACHE),)
+    USE_CCACHE := 1
+  else
+    USE_CCACHE := 0
+  endif
+endif
+
+ifeq ($(USE_CCACHE), 1)
+  CCACHE := $(shell which ccache 2>/dev/null)
+  ifneq ($(CCACHE),)
+    $(info Using ccache: $(CCACHE))
+    CC := $(CCACHE) $(CC)
+    CXX := $(CCACHE) $(CXX)
+  else
+    $(warning ccache requested but not found in PATH)
+  endif
+endif
+
 missing_make_config_paths := $(shell				\
 	grep "\./\S*\|/\S*" -o $(CURDIR)/make_config.mk | 	\
 	while read path;					\
@@ -370,8 +390,6 @@ ifdef COMPILE_WITH_TSAN
         # Turn off -pg when enabling TSAN testing, because that induces
         # a link failure.  TODO: find the root cause
 	PROFILING_FLAGS =
-	# LUA is not supported under TSAN
-	LUA_PATH =
 	# Limit keys for crash test under TSAN to avoid error:
 	# "ThreadSanitizer: DenseSlabAllocator overflow. Dying."
 	CRASH_TEST_EXT_ARGS += --max_key=1000000
@@ -448,83 +466,7 @@ else
 	PLATFORM_CXXFLAGS += -isystem $(GTEST_DIR)
 endif
 
-# This provides a Makefile simulation of a Meta-internal folly integration.
-# It is not validated for general use.
-#
-# USE_FOLLY links the build targets with libfolly.a. The latter could be
-# built using 'make build_folly', or built externally and specified in
-# the CXXFLAGS and EXTRA_LDFLAGS env variables. The build_detect_platform
-# script tries to detect if an external folly dependency has been specified.
-# If not, it exports FOLLY_PATH to the path of the installed Folly and
-# dependency libraries.
-#
-# USE_FOLLY_LITE cherry picks source files from Folly to include in the
-# RocksDB library. Its faster and has fewer dependencies on 3rd party
-# libraries, but with limited functionality. For example, coroutine
-# functionality is not available.
-ifeq ($(USE_FOLLY),1)
-ifeq ($(USE_FOLLY_LITE),1)
-$(error Please specify only one of USE_FOLLY and USE_FOLLY_LITE)
-endif
-ifneq ($(strip $(FOLLY_PATH)),)
-	BOOST_PATH = $(shell (ls -d $(FOLLY_PATH)/../boost*))
-	DBL_CONV_PATH = $(shell (ls -d $(FOLLY_PATH)/../double-conversion*))
-	GFLAGS_PATH = $(shell (ls -d $(FOLLY_PATH)/../gflags*))
-	GLOG_PATH = $(shell (ls -d $(FOLLY_PATH)/../glog*))
-	LIBEVENT_PATH = $(shell (ls -d $(FOLLY_PATH)/../libevent*))
-	XZ_PATH = $(shell (ls -d $(FOLLY_PATH)/../xz*))
-	LIBSODIUM_PATH = $(shell (ls -d $(FOLLY_PATH)/../libsodium*))
-	FMT_PATH = $(shell (ls -d $(FOLLY_PATH)/../fmt*))
-
-	# For some reason, glog and fmt libraries are under either lib or lib64
-	GLOG_LIB_PATH = $(shell (ls -d $(GLOG_PATH)/lib*))
-	FMT_LIB_PATH = $(shell (ls -d $(FMT_PATH)/lib*))
-
-	# AIX: pre-defined system headers are surrounded by an extern "C" block
-	ifeq ($(PLATFORM), OS_AIX)
-		PLATFORM_CCFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include
-		PLATFORM_CXXFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include
-	else
-		PLATFORM_CCFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include
-		PLATFORM_CXXFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include
-	endif
-
-	# Add -ldl at the end as gcc resolves a symbol in a library by searching only in libraries specified later
-	# in the command line
-	PLATFORM_LDFLAGS += $(FOLLY_PATH)/lib/libfolly.a $(BOOST_PATH)/lib/libboost_context.a $(BOOST_PATH)/lib/libboost_filesystem.a $(BOOST_PATH)/lib/libboost_atomic.a $(BOOST_PATH)/lib/libboost_program_options.a $(BOOST_PATH)/lib/libboost_regex.a $(BOOST_PATH)/lib/libboost_system.a $(BOOST_PATH)/lib/libboost_thread.a $(DBL_CONV_PATH)/lib/libdouble-conversion.a $(FMT_LIB_PATH)/libfmt.a $(GLOG_LIB_PATH)/libglog.so $(GFLAGS_PATH)/lib/libgflags.so.2.2 $(LIBEVENT_PATH)/lib/libevent-2.1.so -ldl
-	PLATFORM_LDFLAGS += -Wl,-rpath=$(GFLAGS_PATH)/lib -Wl,-rpath=$(GLOG_LIB_PATH) -Wl,-rpath=$(LIBEVENT_PATH)/lib -Wl,-rpath=$(LIBSODIUM_PATH)/lib -Wl,-rpath=$(LIBEVENT_PATH)/lib
-endif
-	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
-	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
-endif
-
-ifeq ($(USE_FOLLY_LITE),1)
-	# Path to the Folly source code and include files
-	FOLLY_DIR = ./third-party/folly
-ifneq ($(strip $(BOOST_SOURCE_PATH)),)
-	BOOST_INCLUDE = $(shell (ls -d $(BOOST_SOURCE_PATH)/boost*/))
-	# AIX: pre-defined system headers are surrounded by an extern "C" block
-	ifeq ($(PLATFORM), OS_AIX)
-		PLATFORM_CCFLAGS += -I$(BOOST_INCLUDE)
-		PLATFORM_CXXFLAGS += -I$(BOOST_INCLUDE)
-	else
-		PLATFORM_CCFLAGS += -isystem $(BOOST_INCLUDE)
-		PLATFORM_CXXFLAGS += -isystem $(BOOST_INCLUDE)
-	endif
-endif  # BOOST_SOURCE_PATH
-	# AIX: pre-defined system headers are surrounded by an extern "C" block
-	ifeq ($(PLATFORM), OS_AIX)
-		PLATFORM_CCFLAGS += -I$(FOLLY_DIR)
-		PLATFORM_CXXFLAGS += -I$(FOLLY_DIR)
-	else
-		PLATFORM_CCFLAGS += -isystem $(FOLLY_DIR)
-		PLATFORM_CXXFLAGS += -isystem $(FOLLY_DIR)
-	endif
-	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
-	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
-# TODO: fix linking with fbcode compiler config
-	PLATFORM_LDFLAGS += -lglog
-endif
+include folly.mk
 
 ifdef TEST_CACHE_LINE_SIZE
   PLATFORM_CCFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE)
@@ -564,32 +506,6 @@ ifndef DISABLE_WARNING_AS_ERROR
 endif
 
 
-ifdef LUA_PATH
-
-ifndef LUA_INCLUDE
-LUA_INCLUDE=$(LUA_PATH)/include
-endif
-
-LUA_INCLUDE_FILE=$(LUA_INCLUDE)/lualib.h
-
-ifeq ("$(wildcard $(LUA_INCLUDE_FILE))", "")
-# LUA_INCLUDE_FILE does not exist
-$(error Cannot find lualib.h under $(LUA_INCLUDE).  Try to specify both LUA_PATH and LUA_INCLUDE manually)
-endif
-LUA_FLAGS = -I$(LUA_INCLUDE) -DLUA -DLUA_COMPAT_ALL
-CFLAGS += $(LUA_FLAGS)
-CXXFLAGS += $(LUA_FLAGS)
-
-ifndef LUA_LIB
-LUA_LIB = $(LUA_PATH)/lib/liblua.a
-endif
-ifeq ("$(wildcard $(LUA_LIB))", "") # LUA_LIB does not exist
-$(error $(LUA_LIB) does not exist.  Try to specify both LUA_PATH and LUA_LIB manually)
-endif
-EXEC_LDFLAGS += $(LUA_LIB)
-
-endif
-
 ifeq ($(NO_THREEWAY_CRC32C), 1)
 	CXXFLAGS += -DNO_THREEWAY_CRC32C
 endif
@@ -638,13 +554,14 @@ endif
 TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST)
 BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(BENCH_LIB_SOURCES))
 CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES))
+POINT_LOCK_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(POINT_LOCK_BENCH_LIB_SOURCES))
 TOOL_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TOOL_LIB_SOURCES))
 ANALYZE_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ANALYZER_LIB_SOURCES))
 STRESS_OBJECTS =  $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES))
 
 # Exclude build_version.cc -- a generated source file -- from all sources.  Not needed for dependencies
 ALL_SOURCES  = $(filter-out util/build_version.cc, $(LIB_SOURCES)) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc
-ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES)
+ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(POINT_LOCK_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES)
 ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES)
 ALL_SOURCES += $(ROCKSDB_PLUGIN_SOURCES) $(ROCKSDB_PLUGIN_TESTS)
 
@@ -659,8 +576,8 @@ ifneq ($(filter check-headers, $(MAKECMDGOALS)),)
 # TODO: add/support JNI headers
 	DEV_HEADER_DIRS := $(sort include/ $(dir $(ALL_SOURCES)))
 # Some headers like in port/ are platform-specific
-	DEV_HEADERS_TO_CHECK := $(shell $(FIND) $(DEV_HEADER_DIRS) -type f -name '*.h' | grep -E -v 'port/|plugin/|lua/|range_tree/|secondary_index/')
-	PUBLIC_HEADERS_TO_CHECK := $(shell $(FIND) include/ -type f -name '*.h' | grep -E -v 'lua/')
+	DEV_HEADERS_TO_CHECK := $(shell $(FIND) $(DEV_HEADER_DIRS) -type f -name '*.h' | grep -E -v 'port/|plugin/|range_tree/|secondary_index/')
+	PUBLIC_HEADERS_TO_CHECK := $(shell $(FIND) include/ -type f -name '*.h')
 else
 	DEV_HEADERS_TO_CHECK :=
 	PUBLIC_HEADERS_TO_CHECK :=
@@ -683,7 +600,8 @@ am__v_CCH_1 =
 # user build settings
 %.h.pub: %.h # .h.pub not actually created, so re-checked on each invocation
 	$(AM_V_CCH) cd include/ && echo '#include "$(patsubst include/%,%,$<)"' | \
-	  $(CXX) -I. -DROCKSDB_NAMESPACE=42 -x c++ -c - -o /dev/null
+	  $(CXX) -std=$(or $(ROCKSDB_CXX_STANDARD),c++20) -I. -DROCKSDB_NAMESPACE=42 -x c++ -c - -o /dev/null
+	build_tools/check-public-header.sh $<
 
 check-headers: $(HEADER_OK_FILES)
 
@@ -887,7 +805,7 @@ endif  # PLATFORM_SHARED_EXT
 .PHONY: check clean coverage ldb_tests package dbg gen-pc build_size \
 	release tags tags0 valgrind_check format static_lib shared_lib all \
 	rocksdbjavastatic rocksdbjava install install-static install-shared \
-	uninstall analyze tools tools_lib check-headers checkout_folly
+	uninstall analyze tools tools_lib check-headers checkout_folly clang-tidy
 
 all: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(TESTS)
 
@@ -1075,6 +993,11 @@ watch-log:
 dump-log:
 	bash -c '$(quoted_perl_command)' < LOG
 
+# Machine-parseable progress output for automated monitoring (e.g., Claude Code)
+# Outputs JSON: {"status":"running","completed":45,"total":100,"failed":0,"percent":45,"eta_seconds":120}
+check-progress:
+	@build_tools/check_progress.sh
+
 # If J != 1 and GNU parallel is installed, run the tests in parallel,
 # via the check_0 rule above.  Otherwise, run them sequentially.
 check: all
@@ -1286,6 +1209,10 @@ tags0:
 format:
 	build_tools/format-diff.sh
 
+# Non-interactive format (auto-apply without prompts, for CI/automation/Claude Code)
+format-auto:
+	build_tools/format-diff.sh -y
+
 check-format:
 	build_tools/format-diff.sh -c
 
@@ -1295,6 +1222,15 @@ check-buck-targets:
 check-sources:
 	build_tools/check-sources.sh
 
+# Run clang-tidy on locally changed files, filtered to changed lines only.
+# Requires compile_commands.json (generate with cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON).
+# Override CLANG_TIDY_BINARY and CLANG_TIDY_JOBS as needed:
+#   make clang-tidy CLANG_TIDY_BINARY=/usr/bin/clang-tidy CLANG_TIDY_JOBS=8
+CLANG_TIDY_BINARY ?= /opt/homebrew/opt/llvm/bin/clang-tidy
+CLANG_TIDY_JOBS ?= $(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+clang-tidy:
+	python3 tools/run_clang_tidy.py --clang-tidy-binary $(CLANG_TIDY_BINARY) -j $(CLANG_TIDY_JOBS)
+
 package:
 	bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR)
 
@@ -1345,6 +1281,9 @@ block_cache_trace_analyzer: $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_tr
 cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(CACHE_BENCH_OBJECTS) $(LIBRARY)
 	$(AM_LINK)
 
+point_lock_bench: $(OBJ_DIR)/utilities/transactions/lock/point/point_lock_bench.o $(POINT_LOCK_BENCH_OBJECTS) $(LIBRARY)
+	$(AM_LINK)
+
 persistent_cache_bench: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_bench.o $(LIBRARY)
 	$(AM_LINK)
 
@@ -1357,6 +1296,9 @@ filter_bench: $(OBJ_DIR)/util/filter_bench.o $(LIBRARY)
 db_stress: $(OBJ_DIR)/db_stress_tool/db_stress.o $(STRESS_LIBRARY) $(TOOLS_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+db_stress_compression_manager: $(OBJ_DIR)/db_stress_tool/db_stress_compression_manager.o $(LIBRARY)
+	$(AM_LINK)
+
 write_stress: $(OBJ_DIR)/tools/write_stress.o $(LIBRARY)
 	$(AM_LINK)
 
@@ -1422,13 +1364,13 @@ agg_merge_test: $(OBJ_DIR)/utilities/agg_merge/agg_merge_test.o $(TEST_LIBRARY)
 stringappend_test: $(OBJ_DIR)/utilities/merge_operators/string_append/stringappend_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
-cassandra_format_test: $(OBJ_DIR)/utilities/cassandra/cassandra_format_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY)
+cassandra_format_test: $(OBJ_DIR)/utilities/cassandra/cassandra_format_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
-cassandra_functional_test: $(OBJ_DIR)/utilities/cassandra/cassandra_functional_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY)
+cassandra_functional_test: $(OBJ_DIR)/utilities/cassandra/cassandra_functional_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
-cassandra_row_merge_test: $(OBJ_DIR)/utilities/cassandra/cassandra_row_merge_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY)
+cassandra_row_merge_test: $(OBJ_DIR)/utilities/cassandra/cassandra_row_merge_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
 cassandra_serialize_test: $(OBJ_DIR)/utilities/cassandra/cassandra_serialize_test.o $(TEST_LIBRARY) $(LIBRARY)
@@ -1491,6 +1433,12 @@ db_test: $(OBJ_DIR)/db/db_test.o $(TEST_LIBRARY) $(LIBRARY)
 db_test2: $(OBJ_DIR)/db/db_test2.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+db_etc3_test: $(OBJ_DIR)/db/db_etc3_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
+compression_test: $(OBJ_DIR)/util/compression_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 db_logical_block_size_cache_test: $(OBJ_DIR)/db/db_logical_block_size_cache_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
@@ -1512,6 +1460,9 @@ db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBR
 db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+db_compaction_abort_test: $(OBJ_DIR)/db/db_compaction_abort_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 db_clip_test: $(OBJ_DIR)/db/db_clip_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
@@ -1875,6 +1826,9 @@ heap_test: $(OBJ_DIR)/util/heap_test.o $(TEST_LIBRARY) $(LIBRARY)
 point_lock_manager_test: utilities/transactions/lock/point/point_lock_manager_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+point_lock_manager_stress_test: utilities/transactions/lock/point/point_lock_manager_stress_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 transaction_test: $(OBJ_DIR)/utilities/transactions/transaction_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
@@ -1884,6 +1838,9 @@ write_committed_transaction_ts_test: $(OBJ_DIR)/utilities/transactions/write_com
 write_prepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_prepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+write_prepared_transaction_test_seqno: $(OBJ_DIR)/utilities/transactions/write_prepared_transaction_test_seqno.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 write_unprepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_unprepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
@@ -1989,6 +1946,9 @@ blob_source_test: $(OBJ_DIR)/db/blob/blob_source_test.o $(TEST_LIBRARY) $(LIBRAR
 blob_garbage_meter_test: $(OBJ_DIR)/db/blob/blob_garbage_meter_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+io_dispatcher_test: $(OBJ_DIR)/util/io_dispatcher_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 timer_test: $(OBJ_DIR)/util/timer_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
@@ -2034,6 +1994,9 @@ wide_column_serialization_test: $(OBJ_DIR)/db/wide/wide_column_serialization_tes
 wide_columns_helper_test: $(OBJ_DIR)/db/wide/wide_columns_helper_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+interval_test: $(OBJ_DIR)/util/interval_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 #-------------------------------------------------
 # make install related stuff
 PREFIX ?= /usr/local
@@ -2144,14 +2107,14 @@ ZLIB_DOWNLOAD_BASE ?= http://zlib.net
 BZIP2_VER ?= 1.0.8
 BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269
 BZIP2_DOWNLOAD_BASE ?= http://sourceware.org/pub/bzip2
-SNAPPY_VER ?= 1.2.1
-SNAPPY_SHA256 ?= 736aeb64d86566d2236ddffa2865ee5d7a82d26c9016b36218fcc27ea4f09f86
+SNAPPY_VER ?= 1.2.2
+SNAPPY_SHA256 ?= 90f74bc1fbf78a6c56b3c4a082a05103b3a56bb17bca1a27e052ea11723292dc
 SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive
-LZ4_VER ?= 1.9.4
-LZ4_SHA256 ?= 0b0e3aa07c8c063ddf40b082bdf7e37a1562bda40a0ff5272957f3e987e0e54b
+LZ4_VER ?= 1.10.0
+LZ4_SHA256 ?= 537512904744b35e232912055ccf8ec66d768639ff3abe5788d90d792ec5f48b
 LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive
-ZSTD_VER ?= 1.5.5
-ZSTD_SHA256 ?= 98e9c3d949d1b924e28e01eccb7deed865eefebf25c2f21c702e5cd5b63b85e1
+ZSTD_VER ?= 1.5.7
+ZSTD_SHA256 ?= 37d7284556b20954e56e1ca85b80226768902e2edabd3b649e9e72c0c9012ee3
 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive
 CURL_SSL_OPTS ?= --tlsv1
 
@@ -2242,7 +2205,7 @@ libsnappy.a: snappy-$(SNAPPY_VER).tar.gz
 	-rm -rf snappy-$(SNAPPY_VER)
 	tar xvzf snappy-$(SNAPPY_VER).tar.gz
 	mkdir snappy-$(SNAPPY_VER)/build
-	cd snappy-$(SNAPPY_VER)/build && CFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' CXXFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CXXFLAGS} ${EXTRA_CXXFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DSNAPPY_BUILD_BENCHMARKS=OFF -DSNAPPY_BUILD_TESTS=OFF --compile-no-warning-as-error ${PLATFORM_CMAKE_FLAGS} .. && $(MAKE) ${SNAPPY_MAKE_TARGET}
+	cd snappy-$(SNAPPY_VER)/build && CFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' CXXFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CXXFLAGS} ${EXTRA_CXXFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DSNAPPY_BUILD_BENCHMARKS=OFF -DSNAPPY_BUILD_TESTS=OFF ${PLATFORM_CMAKE_FLAGS} .. && $(MAKE) ${SNAPPY_MAKE_TARGET}
 	cp snappy-$(SNAPPY_VER)/build/libsnappy.a .
 
 lz4-$(LZ4_VER).tar.gz:
@@ -2372,27 +2335,27 @@ rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86
 
 rocksdbjavastaticdockerx86:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_x86-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerx86_64:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_x64-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_x64-be --platform linux/amd64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerppc64le:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_ppc64le-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_ppc64le-be --platform linux/ppc64le --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerarm64v8:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_arm64v8-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_arm64v8-be --platform linux/aarch64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockers390x:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_s390x-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu18_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_s390x-be --platform linux/s390x --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu18_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerriscv64:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_riscv64-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu20_riscv64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_riscv64-be --platform linux/riscv64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu20_riscv64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerx86musl:
 	mkdir -p java/target
@@ -2400,19 +2363,19 @@ rocksdbjavastaticdockerx86musl:
 
 rocksdbjavastaticdockerx86_64musl:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_x64-musl-be --platform linux/amd64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerppc64lemusl:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_ppc64le-musl-be --platform linux/ppc64le --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerarm64v8musl:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_arm64v8-musl-be --platform linux/aarch64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockers390xmusl:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_s390x-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_s390x-musl-be --platform linux/s390x --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral
 
@@ -2467,8 +2430,8 @@ jtest_run:
 jtest: rocksdbjava
 	cd java;$(MAKE) sample test
 
-jpmd: rocksdbjava rocksdbjavageneratepom
-	cd java;$(MAKE) pmd
+jpmd: rocksdbjavageneratepom
+	cd java;$(MAKE) java java_test pmd
 
 jdb_bench:
 	cd java;$(MAKE) db_bench;
@@ -2478,38 +2441,6 @@ commit_prereq:
 	false # J=$(J) build_tools/precommit_checker.py unit clang_unit release clang_release tsan asan ubsan lite unit_non_shm
 	# $(MAKE) clean && $(MAKE) jclean && $(MAKE) rocksdbjava;
 
-# For public CI runs, checkout folly in a way that can build with RocksDB.
-# This is mostly intended as a test-only simulation of Meta-internal folly
-# integration.
-checkout_folly:
-	if [ -e third-party/folly ]; then \
-		cd third-party/folly && ${GIT_COMMAND} fetch origin; \
-	else \
-		cd third-party && ${GIT_COMMAND} clone https://github.com/facebook/folly.git; \
-	fi
-	@# Pin to a particular version for public CI, so that PR authors don't
-	@# need to worry about folly breaking our integration. Update periodically
-	cd third-party/folly && git reset --hard 78286282478e1ae05b2e8cbcf0e2139eab283bea
-	@# NOTE: this hack is required for clang in some cases
-	perl -pi -e 's/int rv = syscall/int rv = (int)syscall/' third-party/folly/folly/detail/Futex.cpp
-	@# NOTE: this hack is required for gcc in some cases
-	perl -pi -e 's/(__has_include.<experimental.memory_resource>.)/__cpp_rtti && $$1/' third-party/folly/folly/memory/MemoryResource.h
-	@# NOTE: boost source will be needed for any build including `USE_FOLLY_LITE` builds as those depend on boost headers
-	cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py fetch boost
-
-CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS))
-
-build_folly:
-	FOLLY_INST_PATH=`cd third-party/folly; $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
-	if [ "$$FOLLY_INST_PATH" ]; then \
-		rm -rf $${FOLLY_INST_PATH}/../../*; \
-	else \
-		echo "Please run checkout_folly first"; \
-		false; \
-	fi
-	cd third-party/folly && \
-		CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests
-
 # ---------------------------------------------------------------------------
 #   Build size testing
 # ---------------------------------------------------------------------------
@@ -2630,7 +2561,7 @@ list_all_tests:
 
 # Remove the rules for which dependencies should not be generated and see if any are left.
 #If so, include the dependencies; if not, do not include the dependency files
-ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets check-headers check-sources jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test checkout_folly, $(MAKECMDGOALS))
+ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets check-headers check-sources clang-tidy jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test checkout_folly, $(MAKECMDGOALS))
 ifneq ("$(ROCKS_DEP_RULES)", "")
 -include $(DEPFILES)
 endif
diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py
index 035254b5ad1f..647353e44f3c 100755
--- a/buckifier/buckify_rocksdb.py
+++ b/buckifier/buckify_rocksdb.py
@@ -135,6 +135,9 @@ def generate_buck(repo_path, deps_map):
 
     BUCK = TARGETSBuilder("%s/BUCK" % repo_path, extra_argv)
 
+    # Add oncall("rocksdb_point_of_contact") at the top
+    BUCK.add_oncall("rocksdb_point_of_contact")
+
     # rocksdb_lib
     BUCK.add_library(
         "rocksdb_lib",
@@ -206,6 +209,12 @@ def generate_buck(repo_path, deps_map):
         src_mk.get("CACHE_BENCH_LIB_SOURCES", []),
         [":rocksdb_lib"],
     )
+    # rocksdb_point_lock_bench_tools_lib
+    BUCK.add_library(
+        "rocksdb_point_lock_bench_tools_lib",
+        src_mk.get("POINT_LOCK_BENCH_LIB_SOURCES", []),
+        [":rocksdb_lib"],
+    )
     # rocksdb_stress_lib
     BUCK.add_rocksdb_library(
         "rocksdb_stress_lib",
@@ -229,6 +238,12 @@ def generate_buck(repo_path, deps_map):
     BUCK.add_binary(
         "cache_bench", ["cache/cache_bench.cc"], [":rocksdb_cache_bench_tools_lib"]
     )
+    # point_lock_bench binary
+    BUCK.add_binary(
+        "point_lock_bench",
+        ["utilities/transactions/lock/point/point_lock_bench.cc"],
+        [":rocksdb_point_lock_bench_tools_lib"]
+    )
     # bench binaries
     for src in src_mk.get("MICROBENCH_SOURCES", []):
         name = src.rsplit("/", 1)[1].split(".")[0] if "/" in src else src.split(".")[0]
diff --git a/buckifier/targets_builder.py b/buckifier/targets_builder.py
index e62eaf958504..1f0f412e18e3 100644
--- a/buckifier/targets_builder.py
+++ b/buckifier/targets_builder.py
@@ -45,6 +45,11 @@ def __init__(self, path, extra_argv):
         self.total_bin = 0
         self.total_test = 0
         self.tests_cfg = ""
+    
+    def add_oncall(self, oncall):
+       with open(self.path, "ab") as targets_file:
+            targets_file.write(targets_cfg.oncall_template.format(name=oncall).encode("utf-8"))
+                
 
     def add_library(
         self,
diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py
index 4e58d1210200..e9ff129a604a 100644
--- a/buckifier/targets_cfg.py
+++ b/buckifier/targets_cfg.py
@@ -1,10 +1,12 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#  This source code is licensed under both the GPLv2 (found in the COPYING file in the root directory)
+#  and the Apache 2.0 License (found in the LICENSE.Apache file in the root directory).
 
 rocksdb_target_header_template = """# This file \100generated by:
 #$ python3 buckifier/buckify_rocksdb.py{extra_argv}
 # --> DO NOT EDIT MANUALLY <--
-# This file is a Facebook-specific integration for buck builds, so can
-# only be validated by Facebook employees.
+# This file is a Meta-specific integration for buck builds, so can
+# only be validated by Meta employees.
 load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrapper","cpp_binary_wrapper","cpp_unittest_wrapper","fancy_bench_wrapper","add_c_test_wrapper")
 load("@fbcode_macros//build_defs:export_files.bzl", "export_file")
 
@@ -41,3 +43,8 @@
 export_file_template = """
 export_file(name = "{name}")
 """
+
+
+oncall_template = """
+oncall("{name}")
+"""
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
index 629b670b43d6..cfb8d143664b 100755
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -45,18 +45,21 @@ if test -z "$OUTPUT"; then
   exit 1
 fi
 
-# we depend on C++17, but should be compatible with newer standards
+# we depend on C++20, but should be compatible with newer standards
 if [ "$ROCKSDB_CXX_STANDARD" ]; then
   PLATFORM_CXXFLAGS="-std=$ROCKSDB_CXX_STANDARD"
 else
-  PLATFORM_CXXFLAGS="-std=c++17"
+  PLATFORM_CXXFLAGS="-std=c++20"
 fi
 
 # we currently depend on POSIX platform
 COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX"
 
-# Default to fbcode gcc on internal fb machines
-if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
+# Default to fbcode gcc on Meta internal machines
+IS_META_HOST="$(hostname | grep -E '(facebook|meta).com|fbinfra.net')"
+if [ -z "$ROCKSDB_NO_FBCODE" -a "$IS_META_HOST" ]; then
+  if [ -d /mnt/gvfs/third-party ]; then
+    echo "NOTE: Using fbcode build" >&2
     FBCODE_BUILD="true"
     # If we're compiling with TSAN or shared lib, we need pic build
     PIC_BUILD=$COMPILE_WITH_TSAN
@@ -64,6 +67,11 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
       PIC_BUILD=1
     fi
     source "$PWD/build_tools/fbcode_config_platform010.sh"
+  else
+    echo "************************************************************************" >&2
+    echo "WARNING: -d /mnt/gvfs/third-party failed; no fbcode build" >&2
+    echo "************************************************************************" >&2
+  fi
 fi
 
 # Delete existing output, if it exists
@@ -71,7 +79,9 @@ rm -f "$OUTPUT"
 touch "$OUTPUT"
 
 if test -z "$CC"; then
-    if [ -x "$(command -v cc)" ]; then
+    if [ "$USE_CLANG" -a -x "$(command -v clang)" ]; then
+        CC=clang
+    elif [ -x "$(command -v cc)" ]; then
         CC=cc
     elif [ -x "$(command -v clang)" ]; then
         CC=clang
@@ -81,7 +91,9 @@ if test -z "$CC"; then
 fi
 
 if test -z "$CXX"; then
-    if [ -x "$(command -v g++)" ]; then
+    if [ "$USE_CLANG" -a -x "$(command -v clang++)" ]; then
+        CXX=clang++
+    elif [ -x "$(command -v g++)" ]; then
         CXX=g++
     elif [ -x "$(command -v clang++)" ]; then
         CXX=clang++
@@ -91,7 +103,9 @@ if test -z "$CXX"; then
 fi
 
 if test -z "$AR"; then
-    if [ -x "$(command -v gcc-ar)" ]; then
+    if [ "$USE_CLANG" -a -x "$(command -v llvm-ar)" ]; then
+        AR=llvm-ar
+    elif [ -x "$(command -v gcc-ar)" ]; then
         AR=gcc-ar
     elif [ -x "$(command -v llvm-ar)" ]; then
         AR=llvm-ar
@@ -297,7 +311,8 @@ EOF
 EOF
         then
           COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1"
-          PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+          # Hack: don't link extra gflags assuming it comes with folly
+          [ "$USE_FOLLY" ] || PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
         # check if namespace is gflags
         elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF
             #include <gflags/gflags.h>
@@ -306,7 +321,8 @@ EOF
 EOF
         then
           COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=gflags"
-          PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+          # Hack: don't link extra gflags assuming it comes with folly
+          [ "$USE_FOLLY" ] || PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
         # check if namespace is google
         elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF
             #include <gflags/gflags.h>
@@ -758,6 +774,7 @@ fi
 if [ "$USE_FOLLY_LITE" ]; then
   if [ "$FOLLY_DIR" ]; then
     BOOST_SOURCE_PATH=`cd $FOLLY_DIR && $PYTHON build/fbcode_builder/getdeps.py show-source-dir boost`
+    FMT_SOURCE_PATH=`cd $FOLLY_DIR && $PYTHON build/fbcode_builder/getdeps.py show-source-dir fmt`
   fi
 fi
 
@@ -802,6 +819,7 @@ echo "FIND=$FIND" >> "$OUTPUT"
 echo "WATCH=$WATCH" >> "$OUTPUT"
 echo "FOLLY_PATH=$FOLLY_PATH" >> "$OUTPUT"
 echo "BOOST_SOURCE_PATH=$BOOST_SOURCE_PATH" >> "$OUTPUT"
+echo "FMT_SOURCE_PATH=$FMT_SOURCE_PATH" >> "$OUTPUT"
 
 # This will enable some related identifiers for the preprocessor
 if test -n "$JEMALLOC"; then
@@ -813,7 +831,6 @@ fi
 if test -n "$WITH_JEMALLOC_FLAG"; then
   echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" >> "$OUTPUT"
 fi
-echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT"
 if test -n "$USE_FOLLY"; then
   echo "USE_FOLLY=$USE_FOLLY" >> "$OUTPUT"
 fi
diff --git a/build_tools/check-public-header.sh b/build_tools/check-public-header.sh
new file mode 100755
index 000000000000..bb1bc147dc0a
--- /dev/null
+++ b/build_tools/check-public-header.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
+#
+# Check for some simple mistakes in public headers (on the command line)
+# that should prevent commit or push
+
+BAD=""
+
+# Look for potential for ODR violations caused by public headers depending on
+# build parameters that could vary between RocksDB build and application build.
+# * Cases like ROCKSDB_NAMESPACE, and ROCKSDB_ASSERT_STATUS_CHECKED are
+#   intentional, hard to avoid. (We expect definitions to change and the user
+#   should also.)
+# * Cases like _WIN32, OS_WIN, and __cplusplus are essentially ODR-safe.
+# * Cases like
+#   #ifdef BLAH  // ODR-SAFE
+#   #undef BLAH
+#   #endif
+#   that should not cause ODR violations can be exempted with the ODR-SAFE
+#   marker recognized here.
+
+grep -nHE '^#if' -- "$@" | grep -vE 'ROCKSDB_NAMESPACE|ROCKSDB_ASSERT_STATUS_CHECKED|_WIN32|OS_WIN|ODR-SAFE|__cplusplus|ROCKSDB_DLL|ROCKSDB_LIBRARY_EXPORTS'
+if [ "$?" != "1" ]; then
+  echo "^^^^^ #if in public API could cause an ODR violation."
+  echo "      Add // ODR-SAFE if verified safe."
+  BAD=1
+fi
+
+if [ "$BAD" ]; then
+  exit 1
+fi
diff --git a/build_tools/check_progress.sh b/build_tools/check_progress.sh
new file mode 100755
index 000000000000..d52a91dabd05
--- /dev/null
+++ b/build_tools/check_progress.sh
@@ -0,0 +1,231 @@
+#!/bin/bash
+# Output test progress in JSON format for machine parsing
+# Usage: build_tools/check_progress.sh
+
+LOG_FILE="LOG"
+T_DIR="t"
+SRC_MK="src.mk"
+
+# Maximum lines of test output to include per failed test
+MAX_OUTPUT_LINES=50
+
+# Helper to escape string for JSON (handles newlines, quotes, backslashes, tabs)
+json_escape() {
+    local str="$1"
+    # Use python for reliable JSON escaping if available, otherwise use sed
+    if command -v python3 &>/dev/null; then
+        printf '%s' "$str" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read())[1:-1], end="")'
+    else
+        printf '%s' "$str" | sed 's/\\/\\\\/g; s/"/\\"/g; s/\t/\\t/g; s/\r/\\r/g' | awk '{printf "%s\\n", $0}' | sed 's/\\n$//'
+    fi
+}
+
+# Helper to output JSON and exit
+output_json() {
+    local status="$1"
+    local completed="${2:-0}"
+    local total="${3:-0}"
+    local failed="${4:-0}"
+    local percent="${5:-0}"
+    local eta="${6:-0}"
+    local avg_time="${7:-0}"
+    local last_item="${8:-}"
+    local phase="${9:-}"
+    local failed_tests="${10:-}"
+
+    # Build JSON output
+    local json="{\"status\":\"$status\""
+
+    if [[ -n "$phase" ]]; then
+        json="$json,\"phase\":\"$phase\""
+    fi
+
+    json="$json,\"completed\":$completed,\"total\":$total,\"failed\":$failed,\"percent\":$percent"
+    json="$json,\"eta_seconds\":$eta,\"avg_time\":\"$avg_time\",\"last_item\":\"$(json_escape "$last_item")\""
+
+    if [[ -n "$failed_tests" ]]; then
+        json="$json,\"failed_tests\":[$failed_tests]"
+    fi
+
+    json="$json}"
+    echo "$json"
+}
+
+# Get failed test info with log output
+get_failed_tests_json() {
+    local log_file="$1"
+    local t_dir="$2"
+    local max_failures=10
+    local count=0
+    local first=true
+
+    # Get failed tests from LOG file
+    while IFS=$'\t' read -r seq host starttime runtime send recv exitval signal cmd; do
+        # Skip header line
+        [[ "$seq" == "Seq" ]] && continue
+
+        # Check if failed (exitval != 0 or signal != 0)
+        if [[ "$exitval" != "0" || "$signal" != "0" ]]; then
+            # Extract test name from command
+            test_name=$(echo "$cmd" | sed 's,.*/run-,,;s, .*,,')
+
+            # Get log file path
+            log_path="$t_dir/log-run-$test_name"
+
+            # Read test output (last N lines)
+            if [[ -f "$log_path" ]]; then
+                output=$(tail -n "$MAX_OUTPUT_LINES" "$log_path" 2>/dev/null)
+            else
+                output="(log file not found: $log_path)"
+            fi
+
+            # Escape output for JSON
+            escaped_output=$(json_escape "$output")
+
+            # Build JSON object for this failure
+            if [[ "$first" == "true" ]]; then
+                first=false
+            else
+                printf ","
+            fi
+            printf '{"test":"%s","exit_code":%d,"signal":%d,"output":"%s"}' \
+                "$test_name" "$exitval" "$signal" "$escaped_output"
+
+            ((count++))
+            if [[ $count -ge $max_failures ]]; then
+                break
+            fi
+        fi
+    done < "$log_file"
+}
+
+# Check if tests are running (LOG file exists)
+if [[ -f "$LOG_FILE" ]]; then
+    # Count total tests from t/run-* files
+    if [[ -d "$T_DIR" ]]; then
+        total=$(find "$T_DIR" -name 'run-*' -type f 2>/dev/null | wc -l)
+    else
+        total=0
+    fi
+
+    # If no parallel tests generated yet
+    if [[ "$total" -eq 0 ]]; then
+        output_json "running" 0 0 0 0 0 "0" "" "generating"
+        exit 0
+    fi
+
+    # Parse LOG file (skip header line)
+    # LOG format: Seq Host Starttime JobRuntime Send Receive Exitval Signal Command
+    completed=$(tail -n +2 "$LOG_FILE" 2>/dev/null | wc -l)
+
+    # Count failures
+    failed=$(awk -F'\t' 'NR>1 && ($7 != 0 || $8 != 0) {count++} END {print count+0}' "$LOG_FILE" 2>/dev/null)
+
+    # Get failed tests JSON with output (only if there are failures)
+    if [[ "$failed" -gt 0 ]]; then
+        failed_tests=$(get_failed_tests_json "$LOG_FILE" "$T_DIR")
+    else
+        failed_tests=""
+    fi
+
+    # Calculate percentage
+    if [[ "$total" -gt 0 ]]; then
+        percent=$((completed * 100 / total))
+    else
+        percent=0
+    fi
+
+    # Get last completed test name (extract from command column)
+    last_test=$(tail -1 "$LOG_FILE" 2>/dev/null | awk -F'\t' '{print $9}' | sed 's,.*/run-,,;s, .*,,;s,^./,,')
+
+    # Calculate ETA based on average time
+    if [[ "$completed" -gt 0 ]]; then
+        avg_time=$(awk -F'\t' 'NR>1 {sum+=$4; count++} END {if(count>0) printf "%.1f", sum/count; else print "0"}' "$LOG_FILE")
+        remaining=$((total - completed))
+        eta=$(awk "BEGIN {printf \"%.0f\", $avg_time * $remaining}")
+    else
+        avg_time="0"
+        eta="0"
+    fi
+
+    # Determine status
+    if [[ "$completed" -ge "$total" ]]; then
+        status="completed"
+    elif [[ "$completed" -gt 0 ]]; then
+        status="running"
+    else
+        status="starting"
+    fi
+
+    output_json "$status" "$completed" "$total" "$failed" "$percent" "$eta" "$avg_time" "$last_test" "testing" "$failed_tests"
+    exit 0
+fi
+
+# No LOG file - check if we're in compilation/linking phase
+# Count expected source files from src.mk
+if [[ -f "$SRC_MK" ]]; then
+    # Count LIB_SOURCES (library object files to compile)
+    expected_lib_objects=$(grep -E '\.cc\s*\\?$' "$SRC_MK" | grep -v '^#' | wc -l)
+
+    # Count TEST_MAIN_SOURCES (test binaries to link)
+    expected_test_binaries=$(sed -n '/^TEST_MAIN_SOURCES =/,/^[^ ]/p' "$SRC_MK" | grep -cE '\.cc\s*\\?$' 2>/dev/null || echo 0)
+else
+    expected_lib_objects=0
+    expected_test_binaries=0
+fi
+
+# Check for test generation phase (t/ directory being created)
+if [[ -d "$T_DIR" ]]; then
+    total=$(find "$T_DIR" -name 'run-*' -type f 2>/dev/null | wc -l)
+    if [[ "$total" -gt 0 ]]; then
+        output_json "running" 0 "$total" 0 0 0 "0" "" "generating"
+        exit 0
+    fi
+fi
+
+# Count compiled object files (in subdirectories matching source structure)
+# Object files are created as dir/file.o (e.g., cache/cache.o, db/db_impl.o)
+compiled_objects=0
+if [[ "$expected_lib_objects" -gt 0 ]]; then
+    # Count .o files in source directories
+    compiled_objects=$(find cache db env file logging memory memtable monitoring options port table test_util trace_replay util utilities -name '*.o' -type f 2>/dev/null | wc -l)
+fi
+
+# Count linked test binaries (test binaries are in current directory with _test suffix)
+linked_tests=0
+if [[ "$expected_test_binaries" -gt 0 ]]; then
+    linked_tests=$(find . -maxdepth 1 -name '*_test' -type f -executable 2>/dev/null | wc -l)
+fi
+
+# Determine phase based on what exists
+if [[ "$compiled_objects" -eq 0 && "$linked_tests" -eq 0 ]]; then
+    # Nothing compiled yet - not started or just beginning
+    output_json "not_started" 0 0 0 0 0 "0" ""
+    exit 0
+fi
+
+# Calculate total work units: compiling + linking
+total_work=$((expected_lib_objects + expected_test_binaries))
+completed_work=$((compiled_objects + linked_tests))
+
+if [[ "$total_work" -gt 0 ]]; then
+    percent=$((completed_work * 100 / total_work))
+else
+    percent=0
+fi
+
+# Determine phase
+if [[ "$compiled_objects" -lt "$expected_lib_objects" ]]; then
+    phase="compiling"
+    # Get most recently modified .o file as last_item
+    last_item=$(find cache db env file logging memory memtable monitoring options port table test_util trace_replay util utilities -name '*.o' -type f -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2- | sed 's,^\./,,;s,\.o$,,')
+elif [[ "$linked_tests" -lt "$expected_test_binaries" ]]; then
+    phase="linking"
+    # Get most recently modified test binary as last_item
+    last_item=$(find . -maxdepth 1 -name '*_test' -type f -executable -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2- | sed 's,^\./,,')
+else
+    phase="generating"
+    last_item=""
+fi
+
+output_json "running" "$completed_work" "$total_work" 0 "$percent" 0 "0" "$last_item" "$phase"
diff --git a/build_tools/dependencies_platform010.sh b/build_tools/dependencies_platform010.sh
index 9b19a801c85f..a55663cb25da 100644
--- a/build_tools/dependencies_platform010.sh
+++ b/build_tools/dependencies_platform010.sh
@@ -19,4 +19,3 @@ BENCHMARK_BASE=/mnt/gvfs/third-party2/benchmark/780c7a0f9cf0967961e69ad08e61cddd
 KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/624a2f8f6c93c3c1df8aa4a6255d8202631a6c80/fb/platform010/da39a3e
 BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/39579e8603b48b3540f8b0633f43adf29acccb8b/2.37/centos8-native/da39a3e
 VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/cd9cc656d49ecb53797ce4d055e49fde29fd57ff/3.19.0/platform010/76ebdda
-LUA_BASE=/mnt/gvfs/third-party2/lua/363787fa5cac2a8aa20638909210443278fa138e/5.3.4/platform010/9079c97
diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh
index 02732bde3d1c..802e757795c7 100644
--- a/build_tools/fbcode_config.sh
+++ b/build_tools/fbcode_config.sh
@@ -164,12 +164,4 @@ EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GF
 
 VALGRIND_VER="$VALGRIND_BASE/bin/"
 
-LUA_PATH="$LUA_BASE"
-
-if test -z $PIC_BUILD; then
-  LUA_LIB=" $LUA_PATH/lib/liblua.a"
-else
-  LUA_LIB=" $LUA_PATH/lib/liblua_pic.a"
-fi
-
-export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB
+export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD
diff --git a/build_tools/fbcode_config_platform010.sh b/build_tools/fbcode_config_platform010.sh
index 87a28b4f92d0..0fc99ecad159 100644
--- a/build_tools/fbcode_config_platform010.sh
+++ b/build_tools/fbcode_config_platform010.sh
@@ -172,4 +172,4 @@ EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GF
 
 VALGRIND_VER="$VALGRIND_BASE/bin/"
 
-export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB
+export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD
diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh
index 9dc85496c91b..55ee4bd6e24f 100755
--- a/build_tools/format-diff.sh
+++ b/build_tools/format-diff.sh
@@ -7,14 +7,18 @@ print_usage () {
   echo "Usage:"
   echo "format-diff.sh [OPTIONS]"
   echo "-c: check only."
+  echo "-y: auto-apply formatting without prompts (non-interactive mode)."
   echo "-h: print this message."
 }
 
-while getopts ':ch' OPTION; do
+while getopts ':cyh' OPTION; do
   case "$OPTION" in
     c)
       CHECK_ONLY=1
       ;;
+    y)
+      AUTO_APPLY=1
+      ;;
     h)
       print_usage
       exit 1
@@ -118,6 +122,9 @@ fi
 # fi
 set -e
 
+# Exclude third-party from formatting
+EXCLUDE=':!third-party/'
+
 uncommitted_code=`git diff HEAD`
 
 # If there's no uncommitted changes, we assume user are doing post-commit
@@ -137,14 +144,78 @@ then
   # should be relevant for formatting fixes.
   FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)"
   # Get the differences
-  diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1) || true
+  diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" -- $EXCLUDE | $CLANG_FORMAT_DIFF -p 1) || true
   echo "Checking format of changes not yet in $FORMAT_UPSTREAM..."
 else
   # Check the format of uncommitted lines,
-  diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) || true
+  diffs=$(git diff -U0 HEAD -- $EXCLUDE | $CLANG_FORMAT_DIFF -p 1) || true
   echo "Checking format of uncommitted changes..."
 fi
 
+# Check for missing copyright in new files
+echo "Checking for copyright headers in new files..."
+
+# Get list of new files (added, not just modified)
+if [ -z "$uncommitted_code" ]; then
+  # Post-commit: check files added since merge base
+  new_files=$(git diff --name-only --diff-filter=A "$FORMAT_UPSTREAM_MERGE_BASE" -- '*.h' '*.cc' '*.py' $EXCLUDE)
+else
+  # Pre-commit: check staged new files
+  new_files=$(git diff --name-only --diff-filter=A --cached HEAD -- '*.h' '*.cc' '*.py' $EXCLUDE)
+fi
+
+if [ -n "$new_files" ]; then
+  files_missing_copyright=""
+
+  for file in $new_files; do
+    if [ -f "$file" ]; then
+      # Check if file is missing copyright
+      # For .py files, check for Python-style comment
+      # For .h and .cc files, check for C++-style comment
+      if [[ "$file" == *.py ]]; then
+        if ! grep -q "Copyright (c) Meta Platforms, Inc. and affiliates" "$file"; then
+          files_missing_copyright="$files_missing_copyright $file"
+          # Add copyright header to Python file
+          temp_file=$(mktemp)
+          {
+            echo "#  Copyright (c) Meta Platforms, Inc. and affiliates."
+            echo "#  This source code is licensed under both the GPLv2 (found in the COPYING file in the root directory)"
+            echo "#  and the Apache 2.0 License (found in the LICENSE.Apache file in the root directory)."
+            echo
+            cat "$file"
+          } > "$temp_file"
+          mv "$temp_file" "$file"
+          echo "Added copyright header to $file"
+        fi
+      elif [[ "$file" == *.h ]] || [[ "$file" == *.cc ]]; then
+        if ! grep -q "Copyright (c) Meta Platforms, Inc. and affiliates" "$file"; then
+          files_missing_copyright="$files_missing_copyright $file"
+          # Add copyright header to C++ file
+          temp_file=$(mktemp)
+          {
+            echo "//  Copyright (c) Meta Platforms, Inc. and affiliates. "
+            echo "//  This source code is licensed under both the GPLv2 (found in the "
+            echo "//  COPYING file in the root directory) and Apache 2.0 License "
+            echo "//  (found in the LICENSE.Apache file in the root directory)."
+            echo
+            cat "$file"
+          } > "$temp_file"
+          mv "$temp_file" "$file"
+          echo "Added copyright header to $file"
+        fi
+      fi
+    fi
+  done
+
+  if [ -n "$files_missing_copyright" ]; then
+    echo "Copyright headers were added to new files."
+  else
+    echo "All new files have copyright headers."
+  fi
+else
+  echo "No new files to check for copyright headers."
+fi
+
 if [ -z "$diffs" ]
 then
   echo "Nothing needs to be reformatted!"
@@ -173,11 +244,16 @@ echo "$diffs" |
   sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
   sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
 
-echo -e "Would you like to fix the format automatically (y/n): \c"
+# Handle auto-apply mode (non-interactive)
+if [ "$AUTO_APPLY" ]; then
+  to_fix="y"
+else
+  echo -e "Would you like to fix the format automatically (y/n): \c"
 
-# Make sure under any mode, we can read user input.
-exec < /dev/tty
-read to_fix
+  # Make sure under any mode, we can read user input.
+  exec < /dev/tty
+  read to_fix
+fi
 
 if [ "$to_fix" != "y" ]
 then
@@ -187,14 +263,15 @@ fi
 # Do in-place format adjustment.
 if [ -z "$uncommitted_code" ]
 then
-  git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -i -p 1
+  git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" -- $EXCLUDE | $CLANG_FORMAT_DIFF -i -p 1
 else
-  git diff -U0 HEAD | $CLANG_FORMAT_DIFF -i -p 1
+  git diff -U0 HEAD -- $EXCLUDE | $CLANG_FORMAT_DIFF -i -p 1
 fi
 echo "Files reformatted!"
 
 # Amend to last commit if user do the post-commit format check
-if [ -z "$uncommitted_code" ]; then
+# Skip amend prompt in auto-apply mode (user can amend manually if desired)
+if [ -z "$uncommitted_code" ] && [ -z "$AUTO_APPLY" ]; then
   echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c"
   read to_amend
 
diff --git a/build_tools/getdeps_fallback_mirror.py b/build_tools/getdeps_fallback_mirror.py
new file mode 100644
index 000000000000..7b3bb31b584d
--- /dev/null
+++ b/build_tools/getdeps_fallback_mirror.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""
+Pre-download packages with unreliable mirrors using fallback mirrors.
+Reads package info from folly's getdeps manifest files.
+"""
+import sys
+import os
+import hashlib
+import subprocess
+import configparser
+
+def sha256_file(path):
+    """Calculate SHA256 hash of a file."""
+    h = hashlib.sha256()
+    try:
+        with open(path, 'rb') as f:
+            for chunk in iter(lambda: f.read(65536), b''):
+                h.update(chunk)
+        return h.hexdigest()
+    except Exception:
+        return None
+
+def parse_manifest(manifest_path):
+    """Parse a getdeps manifest file to extract download info."""
+    config = configparser.ConfigParser()
+    try:
+        config.read(manifest_path)
+        if 'download' in config:
+            return {
+                'url': config['download'].get('url', ''),
+                'sha256': config['download'].get('sha256', ''),
+            }
+    except Exception:
+        pass
+    return None
+
+def get_fallback_mirrors(url):
+    """Get fallback mirror URLs for a given URL."""
+    # Fallback mirror patterns for known unreliable hosts
+    mirror_fallbacks = {
+        "ftp.gnu.org/gnu/": [
+            "https://mirrors.kernel.org/gnu/",
+            "https://ftpmirror.gnu.org/gnu/",
+            "https://ftp.gnu.org/gnu/",
+        ],
+        "ftpmirror.gnu.org/gnu/": [
+            "https://mirrors.kernel.org/gnu/",
+            "https://ftpmirror.gnu.org/gnu/",
+            "https://ftp.gnu.org/gnu/",
+        ],
+    }
+
+    for pattern, mirrors in mirror_fallbacks.items():
+        if pattern in url:
+            # Extract the path after the pattern
+            path_start = url.find(pattern) + len(pattern)
+            path = url[path_start:]
+            return [mirror + path for mirror in mirrors]
+    return [url]  # No fallback, use original
+
+def main():
+    if len(sys.argv) != 4:
+        print(f"Usage: {sys.argv[0]} <download_dir> <cache_dir> <manifests_dir>")
+        sys.exit(1)
+
+    download_dir, cache_dir, manifests_dir = sys.argv[1], sys.argv[2], sys.argv[3]
+
+    # Packages known to have unreliable mirrors
+    packages_to_check = ["autoconf", "automake", "libtool"]
+
+    for package in packages_to_check:
+        manifest_path = os.path.join(manifests_dir, package)
+        if not os.path.exists(manifest_path):
+            continue
+
+        info = parse_manifest(manifest_path)
+        if not info or not info['url'] or not info['sha256']:
+            continue
+
+        # Determine filename from URL
+        url = info['url']
+        expected_sha256 = info['sha256']
+        url_filename = os.path.basename(url)
+
+        # getdeps uses format: {package}-{filename}
+        filename = f"{package}-{url_filename}"
+        filepath = os.path.join(download_dir, filename)
+        cache_path = os.path.join(cache_dir, filename)
+
+        # Check if already valid
+        if os.path.exists(filepath) and sha256_file(filepath) == expected_sha256:
+            print(f"  {filename}: OK (already downloaded)")
+            continue
+
+        # Check cache
+        if os.path.exists(cache_path) and sha256_file(cache_path) == expected_sha256:
+            print(f"  {filename}: OK (from cache)")
+            subprocess.run(['cp', cache_path, filepath], check=True)
+            continue
+
+        # Try fallback mirrors
+        mirrors = get_fallback_mirrors(url)
+        downloaded = False
+        for mirror_url in mirrors:
+            print(f"  {filename}: trying {mirror_url}...")
+            try:
+                subprocess.run(['wget', '-q', '-O', filepath, mirror_url], check=True, timeout=120)
+                if sha256_file(filepath) == expected_sha256:
+                    print(f"  {filename}: OK (downloaded)")
+                    subprocess.run(['cp', filepath, cache_path], check=False)
+                    downloaded = True
+                    break
+                else:
+                    os.remove(filepath)
+            except Exception:
+                if os.path.exists(filepath):
+                    os.remove(filepath)
+
+        if not downloaded:
+            print(f"  {filename}: WARNING - all mirrors failed")
+
+if __name__ == "__main__":
+    main()
diff --git a/build_tools/ubuntu22_image/Dockerfile b/build_tools/ubuntu22_image/Dockerfile
new file mode 100644
index 000000000000..cb627f33daa7
--- /dev/null
+++ b/build_tools/ubuntu22_image/Dockerfile
@@ -0,0 +1,88 @@
+# INSTRUCTIONS:
+# I was not able to build docker images on an isolated devserver because of
+# issues with proxy internet access. Use a public cloud or other Linux system.
+# (I used a Debian system after installing docker features, adding my user to
+# the docker and docker-registry groups, and logging out and back in to pick
+# those up.)
+#
+# Follow https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry#authenticating-with-a-personal-access-token-classic
+# to login with your GitHub credentials, as in
+#
+# $ docker login ghcr.io -u pdillinger
+#
+# and paste the limited-purpose GitHub token into the terminal.
+#
+# Then in the build_tools/ubuntu22_image directory, (bump minor version for
+# random docker file updates, major version tracks Ubuntu release)
+#
+# $ docker build -t ghcr.io/facebook/rocksdb_ubuntu:22.0
+# $ docker push ghcr.io/facebook/rocksdb_ubuntu:22.0
+#
+# Might need to change visibility to public through
+# https://github.com/orgs/facebook/packages/container/rocksdb_ubuntu/settings
+# or similar.
+
+# from official ubuntu 22.04
+FROM ubuntu:22.04
+# update system
+RUN apt-get update
+RUN apt-get upgrade -y
+# install basic tools
+RUN apt-get install -y vim wget curl
+# install tzdata noninteractive
+RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
+# install git and default compilers
+RUN apt-get install -y git gcc g++ clang clang-tools
+# install basic package
+RUN apt-get install -y lsb-release software-properties-common gnupg
+# install gflags, tbb
+RUN apt-get install -y libgflags-dev libtbb-dev
+# install compression libs
+RUN apt-get install -y libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev
+# install cmake
+RUN apt-get install -y cmake
+RUN apt-get install -y libssl-dev
+# install clang-13
+WORKDIR /root
+RUN wget https://apt.llvm.org/llvm.sh
+RUN chmod +x llvm.sh
+RUN ./llvm.sh 13 all
+# There are incompatibilities between clang with -std=c++20 and libstdc++
+# provided by gcc, so we have to compile with clang-13 using -stdlib=libc++
+# and only one version of libc++ can be installed on the system at one time.
+# So to avoid confusion we remove unusable clang-14 also.
+RUN apt-get install libc++-13-dev libc++abi-13-dev
+RUN apt-get purge -y clang-14 && apt-get autoremove -y
+
+# install gcc-10 and more, default is 11
+RUN apt-get install -y gcc-10 g++-10
+RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
+RUN apt-get install -y gcc-13 g++-13
+# install apt-get install -y valgrind
+RUN apt-get install -y valgrind
+# install folly depencencies
+# Missing compatible libunwind: RUN apt-get install -y libgoogle-glog-dev
+# So instead install from source. This currently requires compiling with
+# -DGLOG_USE_GLOG_EXPORT
+RUN wget https://github.com/google/glog/archive/refs/tags/v0.7.1.tar.gz && tar xzf v0.7.1.tar.gz && cd glog-0.7.1/ && cmake -S . -B build -G "Unix Makefiles" && cmake --build build && cmake --build build --target install && cd .. && rm -rf v0.7.1.tar.gz glog-0.7.1
+# install openjdk 8
+RUN apt-get install -y openjdk-8-jdk
+ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
+# install mingw
+RUN apt-get install -y mingw-w64
+
+# install gtest-parallel package
+RUN git clone --single-branch --branch master --depth 1 https://github.com/google/gtest-parallel.git ~/gtest-parallel
+ENV PATH $PATH:/root/gtest-parallel
+
+# install libprotobuf for fuzzers test
+RUN apt-get install -y ninja-build binutils liblzma-dev libz-dev pkg-config autoconf libtool
+RUN git clone --branch v1.0 https://github.com/google/libprotobuf-mutator.git ~/libprotobuf-mutator && cd ~/libprotobuf-mutator && git checkout ffd86a32874e5c08a143019aad1aaf0907294c9f && mkdir build && cd build && cmake .. -GNinja -DCMAKE_C_COMPILER=clang-13 -DCMAKE_CXX_COMPILER=clang++-13 -DCMAKE_BUILD_TYPE=Release -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=ON && ninja && ninja install
+ENV PKG_CONFIG_PATH /usr/local/OFF/:/root/libprotobuf-mutator/build/external.protobuf/lib/pkgconfig/
+ENV PROTOC_BIN /root/libprotobuf-mutator/build/external.protobuf/bin/protoc
+
+# install the latest google benchmark
+RUN git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark && cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install && cd ~ && rm -rf /root/benchmark
+
+# clean up
+RUN rm -rf /var/lib/apt/lists/*
diff --git a/build_tools/ubuntu24_image/Dockerfile b/build_tools/ubuntu24_image/Dockerfile
new file mode 100644
index 000000000000..0f7e98ca6e9f
--- /dev/null
+++ b/build_tools/ubuntu24_image/Dockerfile
@@ -0,0 +1,72 @@
+# INSTRUCTIONS:
+# I was not able to build docker images on an isolated devserver because of
+# issues with proxy internet access. Use a public cloud or other Linux system.
+# (I used a Debian system after installing docker features, adding my user to
+# the docker and docker-registry groups, and logging out and back in to pick
+# those up.)
+#
+# Follow https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry#authenticating-with-a-personal-access-token-classic
+# to login with your GitHub credentials, as in
+#
+# $ docker login ghcr.io -u pdillinger
+#
+# and paste the limited-purpose GitHub token into the terminal.
+#
+# Then in the build_tools/ubuntu24_image directory, (bump minor version for
+# random docker file updates, major version tracks Ubuntu release)
+#
+# $ docker build -t ghcr.io/facebook/rocksdb_ubuntu:24.0
+# $ docker push ghcr.io/facebook/rocksdb_ubuntu:24.0
+#
+# Might need to change visibility to public through
+# https://github.com/orgs/facebook/packages/container/rocksdb_ubuntu/settings
+# or similar.
+
+# from official ubuntu 24.04
+FROM ubuntu:24.04
+# update system
+RUN apt-get update
+RUN apt-get upgrade -y
+# install basic tools
+RUN apt-get install -y vim wget curl
+# install tzdata noninteractive
+RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
+# install git and default compilers
+RUN apt-get install -y git gcc g++ clang clang-tools
+# install basic package
+RUN apt-get install -y lsb-release software-properties-common gnupg
+# install gflags, tbb
+RUN apt-get install -y libgflags-dev libtbb-dev
+# install compression libs
+RUN apt-get install -y libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev
+# install cmake
+RUN apt-get install -y cmake
+RUN apt-get install -y libssl-dev
+
+# install gcc-12 and more, default is 13
+RUN apt-get install -y gcc-12 g++-12 gcc-14 g++-14
+# install apt-get install -y valgrind
+RUN apt-get install -y valgrind
+# install folly depencencies
+RUN apt-get install -y libgoogle-glog-dev
+# install openjdk 8
+RUN apt-get install -y openjdk-8-jdk
+ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
+# install mingw
+RUN apt-get install -y mingw-w64
+
+# install gtest-parallel package
+RUN git clone --single-branch --branch master --depth 1 https://github.com/google/gtest-parallel.git ~/gtest-parallel
+ENV PATH $PATH:/root/gtest-parallel
+
+# install libprotobuf for fuzzers test
+RUN apt-get install -y ninja-build binutils liblzma-dev libz-dev pkg-config autoconf libtool
+RUN git clone --branch v1.0 https://github.com/google/libprotobuf-mutator.git ~/libprotobuf-mutator && cd ~/libprotobuf-mutator && git checkout ffd86a32874e5c08a143019aad1aaf0907294c9f && mkdir build && cd build && cmake .. -GNinja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=ON && ninja && ninja install
+ENV PKG_CONFIG_PATH /usr/local/OFF/:/root/libprotobuf-mutator/build/external.protobuf/lib/pkgconfig/
+ENV PROTOC_BIN /root/libprotobuf-mutator/build/external.protobuf/bin/protoc
+
+# install the latest google benchmark
+RUN git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark && cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install && cd ~ && rm -rf /root/benchmark
+
+# clean up
+RUN rm -rf /var/lib/apt/lists/*
diff --git a/build_tools/update_dependencies.sh b/build_tools/update_dependencies.sh
index afc39ab8009a..6584cd6edaca 100755
--- a/build_tools/update_dependencies.sh
+++ b/build_tools/update_dependencies.sh
@@ -101,6 +101,5 @@ get_lib_base benchmark  LATEST  platform010
 get_lib_base kernel-headers fb platform010
 get_lib_base binutils   LATEST centos8-native
 get_lib_base valgrind   LATEST platform010
-get_lib_base lua        5.3.4  platform010
 
 git diff $OUTPUT
diff --git a/cache/cache.cc b/cache/cache.cc
index 3556f61243e9..f94a379d200c 100644
--- a/cache/cache.cc
+++ b/cache/cache.cc
@@ -54,11 +54,6 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct CompressedSecondaryCacheOptions, compression_type),
           OptionType::kCompressionType, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
-        {"compress_format_version",
-         {offsetof(struct CompressedSecondaryCacheOptions,
-                   compress_format_version),
-          OptionType::kUInt32T, OptionVerificationType::kNormal,
-          OptionTypeFlags::kMutable}},
         {"enable_custom_split_merge",
          {offsetof(struct CompressedSecondaryCacheOptions,
                    enable_custom_split_merge),
diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc
index a5e589f4689f..7b62fbae662a 100644
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@@ -60,6 +60,8 @@ DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");
 DEFINE_uint32(value_bytes_estimate, 0,
               "If > 0, overrides estimated_entry_charge or "
               "min_avg_entry_charge depending on cache_type.");
+DEFINE_double(compressible_to_ratio, 0.5,
+              "Approximate size ratio that values can be compressed to.");
 
 DEFINE_int32(
     degenerate_hash_bits, 0,
@@ -117,7 +119,7 @@ DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random");
 DEFINE_string(secondary_cache_uri, "",
               "Full URI for creating a custom secondary cache object");
 
-DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
+DEFINE_string(cache_type, "hyper_clock_cache", "Type of block cache.");
 
 DEFINE_bool(use_jemalloc_no_dump_allocator, false,
             "Whether to use JemallocNoDumpAllocator");
@@ -182,6 +184,11 @@ DEFINE_bool(sck_randomize, false,
 DEFINE_bool(sck_footer_unique_id, false,
             "(-stress_cache_key) Simulate using proposed footer unique id");
 // ## END stress_cache_key sub-tool options ##
+// ## BEGIN stress_cache_instances sub-tool options ##
+DEFINE_uint32(stress_cache_instances, 0,
+              "If > 0, run cache instance stress test instead");
+// Uses cache_size and cache_type, maybe more
+// ## END stress_cache_instance sub-tool options ##
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -291,10 +298,19 @@ struct KeyGen {
 
 Cache::ObjectPtr createValue(Random64& rnd, MemoryAllocator* alloc) {
   char* rv = AllocateBlock(FLAGS_value_bytes, alloc).release();
-  // Fill with some filler data, and take some CPU time
-  for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) {
+  // Fill with some filler data, and take some CPU time, but add redundancy
+  // as requested for compressibility.
+  uint32_t random_fill_size = std::max(
+      uint32_t{1}, std::min(FLAGS_value_bytes,
+                            static_cast<uint32_t>(FLAGS_compressible_to_ratio *
+                                                  FLAGS_value_bytes)));
+  uint32_t i = 0;
+  for (; i < random_fill_size; i += 8) {
     EncodeFixed64(rv + i, rnd.Next());
   }
+  for (; i < FLAGS_value_bytes; i++) {
+    rv[i] = rv[i % random_fill_size];
+  }
   return rv;
 }
 
@@ -309,16 +325,16 @@ Status SaveToFn(Cache::ObjectPtr from_obj, size_t /*from_offset*/,
 
 Status CreateFn(const Slice& data, CompressionType /*type*/,
                 CacheTier /*source*/, Cache::CreateContext* /*context*/,
-                MemoryAllocator* /*allocator*/, Cache::ObjectPtr* out_obj,
+                MemoryAllocator* alloc, Cache::ObjectPtr* out_obj,
                 size_t* out_charge) {
-  *out_obj = new char[data.size()];
+  *out_obj = AllocateBlock(data.size(), alloc).release();
   memcpy(*out_obj, data.data(), data.size());
   *out_charge = data.size();
   return Status::OK();
 };
 
 void DeleteFn(Cache::ObjectPtr value, MemoryAllocator* alloc) {
-  CustomDeleter{alloc}(static_cast<char*>(value));
+  CacheAllocationDeleter{alloc}(static_cast<char*>(value));
 }
 
 Cache::CacheItemHelper helper1_wos(CacheEntryRole::kDataBlock, DeleteFn);
@@ -376,7 +392,12 @@ class CacheBench {
       fprintf(stderr, "Percentages must add to 100.\n");
       exit(1);
     }
+    cache_ = MakeCache();
+  }
+
+  ~CacheBench() = default;
 
+  static std::shared_ptr<Cache> MakeCache() {
     std::shared_ptr<MemoryAllocator> allocator;
     if (FLAGS_use_jemalloc_no_dump_allocator) {
       JemallocAllocatorOptions opts;
@@ -395,12 +416,12 @@ class CacheBench {
       opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
       opts.memory_allocator = allocator;
       opts.eviction_effort_cap = FLAGS_eviction_effort_cap;
-      if (FLAGS_cache_type == "fixed_hyper_clock_cache" ||
-          FLAGS_cache_type == "hyper_clock_cache") {
+      if (FLAGS_cache_type == "fixed_hyper_clock_cache") {
         opts.estimated_entry_charge = FLAGS_value_bytes_estimate > 0
                                           ? FLAGS_value_bytes_estimate
                                           : FLAGS_value_bytes;
-      } else if (FLAGS_cache_type == "auto_hyper_clock_cache") {
+      } else if (FLAGS_cache_type == "auto_hyper_clock_cache" ||
+                 FLAGS_cache_type == "hyper_clock_cache") {
         if (FLAGS_value_bytes_estimate > 0) {
           opts.min_avg_entry_charge = FLAGS_value_bytes_estimate;
         }
@@ -409,7 +430,7 @@ class CacheBench {
         exit(1);
       }
       ConfigureSecondaryCache(opts);
-      cache_ = opts.MakeSharedCache();
+      return opts.MakeSharedCache();
     } else if (FLAGS_cache_type == "lru_cache") {
       LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
                            false /* strict_capacity_limit */,
@@ -417,15 +438,13 @@ class CacheBench {
       opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
       opts.memory_allocator = allocator;
       ConfigureSecondaryCache(opts);
-      cache_ = NewLRUCache(opts);
+      return NewLRUCache(opts);
     } else {
       fprintf(stderr, "Cache type not supported.\n");
       exit(1);
     }
   }
 
-  ~CacheBench() = default;
-
   void PopulateCache() {
     Random64 rnd(FLAGS_seed);
     KeyGen keygen;
@@ -479,7 +498,7 @@ class CacheBench {
 
     PrintEnv();
     SharedState shared(this);
-    std::vector<std::unique_ptr<ThreadState> > threads(FLAGS_threads);
+    std::vector<std::unique_ptr<ThreadState>> threads(FLAGS_threads);
     for (uint32_t i = 0; i < FLAGS_threads; i++) {
       threads[i].reset(new ThreadState(i, &shared));
       std::thread(ThreadBody, threads[i].get()).detach();
@@ -1141,6 +1160,59 @@ class StressCacheKey {
   double multiplier_ = 0.0;
 };
 
+// cache_bench -stress_cache_instances is a partially independent embedded tool
+// for evaluating the time and space required to create and destroy many cache
+// instances, as this is considered important for a default cache implementation
+// which could see many throw-away instances in handling of Options, or created
+// in large numbers for many very small DBs with many CFs. Prefix command line
+// with /usr/bin/time to see max RSS memory.
+class StressCacheInstances {
+ public:
+  void Run() {
+    const int kNumIterations = 10;
+    const auto clock = SystemClock::Default().get();
+    caches_.reserve(FLAGS_stress_cache_instances);
+
+    uint64_t total_create_time_us = 0;
+    uint64_t total_destroy_time_us = 0;
+
+    for (int iter = 0; iter < kNumIterations; ++iter) {
+      // Create many cache instances
+      uint64_t start_create = clock->NowMicros();
+      for (uint32_t i = 0; i < FLAGS_stress_cache_instances; ++i) {
+        caches_.emplace_back(CacheBench::MakeCache());
+      }
+      uint64_t end_create = clock->NowMicros();
+      uint64_t create_time = end_create - start_create;
+      total_create_time_us += create_time;
+
+      // Destroy them
+      uint64_t start_destroy = clock->NowMicros();
+      caches_.clear();
+      uint64_t end_destroy = clock->NowMicros();
+      uint64_t destroy_time = end_destroy - start_destroy;
+      total_destroy_time_us += destroy_time;
+
+      printf(
+          "Iteration %d: Created %u caches in %.3f ms, destroyed in %.3f ms\n",
+          iter + 1, FLAGS_stress_cache_instances, create_time / 1000.0,
+          destroy_time / 1000.0);
+    }
+
+    printf("Average creation time: %.3f ms (%.1f us per cache)\n",
+           static_cast<double>(total_create_time_us) / kNumIterations / 1000.0,
+           static_cast<double>(total_create_time_us) / kNumIterations /
+               FLAGS_stress_cache_instances);
+    printf("Average destruction time: %.3f ms (%.1f us per cache)\n",
+           static_cast<double>(total_destroy_time_us) / kNumIterations / 1000.0,
+           static_cast<double>(total_destroy_time_us) / kNumIterations /
+               FLAGS_stress_cache_instances);
+  }
+
+ private:
+  std::vector<std::shared_ptr<Cache>> caches_;
+};
+
 int cache_bench_tool(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ParseCommandLineFlags(&argc, &argv, true);
@@ -1151,6 +1223,11 @@ int cache_bench_tool(int argc, char** argv) {
     return 0;
   }
 
+  if (FLAGS_stress_cache_instances > 0) {
+    StressCacheInstances().Run();
+    return 0;
+  }
+
   if (FLAGS_threads <= 0) {
     fprintf(stderr, "threads number <= 0\n");
     exit(1);
diff --git a/cache/cache_entry_stats.h b/cache/cache_entry_stats.h
index 9968995da95a..f8c5e422e896 100644
--- a/cache/cache_entry_stats.h
+++ b/cache/cache_entry_stats.h
@@ -101,23 +101,23 @@ class CacheEntryStatsCollector {
   }
 
   // Gets saved stats, regardless of age
-  void GetStats(Stats *stats) {
+  void GetStats(Stats* stats) {
     std::lock_guard<std::mutex> lock(saved_mutex_);
     *stats = saved_stats_;
   }
 
-  Cache *GetCache() const { return cache_; }
+  Cache* GetCache() const { return cache_; }
 
   // Gets or creates a shared instance of CacheEntryStatsCollector in the
   // cache itself, and saves into `ptr`. This shared_ptr will hold the
   // entry in cache until all refs are destroyed.
-  static Status GetShared(Cache *raw_cache, SystemClock *clock,
-                          std::shared_ptr<CacheEntryStatsCollector> *ptr) {
+  static Status GetShared(Cache* raw_cache, SystemClock* clock,
+                          std::shared_ptr<CacheEntryStatsCollector>* ptr) {
     assert(raw_cache);
     BasicTypedCacheInterface<CacheEntryStatsCollector, CacheEntryRole::kMisc>
         cache{raw_cache};
 
-    const Slice &cache_key = GetCacheKey();
+    const Slice& cache_key = GetCacheKey();
     auto h = cache.Lookup(cache_key);
     if (h == nullptr) {
       // Not yet in cache, but Cache doesn't provide a built-in way to
@@ -152,7 +152,7 @@ class CacheEntryStatsCollector {
   }
 
  private:
-  explicit CacheEntryStatsCollector(Cache *cache, SystemClock *clock)
+  explicit CacheEntryStatsCollector(Cache* cache, SystemClock* clock)
       : saved_stats_(),
         working_stats_(),
         last_start_time_micros_(0),
@@ -160,7 +160,7 @@ class CacheEntryStatsCollector {
         cache_(cache),
         clock_(clock) {}
 
-  static const Slice &GetCacheKey() {
+  static const Slice& GetCacheKey() {
     // For each template instantiation
     static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime();
     static Slice ckey_slice = ckey.AsSlice();
@@ -175,8 +175,8 @@ class CacheEntryStatsCollector {
   uint64_t last_start_time_micros_;
   uint64_t last_end_time_micros_;
 
-  Cache *const cache_;
-  SystemClock *const clock_;
+  Cache* const cache_;
+  SystemClock* const clock_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/cache/cache_key.cc b/cache/cache_key.cc
index addff61d17b0..a5553c0d257c 100644
--- a/cache/cache_key.cc
+++ b/cache/cache_key.cc
@@ -24,7 +24,7 @@ namespace ROCKSDB_NAMESPACE {
 //              0 |      >= 1<<63 | CreateUniqueForProcessLifetime
 //            > 0 |           any | OffsetableCacheKey.WithOffset
 
-CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) {
+CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache* cache) {
   // +1 so that we can reserve all zeros for "unset" cache key
   uint64_t id = cache->NewId() + 1;
   // Ensure we don't collide with CreateUniqueForProcessLifetime
@@ -297,8 +297,8 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //
 // TODO: Nevertheless / regardless, an efficient way to detect (and thus
 // quantify) block cache corruptions, including collisions, should be added.
-OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
-                                       const std::string &db_session_id,
+OffsetableCacheKey::OffsetableCacheKey(const std::string& db_id,
+                                       const std::string& db_session_id,
                                        uint64_t file_number) {
   UniqueId64x2 internal_id;
   Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number,
diff --git a/cache/cache_key.h b/cache/cache_key.h
index 0b93c6bd9472..4cf5d2e7d34b 100644
--- a/cache/cache_key.h
+++ b/cache/cache_key.h
@@ -44,13 +44,13 @@ class CacheKey {
   inline Slice AsSlice() const {
     static_assert(sizeof(*this) == 16, "Standardized on 16-byte cache key");
     assert(!IsEmpty());
-    return Slice(reinterpret_cast<const char *>(this), sizeof(*this));
+    return Slice(reinterpret_cast<const char*>(this), sizeof(*this));
   }
 
   // Create a CacheKey that is unique among others associated with this Cache
   // instance. Depends on Cache::NewId. This is useful for block cache
   // "reservations".
-  static CacheKey CreateUniqueForCacheLifetime(Cache *cache);
+  static CacheKey CreateUniqueForCacheLifetime(Cache* cache);
 
   // Create a CacheKey that is unique among others for the lifetime of this
   // process. This is useful for saving in a static data member so that
@@ -87,7 +87,7 @@ class OffsetableCacheKey : private CacheKey {
 
   // Constructs an OffsetableCacheKey with the given information about a file.
   // This constructor never generates an "empty" base key.
-  OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id,
+  OffsetableCacheKey(const std::string& db_id, const std::string& db_session_id,
                      uint64_t file_number);
 
   // Creates an OffsetableCacheKey from an SST unique ID, so that cache keys
@@ -134,9 +134,9 @@ class OffsetableCacheKey : private CacheKey {
     static_assert(sizeof(file_num_etc64_) == kCommonPrefixSize,
                   "8 byte common prefix expected");
     assert(!IsEmpty());
-    assert(&this->file_num_etc64_ == static_cast<const void *>(this));
+    assert(&this->file_num_etc64_ == static_cast<const void*>(this));
 
-    return Slice(reinterpret_cast<const char *>(this), kCommonPrefixSize);
+    return Slice(reinterpret_cast<const char*>(this), kCommonPrefixSize);
   }
 };
 
diff --git a/cache/cache_reservation_manager.h b/cache/cache_reservation_manager.h
index a7b06dea2073..deff5be8a285 100644
--- a/cache/cache_reservation_manager.h
+++ b/cache/cache_reservation_manager.h
@@ -44,8 +44,8 @@ class CacheReservationManager {
                                         bool increase) = 0;
   virtual Status MakeCacheReservation(
       std::size_t incremental_memory_used,
-      std::unique_ptr<CacheReservationManager::CacheReservationHandle>
-          *handle) = 0;
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle>*
+          handle) = 0;
   virtual std::size_t GetTotalReservedCacheSize() = 0;
   virtual std::size_t GetTotalMemoryUsed() = 0;
 };
@@ -90,11 +90,11 @@ class CacheReservationManagerImpl
                                        bool delayed_decrease = false);
 
   // no copy constructor, copy assignment, move constructor, move assignment
-  CacheReservationManagerImpl(const CacheReservationManagerImpl &) = delete;
-  CacheReservationManagerImpl &operator=(const CacheReservationManagerImpl &) =
+  CacheReservationManagerImpl(const CacheReservationManagerImpl&) = delete;
+  CacheReservationManagerImpl& operator=(const CacheReservationManagerImpl&) =
       delete;
-  CacheReservationManagerImpl(CacheReservationManagerImpl &&) = delete;
-  CacheReservationManagerImpl &operator=(CacheReservationManagerImpl &&) =
+  CacheReservationManagerImpl(CacheReservationManagerImpl&&) = delete;
+  CacheReservationManagerImpl& operator=(CacheReservationManagerImpl&&) =
       delete;
 
   ~CacheReservationManagerImpl() override;
@@ -178,7 +178,7 @@ class CacheReservationManagerImpl
   // REQUIRES: handle != nullptr
   Status MakeCacheReservation(
       std::size_t incremental_memory_used,
-      std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle)
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle>* handle)
       override;
 
   // Return the size of the cache (which is a multiple of kSizeDummyEntry)
@@ -200,7 +200,7 @@ class CacheReservationManagerImpl
   // For testing only - it is to help ensure the CacheItemHelperForRole<R>
   // accessed from CacheReservationManagerImpl and the one accessed from the
   // test are from the same translation units
-  static const Cache::CacheItemHelper *TEST_GetCacheItemHelperForRole();
+  static const Cache::CacheItemHelper* TEST_GetCacheItemHelperForRole();
 
  private:
   static constexpr std::size_t kSizeDummyEntry = 256 * 1024;
@@ -216,7 +216,7 @@ class CacheReservationManagerImpl
   bool delayed_decrease_;
   std::atomic<std::size_t> cache_allocated_size_;
   std::size_t memory_used_;
-  std::vector<Cache::Handle *> dummy_handles_;
+  std::vector<Cache::Handle*> dummy_handles_;
   CacheKey cache_key_;
 };
 
@@ -251,14 +251,14 @@ class ConcurrentCacheReservationManager
       std::shared_ptr<CacheReservationManager> cache_res_mgr) {
     cache_res_mgr_ = std::move(cache_res_mgr);
   }
-  ConcurrentCacheReservationManager(const ConcurrentCacheReservationManager &) =
+  ConcurrentCacheReservationManager(const ConcurrentCacheReservationManager&) =
       delete;
-  ConcurrentCacheReservationManager &operator=(
-      const ConcurrentCacheReservationManager &) = delete;
-  ConcurrentCacheReservationManager(ConcurrentCacheReservationManager &&) =
+  ConcurrentCacheReservationManager& operator=(
+      const ConcurrentCacheReservationManager&) = delete;
+  ConcurrentCacheReservationManager(ConcurrentCacheReservationManager&&) =
       delete;
-  ConcurrentCacheReservationManager &operator=(
-      ConcurrentCacheReservationManager &&) = delete;
+  ConcurrentCacheReservationManager& operator=(
+      ConcurrentCacheReservationManager&&) = delete;
 
   ~ConcurrentCacheReservationManager() override {}
 
@@ -286,7 +286,7 @@ class ConcurrentCacheReservationManager
 
   inline Status MakeCacheReservation(
       std::size_t incremental_memory_used,
-      std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle)
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle>* handle)
       override {
     std::unique_ptr<CacheReservationManager::CacheReservationHandle>
         wrapped_handle;
diff --git a/cache/cache_test.cc b/cache/cache_test.cc
index 12bcfe6cd437..b762fe4f8af7 100644
--- a/cache/cache_test.cc
+++ b/cache/cache_test.cc
@@ -644,7 +644,7 @@ using TypedHandle = SharedCache::TypedHandle;
 
 TEST_P(CacheTest, SetCapacity) {
   if (IsHyperClock()) {
-    // TODO: update test & code for limited supoort
+    // TODO: update test & code for limited support
     ROCKSDB_GTEST_BYPASS(
         "HyperClockCache doesn't support arbitrary capacity "
         "adjustments.");
diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index 090213cb0d02..70155791a41c 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -10,12 +10,12 @@
 #include "cache/clock_cache.h"
 
 #include <algorithm>
-#include <atomic>
 #include <bitset>
 #include <cassert>
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <cstdio>
 #include <exception>
 #include <functional>
 #include <numeric>
@@ -26,10 +26,9 @@
 #include "cache/cache_key.h"
 #include "cache/secondary_cache_adapter.h"
 #include "logging/logging.h"
-#include "monitoring/perf_context_imp.h"
-#include "monitoring/statistics_impl.h"
-#include "port/lang.h"
+#include "port/likely.h"
 #include "rocksdb/env.h"
+#include "util/autovector.h"
 #include "util/hash.h"
 #include "util/math.h"
 #include "util/random.h"
@@ -39,13 +38,11 @@ namespace ROCKSDB_NAMESPACE {
 namespace clock_cache {
 
 namespace {
-inline uint64_t GetRefcount(uint64_t meta) {
-  return ((meta >> ClockHandle::kAcquireCounterShift) -
-          (meta >> ClockHandle::kReleaseCounterShift)) &
-         ClockHandle::kCounterMask;
-}
+using SlotMeta = ClockHandle::SlotMeta;
+using AcquireCounter = SlotMeta::AcquireCounter;
+using ReleaseCounter = SlotMeta::ReleaseCounter;
 
-inline uint64_t GetInitialCountdown(Cache::Priority priority) {
+inline uint32_t GetInitialCountdown(Cache::Priority priority) {
   // Set initial clock data from priority
   // TODO: configuration parameters for priority handling and clock cycle
   // count?
@@ -66,11 +63,11 @@ inline uint64_t GetInitialCountdown(Cache::Priority priority) {
 inline void MarkEmpty(ClockHandle& h) {
 #ifndef NDEBUG
   // Mark slot as empty, with assertion
-  uint64_t meta = h.meta.Exchange(0);
-  assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction);
+  auto old_meta = h.meta.Exchange({});
+  assert(old_meta.IsUnderConstruction());
 #else
   // Mark slot as empty
-  h.meta.Store(0);
+  h.meta.Store({});
 #endif
 }
 
@@ -86,18 +83,20 @@ inline void FreeDataMarkEmpty(ClockHandle& h, MemoryAllocator* allocator) {
 
 // Called to undo the effect of referencing an entry for internal purposes,
 // so it should not be marked as having been used.
-inline void Unref(const ClockHandle& h, uint64_t count = 1) {
+inline void Unref(const ClockHandle& h, uint32_t count = 1) {
   // Pretend we never took the reference
   // WART: there's a tiny chance we release last ref to invisible
   // entry here. If that happens, we let eviction take care of it.
-  uint64_t old_meta = h.meta.FetchSub(ClockHandle::kAcquireIncrement * count);
-  assert(GetRefcount(old_meta) != 0);
+  SlotMeta old_meta;
+  h.meta.Apply(AcquireCounter::MinusTransformPromiseNoUnderflow(count),
+               &old_meta);
+  assert(old_meta.GetRefcount() != 0);
   (void)old_meta;
 }
 
 inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data,
                         bool* purgeable = nullptr) {
-  uint64_t meta;
+  SlotMeta meta;
   if (purgeable) {
     assert(*purgeable == false);
     // In AutoHCC, our eviction process follows the chain structure, so we
@@ -111,46 +110,40 @@ inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data,
     meta = h.meta.LoadRelaxed();
   }
 
-  if (((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit) ==
-      0) {
+  if (!meta.IsShareable()) {
     // Only clock update Shareable entries
     if (purgeable) {
       *purgeable = true;
       // AutoHCC only: make sure we only attempt to update non-empty slots
-      assert((meta >> ClockHandle::kStateShift) &
-             ClockHandle::kStateOccupiedBit);
+      assert(!meta.IsEmpty());
     }
     return false;
   }
-  uint64_t acquire_count =
-      (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask;
-  uint64_t release_count =
-      (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask;
+  uint32_t acquire_count = meta.GetAcquireCounter();
+  uint32_t release_count = meta.GetReleaseCounter();
   if (acquire_count != release_count) {
     // Only clock update entries with no outstanding refs
     data->seen_pinned_count++;
     return false;
   }
-  if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
-      acquire_count > 0) {
+  if (meta.IsVisible() && acquire_count > 0) {
     // Decrement clock
-    uint64_t new_count =
-        std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1);
+    uint32_t new_count =
+        std::min(acquire_count - 1, uint32_t{ClockHandle::kMaxCountdown} - 1);
     // Compare-exchange in the decremented clock info, but
     // not aggressively
-    uint64_t new_meta =
-        (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) |
-        (meta & ClockHandle::kHitBitMask) |
-        (new_count << ClockHandle::kReleaseCounterShift) |
-        (new_count << ClockHandle::kAcquireCounterShift);
+    SlotMeta new_meta = meta;
+    new_meta.SetReleaseCounter(new_count);
+    new_meta.SetAcquireCounter(new_count);
     h.meta.CasStrongRelaxed(meta, new_meta);
     return false;
   }
   // Otherwise, remove entry (either unreferenced invisible or
   // unreferenced and expired visible).
-  if (h.meta.CasStrong(meta, (uint64_t{ClockHandle::kStateConstruction}
-                              << ClockHandle::kStateShift) |
-                                 (meta & ClockHandle::kHitBitMask))) {
+  SlotMeta construction_meta;
+  construction_meta.SetUnderConstruction();
+  construction_meta.SetHit(meta.GetHit());
+  if (h.meta.CasStrong(meta, construction_meta)) {
     // Took ownership.
     data->freed_charge += h.GetTotalCharge();
     data->freed_count += 1;
@@ -216,39 +209,39 @@ inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data,
 // counter to reach "high" state again and bumped back to "medium." (This
 // motivates only checking for release counter in high state, not both in high
 // state.)
-inline void CorrectNearOverflow(uint64_t old_meta,
-                                AcqRelAtomic<uint64_t>& meta) {
+inline void CorrectNearOverflow(SlotMeta old_meta,
+                                BitFieldsAtomic<SlotMeta>& meta) {
   // We clear both top-most counter bits at the same time.
-  constexpr uint64_t kCounterTopBit = uint64_t{1}
-                                      << (ClockHandle::kCounterNumBits - 1);
-  constexpr uint64_t kClearBits =
-      (kCounterTopBit << ClockHandle::kAcquireCounterShift) |
-      (kCounterTopBit << ClockHandle::kReleaseCounterShift);
-  // A simple check that allows us to initiate clearing the top bits for
-  // a large portion of the "high" state space on release counter.
-  constexpr uint64_t kCheckBits =
-      (kCounterTopBit | (ClockHandle::kMaxCountdown + 1))
-      << ClockHandle::kReleaseCounterShift;
+  constexpr uint32_t kCounterTopBit = uint32_t{1}
+                                      << (SlotMeta::kCounterNumBits - 1);
+  // The threshold for correcting "near overflow" is to ensure
+  // (a) the value has a top bit set that can be cleared
+  // (b) when we clear the top bit, the eviction state will be preserved
+  //     (everything >= kMaxCountdown is treated equivalently)
+  // As mentioned above, we only check the release count.
+  constexpr uint32_t kThreshold = kCounterTopBit + ClockHandle::kMaxCountdown;
 
-  if (UNLIKELY(old_meta & kCheckBits)) {
-    meta.FetchAndRelaxed(~kClearBits);
+  if (UNLIKELY(old_meta.GetReleaseCounter() > kThreshold)) {
+    auto clear_transform = AcquireCounter::AndTransform(kCounterTopBit - 1) +
+                           ReleaseCounter::AndTransform(kCounterTopBit - 1);
+    meta.ApplyRelaxed(clear_transform);
   }
 }
 
 inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
-                            uint64_t initial_countdown, bool* already_matches) {
+                            uint32_t initial_countdown, bool* already_matches) {
   assert(*already_matches == false);
   // Optimistically transition the slot from "empty" to
   // "under construction" (no effect on other states)
-  uint64_t old_meta = h.meta.FetchOr(uint64_t{ClockHandle::kStateOccupiedBit}
-                                     << ClockHandle::kStateShift);
-  uint64_t old_state = old_meta >> ClockHandle::kStateShift;
+  auto set_occupied = SlotMeta::OccupiedFlag::SetTransform();
+  SlotMeta old_meta;
+  h.meta.Apply(set_occupied, &old_meta);
 
-  if (old_state == ClockHandle::kStateEmpty) {
+  if (old_meta.IsEmpty()) {
     // We've started inserting into an available slot, and taken
     // ownership.
     return true;
-  } else if (old_state != ClockHandle::kStateVisible) {
+  } else if (!old_meta.IsVisible()) {
     // Slot not usable / touchable now
     return false;
   }
@@ -256,15 +249,17 @@ inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
   // But first, we need to acquire a ref to read it. In fact, number of
   // refs for initial countdown, so that we boost the clock state if
   // this is a match.
-  old_meta =
-      h.meta.FetchAdd(ClockHandle::kAcquireIncrement * initial_countdown);
+  auto add_acquire =
+      AcquireCounter::PlusTransformPromiseNoOverflow(initial_countdown);
+  h.meta.Apply(add_acquire, &old_meta);
   // Like Lookup
-  if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) {
+  if (old_meta.IsVisible()) {
     // Acquired a read reference
     if (h.hashed_key == proto.hashed_key) {
       // Match. Release in a way that boosts the clock state
-      old_meta =
-          h.meta.FetchAdd(ClockHandle::kReleaseIncrement * initial_countdown);
+      auto add_release =
+          ReleaseCounter::PlusTransformPromiseNoOverflow(initial_countdown);
+      h.meta.Apply(add_release, &old_meta);
       // Correct for possible (but rare) overflow
       CorrectNearOverflow(old_meta, h.meta);
       // Insert detached instead (only if return handle needed)
@@ -274,8 +269,7 @@ inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
       // Mismatch.
       Unref(h, initial_countdown);
     }
-  } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
-                      ClockHandle::kStateInvisible)) {
+  } else if (UNLIKELY(old_meta.IsInvisible())) {
     // Pretend we never took the reference
     Unref(h, initial_countdown);
   } else {
@@ -287,25 +281,23 @@ inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
 }
 
 inline void FinishSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
-                             uint64_t initial_countdown, bool keep_ref) {
+                             uint32_t initial_countdown, bool keep_ref) {
   // Save data fields
   ClockHandleBasicData* h_alias = &h;
   *h_alias = proto;
 
   // Transition from "under construction" state to "visible" state
-  uint64_t new_meta = uint64_t{ClockHandle::kStateVisible}
-                      << ClockHandle::kStateShift;
+  SlotMeta new_meta;
+  new_meta.SetVisible();
 
   // Maybe with an outstanding reference
-  new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift;
-  new_meta |= (initial_countdown - keep_ref)
-              << ClockHandle::kReleaseCounterShift;
+  new_meta.SetAcquireCounter(initial_countdown);
+  new_meta.SetReleaseCounter(initial_countdown - (keep_ref ? 1 : 0));
 
 #ifndef NDEBUG
   // Save the state transition, with assertion
-  uint64_t old_meta = h.meta.Exchange(new_meta);
-  assert(old_meta >> ClockHandle::kStateShift ==
-         ClockHandle::kStateConstruction);
+  auto old_meta = h.meta.Exchange(new_meta);
+  assert(old_meta.IsUnderConstruction());
 #else
   // Save the state transition
   h.meta.Store(new_meta);
@@ -313,7 +305,7 @@ inline void FinishSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
 }
 
 bool TryInsert(const ClockHandleBasicData& proto, ClockHandle& h,
-               uint64_t initial_countdown, bool keep_ref,
+               uint32_t initial_countdown, bool keep_ref,
                bool* already_matches) {
   bool b = BeginSlotInsert(proto, h, initial_countdown, already_matches);
   if (b) {
@@ -327,50 +319,40 @@ template <class HandleImpl, class Func>
 void ConstApplyToEntriesRange(const Func& func, const HandleImpl* begin,
                               const HandleImpl* end,
                               bool apply_if_will_be_deleted) {
-  uint64_t check_state_mask = ClockHandle::kStateShareableBit;
-  if (!apply_if_will_be_deleted) {
-    check_state_mask |= ClockHandle::kStateVisibleBit;
-  }
-
   for (const HandleImpl* h = begin; h < end; ++h) {
     // Note: to avoid using compare_exchange, we have to be extra careful.
-    uint64_t old_meta = h->meta.LoadRelaxed();
+    SlotMeta old_meta = h->meta.LoadRelaxed();
     // Check if it's an entry visible to lookups
-    if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
-      // Increment acquire counter. Note: it's possible that the entry has
-      // completely changed since we loaded old_meta, but incrementing acquire
-      // count is always safe. (Similar to optimistic Lookup here.)
-      old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement);
-      // Check whether we actually acquired a reference.
-      if ((old_meta >> ClockHandle::kStateShift) &
-          ClockHandle::kStateShareableBit) {
-        // Apply func if appropriate
-        if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
-          func(*h);
+    if (apply_if_will_be_deleted || old_meta.IsVisible()) {
+      if (old_meta.IsShareable()) {
+        // Increment acquire counter. Note: it's possible that the entry has
+        // completely changed since we loaded old_meta, but incrementing acquire
+        // count is always safe. (Similar to optimistic Lookup here.)
+        auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1);
+        h->meta.Apply(add_acquire, &old_meta);
+        // Check whether we actually acquired a reference.
+        if (old_meta.IsShareable()) {
+          // Apply func if appropriate
+          if (apply_if_will_be_deleted || old_meta.IsVisible()) {
+            func(*h);
+          }
+          // Pretend we never took the reference
+          Unref(*h);
+          // No net change, so don't need to check for overflow
+        } else {
+          // For other states, incrementing the acquire counter has no effect
+          // so we don't need to undo it. Furthermore, we cannot safely undo
+          // it because we did not acquire a read reference to lock the
+          // entry in a Shareable state.
         }
-        // Pretend we never took the reference
-        Unref(*h);
-        // No net change, so don't need to check for overflow
-      } else {
-        // For other states, incrementing the acquire counter has no effect
-        // so we don't need to undo it. Furthermore, we cannot safely undo
-        // it because we did not acquire a read reference to lock the
-        // entry in a Shareable state.
       }
     }
   }
 }
 
-constexpr uint32_t kStrictCapacityLimitBit = 1u << 31;
-
-uint32_t SanitizeEncodeEecAndScl(int eviction_effort_cap,
-                                 bool strict_capacit_limit) {
+uint32_t SanitizeEvictionEffortCap(int eviction_effort_cap) {
   eviction_effort_cap = std::max(int{1}, eviction_effort_cap);
-  eviction_effort_cap =
-      std::min(static_cast<int>(~kStrictCapacityLimitBit), eviction_effort_cap);
-  uint32_t eec_and_scl = static_cast<uint32_t>(eviction_effort_cap);
-  eec_and_scl |= strict_capacit_limit ? kStrictCapacityLimitBit : 0;
-  return eec_and_scl;
+  return static_cast<uint32_t>(eviction_effort_cap);
 }
 
 }  // namespace
@@ -381,6 +363,22 @@ void ClockHandleBasicData::FreeData(MemoryAllocator* allocator) const {
   }
 }
 
+BaseClockTable::BaseClockTable(size_t capacity, bool strict_capacity_limit,
+                               int eviction_effort_cap,
+                               CacheMetadataChargePolicy metadata_charge_policy,
+                               MemoryAllocator* allocator,
+                               const Cache::EvictionCallback* eviction_callback,
+                               const uint32_t* hash_seed)
+    : capacity_(capacity),
+      eec_and_scl_(EecAndScl{}
+                       .With<EvictionEffortCap>(
+                           SanitizeEvictionEffortCap(eviction_effort_cap))
+                       .With<StrictCapacityLimit>(strict_capacity_limit)),
+      metadata_charge_policy_(metadata_charge_policy),
+      allocator_(allocator),
+      eviction_callback_(*eviction_callback),
+      hash_seed_(*hash_seed) {}
+
 template <class HandleImpl>
 HandleImpl* BaseClockTable::StandaloneInsert(
     const ClockHandleBasicData& proto) {
@@ -391,9 +389,9 @@ HandleImpl* BaseClockTable::StandaloneInsert(
   h->SetStandalone();
   // Single reference (standalone entries only created if returning a refed
   // Handle back to user)
-  uint64_t meta = uint64_t{ClockHandle::kStateInvisible}
-                  << ClockHandle::kStateShift;
-  meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift;
+  SlotMeta meta;
+  meta.SetInvisible();
+  meta.SetAcquireCounter(1);
   h->meta.Store(meta);
   // Keep track of how much of usage is standalone
   standalone_usage_.FetchAddRelaxed(proto.GetTotalCharge());
@@ -402,8 +400,7 @@ HandleImpl* BaseClockTable::StandaloneInsert(
 
 template <class Table>
 typename Table::HandleImpl* BaseClockTable::CreateStandalone(
-    ClockHandleBasicData& proto, size_t capacity, uint32_t eec_and_scl,
-    bool allow_uncharged) {
+    ClockHandleBasicData& proto, bool allow_uncharged) {
   Table& derived = static_cast<Table&>(*this);
   typename Table::InsertState state;
   derived.StartInsert(state);
@@ -412,10 +409,10 @@ typename Table::HandleImpl* BaseClockTable::CreateStandalone(
   // NOTE: we can use eec_and_scl as eviction_effort_cap below because
   // strict_capacity_limit=true is supposed to disable the limit on eviction
   // effort, and a large value effectively does that.
-  if (eec_and_scl & kStrictCapacityLimitBit) {
+  if (eec_and_scl_.LoadRelaxed().Get<StrictCapacityLimit>()) {
     Status s = ChargeUsageMaybeEvictStrict<Table>(
-        total_charge, capacity,
-        /*need_evict_for_occupancy=*/false, eec_and_scl, state);
+        total_charge,
+        /*need_evict_for_occupancy=*/false, state);
     if (!s.ok()) {
       if (allow_uncharged) {
         proto.total_charge = 0;
@@ -426,8 +423,8 @@ typename Table::HandleImpl* BaseClockTable::CreateStandalone(
   } else {
     // Case strict_capacity_limit == false
     bool success = ChargeUsageMaybeEvictNonStrict<Table>(
-        total_charge, capacity,
-        /*need_evict_for_occupancy=*/false, eec_and_scl, state);
+        total_charge,
+        /*need_evict_for_occupancy=*/false, state);
     if (!success) {
       // Force the issue
       usage_.FetchAddRelaxed(total_charge);
@@ -439,8 +436,9 @@ typename Table::HandleImpl* BaseClockTable::CreateStandalone(
 
 template <class Table>
 Status BaseClockTable::ChargeUsageMaybeEvictStrict(
-    size_t total_charge, size_t capacity, bool need_evict_for_occupancy,
-    uint32_t eviction_effort_cap, typename Table::InsertState& state) {
+    size_t total_charge, bool need_evict_for_occupancy,
+    typename Table::InsertState& state) {
+  const size_t capacity = capacity_.LoadRelaxed();
   if (total_charge > capacity) {
     return Status::MemoryLimit(
         "Cache entry too large for a single cache shard: " +
@@ -465,8 +463,7 @@ Status BaseClockTable::ChargeUsageMaybeEvictStrict(
   }
   if (request_evict_charge > 0) {
     EvictionData data;
-    static_cast<Table*>(this)->Evict(request_evict_charge, state, &data,
-                                     eviction_effort_cap);
+    static_cast<Table*>(this)->Evict(request_evict_charge, state, &data);
     occupancy_.FetchSub(data.freed_count);
     if (LIKELY(data.freed_charge > need_evict_charge)) {
       assert(data.freed_count > 0);
@@ -495,8 +492,8 @@ Status BaseClockTable::ChargeUsageMaybeEvictStrict(
 
 template <class Table>
 inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict(
-    size_t total_charge, size_t capacity, bool need_evict_for_occupancy,
-    uint32_t eviction_effort_cap, typename Table::InsertState& state) {
+    size_t total_charge, bool need_evict_for_occupancy,
+    typename Table::InsertState& state) {
   // For simplicity, we consider that either the cache can accept the insert
   // with no evictions, or we must evict enough to make (at least) enough
   // space. It could lead to unnecessary failures or excessive evictions in
@@ -506,7 +503,8 @@ inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict(
   // charge. Thus, we should evict some extra if it's not a signifcant
   // portion of the shard capacity. This can have the side benefit of
   // involving fewer threads in eviction.
-  size_t old_usage = usage_.LoadRelaxed();
+  const size_t old_usage = usage_.LoadRelaxed();
+  const size_t capacity = capacity_.LoadRelaxed();
   size_t need_evict_charge;
   // NOTE: if total_charge > old_usage, there isn't yet enough to evict
   // `total_charge` amount. Even if we only try to evict `old_usage` amount,
@@ -532,8 +530,7 @@ inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict(
   }
   EvictionData data;
   if (need_evict_charge > 0) {
-    static_cast<Table*>(this)->Evict(need_evict_charge, state, &data,
-                                     eviction_effort_cap);
+    static_cast<Table*>(this)->Evict(need_evict_charge, state, &data);
     // Deal with potential occupancy deficit
     if (UNLIKELY(need_evict_for_occupancy) && data.freed_count == 0) {
       assert(data.freed_charge == 0);
@@ -557,11 +554,10 @@ void BaseClockTable::TrackAndReleaseEvictedEntry(ClockHandle* h) {
   if (eviction_callback_) {
     // For key reconstructed from hash
     UniqueId64x2 unhashed;
-    took_value_ownership =
-        eviction_callback_(ClockCacheShard<FixedHyperClockTable>::ReverseHash(
-                               h->GetHash(), &unhashed, hash_seed_),
-                           static_cast<Cache::Handle*>(h),
-                           h->meta.LoadRelaxed() & ClockHandle::kHitBitMask);
+    took_value_ownership = eviction_callback_(
+        ClockCacheShard<FixedHyperClockTable>::ReverseHash(
+            h->GetHash(), &unhashed, hash_seed_),
+        static_cast<Cache::Handle*>(h), h->meta.LoadRelaxed().GetHit());
   }
   if (!took_value_ownership) {
     h->FreeData(allocator_);
@@ -569,8 +565,10 @@ void BaseClockTable::TrackAndReleaseEvictedEntry(ClockHandle* h) {
   MarkEmpty(*h);
 }
 
-bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data,
-                              uint32_t eviction_effort_cap) {
+bool BaseClockTable::IsEvictionEffortExceeded(
+    const BaseClockTable::EvictionData& data) const {
+  auto eviction_effort_cap =
+      eec_and_scl_.LoadRelaxed().GetEffectiveEvictionEffortCap();
   // Basically checks whether the ratio of useful effort to wasted effort is
   // too low, with a start-up allowance for wasted effort before any useful
   // effort.
@@ -581,8 +579,7 @@ bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data,
 template <class Table>
 Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
                               typename Table::HandleImpl** handle,
-                              Cache::Priority priority, size_t capacity,
-                              uint32_t eec_and_scl) {
+                              Cache::Priority priority) {
   using HandleImpl = typename Table::HandleImpl;
   Table& derived = static_cast<Table&>(*this);
 
@@ -603,9 +600,9 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
   // NOTE: we can use eec_and_scl as eviction_effort_cap below because
   // strict_capacity_limit=true is supposed to disable the limit on eviction
   // effort, and a large value effectively does that.
-  if (eec_and_scl & kStrictCapacityLimitBit) {
+  if (eec_and_scl_.LoadRelaxed().Get<StrictCapacityLimit>()) {
     Status s = ChargeUsageMaybeEvictStrict<Table>(
-        total_charge, capacity, need_evict_for_occupancy, eec_and_scl, state);
+        total_charge, need_evict_for_occupancy, state);
     if (!s.ok()) {
       // Revert occupancy
       occupancy_.FetchSubRelaxed(1);
@@ -614,7 +611,7 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
   } else {
     // Case strict_capacity_limit == false
     bool success = ChargeUsageMaybeEvictNonStrict<Table>(
-        total_charge, capacity, need_evict_for_occupancy, eec_and_scl, state);
+        total_charge, need_evict_for_occupancy, state);
     if (!success) {
       // Revert occupancy
       occupancy_.FetchSubRelaxed(1);
@@ -640,7 +637,7 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
     // * Have to insert into a suboptimal location (more probes) so that the
     // old entry can be kept around as well.
 
-    uint64_t initial_countdown = GetInitialCountdown(priority);
+    uint32_t initial_countdown = GetInitialCountdown(priority);
     assert(initial_countdown > 0);
 
     HandleImpl* e =
@@ -685,44 +682,46 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
 
 void BaseClockTable::Ref(ClockHandle& h) {
   // Increment acquire counter
-  uint64_t old_meta = h.meta.FetchAdd(ClockHandle::kAcquireIncrement);
+  SlotMeta old_meta;
+  h.meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(1), &old_meta);
 
-  assert((old_meta >> ClockHandle::kStateShift) &
-         ClockHandle::kStateShareableBit);
+  assert(old_meta.IsShareable());
   // Must have already had a reference
-  assert(GetRefcount(old_meta) > 0);
+  assert(old_meta.GetRefcount() > 0);
   (void)old_meta;
 }
 
 #ifndef NDEBUG
-void BaseClockTable::TEST_RefN(ClockHandle& h, size_t n) {
+void BaseClockTable::TEST_RefN(ClockHandle& h, uint32_t n) {
   // Increment acquire counter
-  uint64_t old_meta = h.meta.FetchAdd(n * ClockHandle::kAcquireIncrement);
+  SlotMeta old_meta;
+  h.meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(n), &old_meta);
 
-  assert((old_meta >> ClockHandle::kStateShift) &
-         ClockHandle::kStateShareableBit);
+  assert(old_meta.IsShareable());
   (void)old_meta;
 }
 
-void BaseClockTable::TEST_ReleaseNMinus1(ClockHandle* h, size_t n) {
+void BaseClockTable::TEST_ReleaseNMinus1(ClockHandle* h, uint32_t n) {
   assert(n > 0);
 
   // Like n-1 Releases, but assumes one more will happen in the caller to take
   // care of anything like erasing an unreferenced, invisible entry.
-  uint64_t old_meta =
-      h->meta.FetchAdd((n - 1) * ClockHandle::kReleaseIncrement);
-  assert((old_meta >> ClockHandle::kStateShift) &
-         ClockHandle::kStateShareableBit);
+  SlotMeta old_meta;
+  h->meta.Apply(ReleaseCounter::PlusTransformPromiseNoOverflow(n - 1),
+                &old_meta);
+  assert(old_meta.IsShareable());
   (void)old_meta;
 }
 #endif
 
 FixedHyperClockTable::FixedHyperClockTable(
-    size_t capacity, CacheMetadataChargePolicy metadata_charge_policy,
+    size_t capacity, bool strict_capacity_limit,
+    CacheMetadataChargePolicy metadata_charge_policy,
     MemoryAllocator* allocator,
     const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
     const Opts& opts)
-    : BaseClockTable(metadata_charge_policy, allocator, eviction_callback,
+    : BaseClockTable(capacity, strict_capacity_limit, opts.eviction_effort_cap,
+                     metadata_charge_policy, allocator, eviction_callback,
                      hash_seed),
       length_bits_(CalcHashBits(capacity, opts.estimated_value_size,
                                 metadata_charge_policy)),
@@ -744,23 +743,20 @@ FixedHyperClockTable::~FixedHyperClockTable() {
   // in the table.
   for (size_t i = 0; i < GetTableSize(); i++) {
     HandleImpl& h = array_[i];
-    switch (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) {
-      case ClockHandle::kStateEmpty:
-        // noop
-        break;
-      case ClockHandle::kStateInvisible:  // rare but possible
-      case ClockHandle::kStateVisible:
-        assert(GetRefcount(h.meta.LoadRelaxed()) == 0);
-        h.FreeData(allocator_);
+    SlotMeta meta = h.meta.LoadRelaxed();
+    if (meta.IsShareable()) {
+      // NOTE: Reaching here invisible is rare but possible
+      assert(meta.GetRefcount() == 0);
+      h.FreeData(allocator_);
 #ifndef NDEBUG
-        Rollback(h.hashed_key, &h);
-        ReclaimEntryUsage(h.GetTotalCharge());
+      Rollback(h.hashed_key, &h);
+      ReclaimEntryUsage(h.GetTotalCharge());
 #endif
-        break;
-      // otherwise
-      default:
-        assert(false);
-        break;
+    } else {
+      // Should be no transient "under construction" states unless a thread
+      // was killed or we are being destructed while another thread is still
+      // operating on the structure
+      assert(meta.IsEmpty());
     }
   }
 
@@ -782,7 +778,7 @@ bool FixedHyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) {
 }
 
 FixedHyperClockTable::HandleImpl* FixedHyperClockTable::DoInsert(
-    const ClockHandleBasicData& proto, uint64_t initial_countdown,
+    const ClockHandleBasicData& proto, uint32_t initial_countdown,
     bool keep_ref, InsertState&) {
   bool already_matches = false;
   HandleImpl* e = FindSlot(
@@ -833,47 +829,46 @@ FixedHyperClockTable::HandleImpl* FixedHyperClockTable::Lookup(
   HandleImpl* e = FindSlot(
       hashed_key,
       [&](HandleImpl* h) {
+        SlotMeta old_meta;
         // Mostly branch-free version (similar performance)
         /*
-        uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement,
-                                     std::memory_order_acquire);
-        bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U;
-        bool visible = (old_meta >> ClockHandle::kStateShift) & 1U;
-        bool match = (h->key == key) & visible;
-        h->meta.FetchSub(static_cast<uint64_t>(Shareable & !match) <<
-        ClockHandle::kAcquireCounterShift); return
-        match;
+        h->meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(1),
+                      &old_meta);
+        bool shareable = old_meta.IsShareable();
+        bool visible = old_meta.IsVisible();
+        bool match = (h->hashed_key == hashed_key) & visible;
+        h->meta.Apply(AcquireCounter::MinusTransformPromiseNoUnderflow(
+            uint32_t{shareable} & uint32_t{!match}));
+        h->meta.Apply(SlotMeta::HitFlag::Or(match));
+        return match;
         */
         // Optimistic lookup should pay off when the table is relatively
         // sparse.
         constexpr bool kOptimisticLookup = true;
-        uint64_t old_meta;
         if (!kOptimisticLookup) {
           old_meta = h->meta.Load();
-          if ((old_meta >> ClockHandle::kStateShift) !=
-              ClockHandle::kStateVisible) {
+          if (!old_meta.IsVisible()) {
             return false;
           }
         }
         // (Optimistically) increment acquire counter
-        old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement);
+        h->meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(1),
+                      &old_meta);
         // Check if it's an entry visible to lookups
-        if ((old_meta >> ClockHandle::kStateShift) ==
-            ClockHandle::kStateVisible) {
+        if (old_meta.IsVisible()) {
           // Acquired a read reference
           if (h->hashed_key == hashed_key) {
             // Match
             // Update the hit bit
             if (eviction_callback_) {
-              h->meta.FetchOrRelaxed(uint64_t{1} << ClockHandle::kHitBitShift);
+              h->meta.ApplyRelaxed(SlotMeta::HitFlag::SetTransform());
             }
             return true;
           } else {
             // Mismatch. Pretend we never took the reference
             Unref(*h);
           }
-        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
-                            ClockHandle::kStateInvisible)) {
+        } else if (UNLIKELY(old_meta.IsInvisible())) {
           // Pretend we never took the reference
           Unref(*h);
         } else {
@@ -897,53 +892,49 @@ bool FixedHyperClockTable::Release(HandleImpl* h, bool useful,
   // is only freed up by EvictFromClock (called by Insert when space is needed)
   // and Erase. We do this to avoid an extra atomic read of the variable usage_.
 
-  uint64_t old_meta;
+  SlotMeta old_meta;
   if (useful) {
     // Increment release counter to indicate was used
-    old_meta = h->meta.FetchAdd(ClockHandle::kReleaseIncrement);
+    auto add_release = ReleaseCounter::PlusTransformPromiseNoOverflow(1);
+    h->meta.Apply(add_release, &old_meta);
   } else {
     // Decrement acquire counter to pretend it never happened
-    old_meta = h->meta.FetchSub(ClockHandle::kAcquireIncrement);
+    auto sub_acquire = AcquireCounter::MinusTransformPromiseNoUnderflow(1);
+    h->meta.Apply(sub_acquire, &old_meta);
   }
 
-  assert((old_meta >> ClockHandle::kStateShift) &
-         ClockHandle::kStateShareableBit);
+  assert(old_meta.IsShareable());
   // No underflow
-  assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
-          ClockHandle::kCounterMask) !=
-         ((old_meta >> ClockHandle::kReleaseCounterShift) &
-          ClockHandle::kCounterMask));
+  assert(old_meta.GetAcquireCounter() != old_meta.GetReleaseCounter());
 
-  if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
-                                    ClockHandle::kStateInvisible)) {
+  if (erase_if_last_ref || UNLIKELY(old_meta.IsInvisible())) {
     // FIXME: There's a chance here that another thread could replace this
     // entry and we end up erasing the wrong one.
 
-    // Update for last FetchAdd op
+    // Update for last Apply op
     if (useful) {
-      old_meta += ClockHandle::kReleaseIncrement;
+      old_meta.SetReleaseCounter(old_meta.GetReleaseCounter() + 1);
     } else {
-      old_meta -= ClockHandle::kAcquireIncrement;
+      old_meta.SetAcquireCounter(old_meta.GetAcquireCounter() - 1);
     }
     // Take ownership if no refs
+    SlotMeta construction_meta;
+    construction_meta.SetUnderConstruction();
     do {
-      if (GetRefcount(old_meta) != 0) {
+      if (old_meta.GetRefcount() != 0) {
         // Not last ref at some point in time during this Release call
         // Correct for possible (but rare) overflow
         CorrectNearOverflow(old_meta, h->meta);
         return false;
       }
-      if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
-                       << ClockHandle::kStateShift)) == 0) {
+      if (!old_meta.IsShareable()) {
         // Someone else took ownership
         return false;
       }
       // Note that there's a small chance that we release, another thread
       // replaces this entry with another, reaches zero refs, and then we end
       // up erasing that other entry. That's an acceptable risk / imprecision.
-    } while (
-        !h->meta.CasWeak(old_meta, uint64_t{ClockHandle::kStateConstruction}
-                                       << ClockHandle::kStateShift));
+    } while (!h->meta.CasWeak(old_meta, construction_meta));
     // Took ownership
     size_t total_charge = h->GetTotalCharge();
     if (UNLIKELY(h->IsStandalone())) {
@@ -966,7 +957,7 @@ bool FixedHyperClockTable::Release(HandleImpl* h, bool useful,
 }
 
 #ifndef NDEBUG
-void FixedHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
+void FixedHyperClockTable::TEST_ReleaseN(HandleImpl* h, uint32_t n) {
   if (n > 0) {
     // Do n-1 simple releases first
     TEST_ReleaseNMinus1(h, n);
@@ -983,30 +974,29 @@ void FixedHyperClockTable::Erase(const UniqueId64x2& hashed_key) {
       [&](HandleImpl* h) {
         // Could be multiple entries in rare cases. Erase them all.
         // Optimistically increment acquire counter
-        uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement);
+        auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1);
+        SlotMeta old_meta, meta;
+        h->meta.Apply(add_acquire, &old_meta, &meta);
         // Check if it's an entry visible to lookups
-        if ((old_meta >> ClockHandle::kStateShift) ==
-            ClockHandle::kStateVisible) {
+        if (meta.IsVisible()) {
           // Acquired a read reference
           if (h->hashed_key == hashed_key) {
-            // Match. Set invisible.
-            old_meta =
-                h->meta.FetchAnd(~(uint64_t{ClockHandle::kStateVisibleBit}
-                                   << ClockHandle::kStateShift));
-            // Apply update to local copy
-            old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
-                          << ClockHandle::kStateShift);
+            // Match. Take ownership if no other refs, or set invisible other
+            // refs exist.
             for (;;) {
-              uint64_t refcount = GetRefcount(old_meta);
+              uint32_t refcount = meta.GetRefcount();
               assert(refcount > 0);
               if (refcount > 1) {
                 // Not last ref at some point in time during this Erase call
-                // Pretend we never took the reference
+                // Set invisible
+                h->meta.Apply(SlotMeta::VisibleFlag::ClearTransform());
+                // And pretend we never took the reference
                 Unref(*h);
                 break;
-              } else if (h->meta.CasWeak(
-                             old_meta, uint64_t{ClockHandle::kStateConstruction}
-                                           << ClockHandle::kStateShift)) {
+              }
+              SlotMeta construction_meta;
+              construction_meta.SetUnderConstruction();
+              if (h->meta.CasWeak(meta, construction_meta)) {
                 // Took ownership
                 assert(hashed_key == h->hashed_key);
                 size_t total_charge = h->GetTotalCharge();
@@ -1022,8 +1012,7 @@ void FixedHyperClockTable::Erase(const UniqueId64x2& hashed_key) {
             // Mismatch. Pretend we never took the reference
             Unref(*h);
           }
-        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
-                            ClockHandle::kStateInvisible)) {
+        } else if (UNLIKELY(old_meta.IsInvisible())) {
           // Pretend we never took the reference
           Unref(*h);
         } else {
@@ -1040,17 +1029,17 @@ void FixedHyperClockTable::EraseUnRefEntries() {
   for (size_t i = 0; i <= this->length_bits_mask_; i++) {
     HandleImpl& h = array_[i];
 
-    uint64_t old_meta = h.meta.LoadRelaxed();
-    if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
-                    << ClockHandle::kStateShift) &&
-        GetRefcount(old_meta) == 0 &&
-        h.meta.CasStrong(old_meta, uint64_t{ClockHandle::kStateConstruction}
-                                       << ClockHandle::kStateShift)) {
-      // Took ownership
-      size_t total_charge = h.GetTotalCharge();
-      Rollback(h.hashed_key, &h);
-      FreeDataMarkEmpty(h, allocator_);
-      ReclaimEntryUsage(total_charge);
+    SlotMeta old_meta = h.meta.LoadRelaxed();
+    if (old_meta.IsShareable() && old_meta.GetRefcount() == 0) {
+      SlotMeta construction_meta;
+      construction_meta.SetUnderConstruction();
+      if (h.meta.CasStrong(old_meta, construction_meta)) {
+        // Took ownership
+        size_t total_charge = h.GetTotalCharge();
+        Rollback(h.hashed_key, &h);
+        FreeDataMarkEmpty(h, allocator_);
+        ReclaimEntryUsage(total_charge);
+      }
     }
   }
 }
@@ -1113,8 +1102,7 @@ inline void FixedHyperClockTable::ReclaimEntryUsage(size_t total_charge) {
 }
 
 inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&,
-                                        EvictionData* data,
-                                        uint32_t eviction_effort_cap) {
+                                        EvictionData* data) {
   // precondition
   assert(requested_charge > 0);
 
@@ -1149,7 +1137,7 @@ inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&,
     if (old_clock_pointer >= max_clock_pointer) {
       return;
     }
-    if (IsEvictionEffortExceeded(*data, eviction_effort_cap)) {
+    if (IsEvictionEffortExceeded(*data)) {
       eviction_effort_exceeded_count_.FetchAddRelaxed(1);
       return;
     }
@@ -1167,14 +1155,11 @@ ClockCacheShard<Table>::ClockCacheShard(
     const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
     const typename Table::Opts& opts)
     : CacheShardBase(metadata_charge_policy),
-      table_(capacity, metadata_charge_policy, allocator, eviction_callback,
-             hash_seed, opts),
-      capacity_(capacity),
-      eec_and_scl_(SanitizeEncodeEecAndScl(opts.eviction_effort_cap,
-                                           strict_capacity_limit)) {
+      table_(capacity, strict_capacity_limit, metadata_charge_policy, allocator,
+             eviction_callback, hash_seed, opts) {
   // Initial charge metadata should not exceed capacity
-  assert(table_.GetUsage() <= capacity_.LoadRelaxed() ||
-         capacity_.LoadRelaxed() < sizeof(HandleImpl));
+  assert(table_.GetUsage() <= table_.GetCapacity() ||
+         table_.GetCapacity() < sizeof(HandleImpl));
 }
 
 template <class Table>
@@ -1240,18 +1225,14 @@ int FixedHyperClockTable::CalcHashBits(
 
 template <class Table>
 void ClockCacheShard<Table>::SetCapacity(size_t capacity) {
-  capacity_.StoreRelaxed(capacity);
+  table_.SetCapacity(capacity);
   // next Insert will take care of any necessary evictions
 }
 
 template <class Table>
 void ClockCacheShard<Table>::SetStrictCapacityLimit(
     bool strict_capacity_limit) {
-  if (strict_capacity_limit) {
-    eec_and_scl_.FetchOrRelaxed(kStrictCapacityLimitBit);
-  } else {
-    eec_and_scl_.FetchAndRelaxed(~kStrictCapacityLimitBit);
-  }
+  table_.SetStrictCapacityLimit(strict_capacity_limit);
   // next Insert will take care of any necessary evictions
 }
 
@@ -1271,9 +1252,7 @@ Status ClockCacheShard<Table>::Insert(const Slice& key,
   proto.value = value;
   proto.helper = helper;
   proto.total_charge = charge;
-  return table_.template Insert<Table>(proto, handle, priority,
-                                       capacity_.LoadRelaxed(),
-                                       eec_and_scl_.LoadRelaxed());
+  return table_.template Insert<Table>(proto, handle, priority);
 }
 
 template <class Table>
@@ -1288,9 +1267,7 @@ typename Table::HandleImpl* ClockCacheShard<Table>::CreateStandalone(
   proto.value = obj;
   proto.helper = helper;
   proto.total_charge = charge;
-  return table_.template CreateStandalone<Table>(proto, capacity_.LoadRelaxed(),
-                                                 eec_and_scl_.LoadRelaxed(),
-                                                 allow_uncharged);
+  return table_.template CreateStandalone<Table>(proto, allow_uncharged);
 }
 
 template <class Table>
@@ -1322,12 +1299,12 @@ bool ClockCacheShard<Table>::Release(HandleImpl* handle, bool useful,
 
 #ifndef NDEBUG
 template <class Table>
-void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, size_t n) {
+void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, uint32_t n) {
   table_.TEST_RefN(*h, n);
 }
 
 template <class Table>
-void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) {
+void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, uint32_t n) {
   table_.TEST_ReleaseN(h, n);
 }
 #endif
@@ -1359,7 +1336,7 @@ size_t ClockCacheShard<Table>::GetStandaloneUsage() const {
 
 template <class Table>
 size_t ClockCacheShard<Table>::GetCapacity() const {
-  return capacity_.LoadRelaxed();
+  return table_.GetCapacity();
 }
 
 template <class Table>
@@ -1375,8 +1352,8 @@ size_t ClockCacheShard<Table>::GetPinnedUsage() const {
       metadata_charge_policy_ == kFullChargeCacheMetadata;
   ConstApplyToEntriesRange(
       [&table_pinned_usage, charge_metadata](const HandleImpl& h) {
-        uint64_t meta = h.meta.LoadRelaxed();
-        uint64_t refcount = GetRefcount(meta);
+        SlotMeta meta = h.meta.LoadRelaxed();
+        uint32_t refcount = meta.GetRefcount();
         // Holding one ref for ConstApplyToEntriesRange
         assert(refcount > 0);
         if (refcount > 1) {
@@ -1496,7 +1473,7 @@ void AddShardEvaluation(const FixedHyperClockCache::Shard& shard,
 }
 
 bool IsSlotOccupied(const ClockHandle& h) {
-  return (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) != 0;
+  return !h.meta.LoadRelaxed().IsEmpty();
 }
 }  // namespace
 
@@ -1727,10 +1704,13 @@ inline uint64_t UsedLengthToLengthInfo(size_t used_length) {
   return length_info;
 }
 
+// Avoid potential initialization order race with port::kPageSize
+constexpr size_t kPresumedPageSize = 4096;
+
 inline size_t GetStartingLength(size_t capacity) {
-  if (capacity > port::kPageSize) {
+  if (capacity > kPresumedPageSize) {
     // Start with one memory page
-    return port::kPageSize / sizeof(AutoHyperClockTable::HandleImpl);
+    return kPresumedPageSize / sizeof(AutoHyperClockTable::HandleImpl);
   } else {
     // Mostly to make unit tests happy
     return 4;
@@ -1751,26 +1731,6 @@ inline void GetHomeIndexAndShift(uint64_t length_info, uint64_t hash,
   assert(*home < LengthInfoToUsedLength(length_info));
 }
 
-inline int GetShiftFromNextWithShift(uint64_t next_with_shift) {
-  return BitwiseAnd(next_with_shift,
-                    AutoHyperClockTable::HandleImpl::kShiftMask);
-}
-
-inline size_t GetNextFromNextWithShift(uint64_t next_with_shift) {
-  return static_cast<size_t>(next_with_shift >>
-                             AutoHyperClockTable::HandleImpl::kNextShift);
-}
-
-inline uint64_t MakeNextWithShift(size_t next, int shift) {
-  return (uint64_t{next} << AutoHyperClockTable::HandleImpl::kNextShift) |
-         static_cast<uint64_t>(shift);
-}
-
-inline uint64_t MakeNextWithShiftEnd(size_t head, int shift) {
-  return AutoHyperClockTable::HandleImpl::kNextEndFlags |
-         MakeNextWithShift(head, shift);
-}
-
 // Helper function for Lookup
 inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h,
                         int shift = 0, size_t home = 0,
@@ -1778,12 +1738,12 @@ inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h,
   // Must be at least something to match
   assert(hashed_key || shift > 0);
 
-  uint64_t old_meta;
+  SlotMeta old_meta, new_meta;
   // (Optimistically) increment acquire counter.
-  old_meta = h.meta.FetchAdd(ClockHandle::kAcquireIncrement);
+  auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1);
+  h.meta.Apply(add_acquire, &old_meta, &new_meta);
   // Check if it's a referencable (sharable) entry
-  if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
-                   << ClockHandle::kStateShift)) == 0) {
+  if (!old_meta.IsShareable()) {
     // For non-sharable states, incrementing the acquire counter has no effect
     // so we don't need to undo it. Furthermore, we cannot safely undo
     // it because we did not acquire a read reference to lock the
@@ -1794,10 +1754,9 @@ inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h,
     return false;
   }
   // Else acquired a read reference
-  assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0);
+  assert(new_meta.GetRefcount() > 0);
   if (hashed_key && h.hashed_key == *hashed_key &&
-      LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit}
-                         << ClockHandle::kStateShift))) {
+      LIKELY(old_meta.IsVisible())) {
     // Match on full key, visible
     if (full_match_or_unknown) {
       *full_match_or_unknown = true;
@@ -1820,36 +1779,39 @@ inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h,
   }
 }
 
+using NextWithShift = AutoHyperClockTable::HandleImpl::NextWithShift;
+
 // Assumes a chain rewrite lock prevents concurrent modification of
 // these chain pointers
 void UpgradeShiftsOnRange(AutoHyperClockTable::HandleImpl* arr,
-                          size_t& frontier, uint64_t stop_before_or_new_tail,
-                          int old_shift, int new_shift) {
+                          size_t& frontier,
+                          NextWithShift stop_before_or_new_tail, int old_shift,
+                          int new_shift) {
   assert(frontier != SIZE_MAX);
   assert(new_shift == old_shift + 1);
   (void)old_shift;
   (void)new_shift;
-  using HandleImpl = AutoHyperClockTable::HandleImpl;
   for (;;) {
-    uint64_t next_with_shift = arr[frontier].chain_next_with_shift.Load();
-    assert(GetShiftFromNextWithShift(next_with_shift) == old_shift);
+    NextWithShift next_with_shift = arr[frontier].chain_next_with_shift.Load();
+    assert(next_with_shift.GetShift() == old_shift);
     if (next_with_shift == stop_before_or_new_tail) {
       // Stopping at entry with pointer matching "stop before"
-      assert(!HandleImpl::IsEnd(next_with_shift));
+      assert(!next_with_shift.IsEnd());
       return;
     }
-    if (HandleImpl::IsEnd(next_with_shift)) {
+    if (next_with_shift.IsEnd()) {
       // Also update tail to new tail
-      assert(HandleImpl::IsEnd(stop_before_or_new_tail));
+      assert(stop_before_or_new_tail.IsEnd());
       arr[frontier].chain_next_with_shift.Store(stop_before_or_new_tail);
       // Mark nothing left to upgrade
       frontier = SIZE_MAX;
       return;
     }
     // Next is another entry to process, so upgrade and advance frontier
-    arr[frontier].chain_next_with_shift.FetchAdd(1U);
-    assert(GetShiftFromNextWithShift(next_with_shift + 1) == new_shift);
-    frontier = GetNextFromNextWithShift(next_with_shift);
+    arr[frontier].chain_next_with_shift.Apply(
+        NextWithShift::Shift::PlusTransformPromiseNoOverflow(1U));
+    assert(next_with_shift.GetShift() + 1 == new_shift);
+    frontier = next_with_shift.GetNext();
   }
 }
 
@@ -1887,19 +1849,19 @@ class AutoHyperClockTable::ChainRewriteLock {
   // RAII wrap existing lock held (or end)
   explicit ChainRewriteLock(HandleImpl* h,
                             RelaxedAtomic<uint64_t>& /*yield_count*/,
-                            uint64_t already_locked_or_end)
+                            NextWithShift already_locked_or_end)
       : head_ptr_(&h->head_next_with_shift) {
     saved_head_ = already_locked_or_end;
     // already locked or end
-    assert(saved_head_ & HandleImpl::kHeadLocked);
+    assert(saved_head_.IsLocked());
   }
 
   ~ChainRewriteLock() {
     if (!IsEnd()) {
       // Release lock
-      uint64_t old = head_ptr_->FetchAnd(~HandleImpl::kHeadLocked);
-      (void)old;
-      assert((old & HandleImpl::kNextEndFlags) == HandleImpl::kHeadLocked);
+      NextWithShift old;
+      head_ptr_->Apply(NextWithShift::LockedFlag::ClearTransform(), &old);
+      assert(old.IsLockedNotEnd());
     }
   }
 
@@ -1909,12 +1871,13 @@ class AutoHyperClockTable::ChainRewriteLock {
   }
 
   // Expected current state, assuming no parallel updates.
-  uint64_t GetSavedHead() const { return saved_head_; }
+  NextWithShift GetSavedHead() const { return saved_head_; }
 
-  bool CasUpdate(uint64_t next_with_shift,
+  bool CasUpdate(NextWithShift next_with_shift,
                  RelaxedAtomic<uint64_t>& yield_count) {
-    uint64_t new_head = next_with_shift | HandleImpl::kHeadLocked;
-    uint64_t expected = GetSavedHead();
+    NextWithShift new_head =
+        next_with_shift.With<NextWithShift::LockedFlag>(true);
+    NextWithShift expected = GetSavedHead();
     bool success = head_ptr_->CasStrong(expected, new_head);
     if (success) {
       // Ensure IsEnd() is kept up-to-date, including for dtor
@@ -1923,7 +1886,7 @@ class AutoHyperClockTable::ChainRewriteLock {
       // Parallel update to head, such as Insert()
       if (IsEnd()) {
         // Didn't previously hold a lock
-        if (HandleImpl::IsEnd(expected)) {
+        if (expected.IsEnd()) {
           // Still don't need to
           saved_head_ = expected;
         } else {
@@ -1932,28 +1895,25 @@ class AutoHyperClockTable::ChainRewriteLock {
         }
       } else {
         // Parallel update must preserve our lock
-        assert((expected & HandleImpl::kNextEndFlags) ==
-               HandleImpl::kHeadLocked);
+        assert(expected.IsLockedNotEnd());
         saved_head_ = expected;
       }
     }
     return success;
   }
 
-  bool IsEnd() const { return HandleImpl::IsEnd(saved_head_); }
+  bool IsEnd() const { return saved_head_.IsEnd(); }
 
  private:
   void Acquire(RelaxedAtomic<uint64_t>& yield_count) {
     for (;;) {
       // Acquire removal lock on the chain
-      uint64_t old_head = head_ptr_->FetchOr(HandleImpl::kHeadLocked);
-      if ((old_head & HandleImpl::kNextEndFlags) != HandleImpl::kHeadLocked) {
+      NextWithShift old_head;
+      head_ptr_->Apply(NextWithShift::LockedFlag::SetTransform(), &old_head,
+                       &saved_head_);
+      if (!old_head.IsLockedNotEnd()) {
         // Either acquired the lock or lock not needed (end)
-        assert((old_head & HandleImpl::kNextEndFlags) == 0 ||
-               (old_head & HandleImpl::kNextEndFlags) ==
-                   HandleImpl::kNextEndFlags);
-
-        saved_head_ = old_head | HandleImpl::kHeadLocked;
+        assert(old_head.IsEnd() == old_head.IsLocked());
         break;
       }
       // NOTE: one of the few yield-wait loops, which is rare enough in practice
@@ -1964,16 +1924,18 @@ class AutoHyperClockTable::ChainRewriteLock {
     }
   }
 
-  AcqRelAtomic<uint64_t>* head_ptr_;
-  uint64_t saved_head_;
+  BitFieldsAtomic<NextWithShift>* head_ptr_;
+  NextWithShift saved_head_;
 };
 
 AutoHyperClockTable::AutoHyperClockTable(
-    size_t capacity, CacheMetadataChargePolicy metadata_charge_policy,
+    size_t capacity, bool strict_capacity_limit,
+    CacheMetadataChargePolicy metadata_charge_policy,
     MemoryAllocator* allocator,
     const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
     const Opts& opts)
-    : BaseClockTable(metadata_charge_policy, allocator, eviction_callback,
+    : BaseClockTable(capacity, strict_capacity_limit, opts.eviction_effort_cap,
+                     metadata_charge_policy, allocator, eviction_callback,
                      hash_seed),
       array_(MemMapping::AllocateLazyZeroed(
           sizeof(HandleImpl) * CalcMaxUsableLength(capacity,
@@ -1985,6 +1947,11 @@ AutoHyperClockTable::AutoHyperClockTable(
       grow_frontier_(GetTableSize()),
       clock_pointer_mask_(
           BottomNBits(UINT64_MAX, LengthInfoToMinShift(length_info_.Load()))) {
+  if (array_.Get() == nullptr) {
+    fprintf(stderr,
+            "Anonymous mmap for RocksDB HyperClockCache failed. Aborting.\n");
+    std::terminate();
+  }
   if (metadata_charge_policy ==
       CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
     // NOTE: ignoring page boundaries for simplicity
@@ -2013,9 +1980,9 @@ AutoHyperClockTable::AutoHyperClockTable(
 #endif
     if (major + i < used_length) {
       array_[i].head_next_with_shift.StoreRelaxed(
-          MakeNextWithShiftEnd(i, max_shift));
+          NextWithShift::MakeEnd(i, max_shift));
       array_[major + i].head_next_with_shift.StoreRelaxed(
-          MakeNextWithShiftEnd(major + i, max_shift));
+          NextWithShift::MakeEnd(major + i, max_shift));
 #ifndef NDEBUG  // Extra invariant checking
       GetHomeIndexAndShift(length_info, i, &home, &shift);
       assert(home == i);
@@ -2026,7 +1993,7 @@ AutoHyperClockTable::AutoHyperClockTable(
 #endif
     } else {
       array_[i].head_next_with_shift.StoreRelaxed(
-          MakeNextWithShiftEnd(i, min_shift));
+          NextWithShift::MakeEnd(i, min_shift));
 #ifndef NDEBUG  // Extra invariant checking
       GetHomeIndexAndShift(length_info, i, &home, &shift);
       assert(home == i);
@@ -2052,52 +2019,54 @@ AutoHyperClockTable::~AutoHyperClockTable() {
              HandleImpl::kUnusedMarker) {
     used_end++;
   }
-#ifndef NDEBUG
-  for (size_t i = used_end; i < array_.Count(); i++) {
-    assert(array_[i].head_next_with_shift.LoadRelaxed() == 0);
-    assert(array_[i].chain_next_with_shift.LoadRelaxed() == 0);
-    assert(array_[i].meta.LoadRelaxed() == 0);
-  }
+  // This check can be extra expensive for a cache that is just created,
+  // maybe used for a small number of entries, as in a unit test, and then
+  // destroyed. Only do this in rare modes. REVISED: Don't scan the whole mmap,
+  // just a reasonable frontier past what we expect to have written.
+#ifdef MUST_FREE_HEAP_ALLOCATIONS
+  for (size_t i = used_end; i < array_.Count() && i < used_end + 64U; i++) {
+    assert(array_[i].head_next_with_shift.LoadRelaxed() ==
+           HandleImpl::kUnusedMarker);
+    assert(array_[i].chain_next_with_shift.LoadRelaxed() ==
+           HandleImpl::kUnusedMarker);
+    assert(array_[i].meta.LoadRelaxed() == SlotMeta{});
+  }
+#endif          // MUST_FREE_HEAP_ALLOCATIONS
+#ifndef NDEBUG  // Extra invariant checking
   std::vector<bool> was_populated(used_end);
   std::vector<bool> was_pointed_to(used_end);
-#endif
+#endif  // !NDEBUG
   for (size_t i = 0; i < used_end; i++) {
     HandleImpl& h = array_[i];
-    switch (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) {
-      case ClockHandle::kStateEmpty:
-        // noop
-        break;
-      case ClockHandle::kStateInvisible:  // rare but possible
-      case ClockHandle::kStateVisible:
-        assert(GetRefcount(h.meta.LoadRelaxed()) == 0);
-        h.FreeData(allocator_);
+    SlotMeta meta = h.meta.LoadRelaxed();
+    if (meta.IsShareable()) {
+      // NOTE: Reaching here invisible is rare but possible
+      assert(meta.GetRefcount() == 0);
+      h.FreeData(allocator_);
 #ifndef NDEBUG  // Extra invariant checking
-        usage_.FetchSubRelaxed(h.total_charge);
-        occupancy_.FetchSubRelaxed(1U);
-        was_populated[i] = true;
-        if (!HandleImpl::IsEnd(h.chain_next_with_shift.LoadRelaxed())) {
-          assert((h.chain_next_with_shift.LoadRelaxed() &
-                  HandleImpl::kHeadLocked) == 0);
-          size_t next =
-              GetNextFromNextWithShift(h.chain_next_with_shift.LoadRelaxed());
-          assert(!was_pointed_to[next]);
-          was_pointed_to[next] = true;
-        }
-#endif
-        break;
-      // otherwise
-      default:
-        assert(false);
-        break;
+      usage_.FetchSubRelaxed(h.total_charge);
+      occupancy_.FetchSubRelaxed(1U);
+      was_populated[i] = true;
+      if (!h.chain_next_with_shift.LoadRelaxed().IsEnd()) {
+        assert(!h.chain_next_with_shift.LoadRelaxed().IsLocked());
+        size_t next = h.chain_next_with_shift.LoadRelaxed().GetNext();
+        assert(!was_pointed_to[next]);
+        was_pointed_to[next] = true;
+      }
+#endif  // !NDEBUG
+    } else {
+      // Should be no transient "under construction" states unless a thread
+      // was killed or we are being destructed while another thread is still
+      // operating on the structure
+      assert(meta.IsEmpty());
     }
 #ifndef NDEBUG  // Extra invariant checking
-    if (!HandleImpl::IsEnd(h.head_next_with_shift.LoadRelaxed())) {
-      size_t next =
-          GetNextFromNextWithShift(h.head_next_with_shift.LoadRelaxed());
+    if (!h.head_next_with_shift.LoadRelaxed().IsEnd()) {
+      size_t next = h.head_next_with_shift.LoadRelaxed().GetNext();
       assert(!was_pointed_to[next]);
       was_pointed_to[next] = true;
     }
-#endif
+#endif  // !NDEBUG
   }
 #ifndef NDEBUG  // Extra invariant checking
   // This check is not perfect, but should detect most reasonable cases
@@ -2110,7 +2079,7 @@ AutoHyperClockTable::~AutoHyperClockTable() {
       assert(!was_pointed_to[i]);
     }
   }
-#endif
+#endif  // !NDEBUG
 
   // Metadata charging only follows the published table size
   assert(usage_.LoadRelaxed() == 0 ||
@@ -2208,10 +2177,10 @@ bool AutoHyperClockTable::Grow(InsertState& state) {
   // chain rewrite lock has been released.
   size_t old_old_home = BottomNBits(grow_home, old_shift - 1);
   for (;;) {
-    uint64_t old_old_head = array_[old_old_home].head_next_with_shift.Load();
-    if (GetShiftFromNextWithShift(old_old_head) >= old_shift) {
-      if ((old_old_head & HandleImpl::kNextEndFlags) !=
-          HandleImpl::kHeadLocked) {
+    NextWithShift old_old_head =
+        array_[old_old_home].head_next_with_shift.Load();
+    if (old_old_head.GetShift() >= old_shift) {
+      if (!old_old_head.IsLockedNotEnd()) {
         break;
       }
     }
@@ -2271,8 +2240,7 @@ void AutoHyperClockTable::CatchUpLengthInfoNoWait(
     if (published_usable_size < known_usable_grow_home) {
       int old_shift = FloorLog2(next_usable_size - 1);
       size_t old_home = BottomNBits(published_usable_size, old_shift);
-      int shift = GetShiftFromNextWithShift(
-          array_[old_home].head_next_with_shift.Load());
+      int shift = array_[old_home].head_next_with_shift.Load().GetShift();
       if (shift <= old_shift) {
         // Not ready
         break;
@@ -2423,9 +2391,10 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
   ChainRewriteLock zero_head_lock(&arr[old_home], yield_count_);
 
   // Used for locking the one chain below
-  uint64_t saved_one_head;
+  NextWithShift saved_one_head;
   // One head has not been written to
-  assert(arr[grow_home].head_next_with_shift.Load() == 0);
+  assert(arr[grow_home].head_next_with_shift.Load() ==
+         HandleImpl::kUnusedMarker);
 
   // old_home will also the head of the new "zero chain" -- all entries in the
   // "from" chain whose next hash bit is 0. grow_home will be head of the new
@@ -2447,7 +2416,7 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
     assert(cur == SIZE_MAX);
     assert(chain_frontier_first == -1);
 
-    uint64_t next_with_shift = zero_head_lock.GetSavedHead();
+    NextWithShift next_with_shift = zero_head_lock.GetSavedHead();
 
     // Find a single representative for each target chain, or scan the whole
     // chain if some target chain has no representative.
@@ -2460,16 +2429,16 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
       assert((cur == SIZE_MAX) == (zero_chain_frontier == SIZE_MAX &&
                                    one_chain_frontier == SIZE_MAX));
 
-      assert(GetShiftFromNextWithShift(next_with_shift) == old_shift);
+      assert(next_with_shift.GetShift() == old_shift);
 
       // Check for end of original chain
-      if (HandleImpl::IsEnd(next_with_shift)) {
+      if (next_with_shift.IsEnd()) {
         cur = SIZE_MAX;
         break;
       }
 
       // next_with_shift is not End
-      cur = GetNextFromNextWithShift(next_with_shift);
+      cur = next_with_shift.GetNext();
 
       if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) {
         // Entry for zero chain
@@ -2508,10 +2477,10 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
            (zero_chain_frontier == SIZE_MAX && one_chain_frontier == SIZE_MAX));
 
     // Always update one chain's head first (safe), and mark it as locked
-    saved_one_head = HandleImpl::kHeadLocked |
-                     (one_chain_frontier != SIZE_MAX
-                          ? MakeNextWithShift(one_chain_frontier, new_shift)
-                          : MakeNextWithShiftEnd(grow_home, new_shift));
+    saved_one_head = one_chain_frontier != SIZE_MAX
+                         ? NextWithShift::Make(one_chain_frontier, new_shift)
+                         : NextWithShift::MakeEnd(grow_home, new_shift);
+    saved_one_head.Set<NextWithShift::LockedFlag>(true);
     arr[grow_home].head_next_with_shift.Store(saved_one_head);
 
     // Make sure length_info_ hasn't been updated too early, as we're about
@@ -2521,8 +2490,8 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
     // Try to set zero's head.
     if (zero_head_lock.CasUpdate(
             zero_chain_frontier != SIZE_MAX
-                ? MakeNextWithShift(zero_chain_frontier, new_shift)
-                : MakeNextWithShiftEnd(old_home, new_shift),
+                ? NextWithShift::Make(zero_chain_frontier, new_shift)
+                : NextWithShift::MakeEnd(old_home, new_shift),
             yield_count_)) {
       // Both heads successfully updated to new shift
       break;
@@ -2556,10 +2525,10 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
     size_t& other_frontier = chain_frontier_first != 0
                                  ? /*&*/ zero_chain_frontier
                                  : /*&*/ one_chain_frontier;
-    uint64_t stop_before_or_new_tail =
+    NextWithShift stop_before_or_new_tail =
         other_frontier != SIZE_MAX
-            ? /*stop before*/ MakeNextWithShift(other_frontier, old_shift)
-            : /*new tail*/ MakeNextWithShiftEnd(
+            ? /*stop before*/ NextWithShift::Make(other_frontier, old_shift)
+            : /*new tail*/ NextWithShift::MakeEnd(
                   chain_frontier_first == 0 ? old_home : grow_home, new_shift);
     UpgradeShiftsOnRange(arr, first_frontier, stop_before_or_new_tail,
                          old_shift, new_shift);
@@ -2585,20 +2554,19 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
                                    ? /*&*/ zero_chain_frontier
                                    : /*&*/ one_chain_frontier;
       assert(cur != first_frontier);
-      assert(GetNextFromNextWithShift(
-                 arr[first_frontier].chain_next_with_shift.Load()) ==
+      assert(arr[first_frontier].chain_next_with_shift.Load().GetNext() ==
              other_frontier);
 
-      uint64_t next_with_shift = arr[cur].chain_next_with_shift.Load();
+      NextWithShift next_with_shift = arr[cur].chain_next_with_shift.Load();
 
       // Check for end of original chain
-      if (HandleImpl::IsEnd(next_with_shift)) {
+      if (next_with_shift.IsEnd()) {
         // Can set upgraded tail on first chain
-        uint64_t first_new_tail = MakeNextWithShiftEnd(
+        NextWithShift first_new_tail = NextWithShift::MakeEnd(
             chain_frontier_first == 0 ? old_home : grow_home, new_shift);
         arr[first_frontier].chain_next_with_shift.Store(first_new_tail);
         // And upgrade remainder of other chain
-        uint64_t other_new_tail = MakeNextWithShiftEnd(
+        NextWithShift other_new_tail = NextWithShift::MakeEnd(
             chain_frontier_first != 0 ? old_home : grow_home, new_shift);
         UpgradeShiftsOnRange(arr, other_frontier, other_new_tail, old_shift,
                              new_shift);
@@ -2607,7 +2575,7 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
       }
 
       // next_with_shift is not End
-      cur = GetNextFromNextWithShift(next_with_shift);
+      cur = next_with_shift.GetNext();
 
       int target_chain;
       if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) {
@@ -2620,7 +2588,7 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
       }
       if (target_chain == chain_frontier_first) {
         // Found next entry to skip to on the first chain
-        uint64_t skip_to = MakeNextWithShift(cur, new_shift);
+        NextWithShift skip_to = NextWithShift::Make(cur, new_shift);
         arr[first_frontier].chain_next_with_shift.Store(skip_to);
         first_frontier = cur;
         // Upgrade other chain up to entry before that one
@@ -2661,17 +2629,17 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data,
 
   HandleImpl* const arr = array_.Get();
 
-  uint64_t next_with_shift = rewrite_lock.GetSavedHead();
-  assert(!HandleImpl::IsEnd(next_with_shift));
-  int home_shift = GetShiftFromNextWithShift(next_with_shift);
+  NextWithShift next_with_shift = rewrite_lock.GetSavedHead();
+  assert(!next_with_shift.IsEnd());
+  int home_shift = next_with_shift.GetShift();
   (void)home;
   (void)home_shift;
-  size_t next = GetNextFromNextWithShift(next_with_shift);
+  size_t next = next_with_shift.GetNext();
   assert(next < array_.Count());
   HandleImpl* h = &arr[next];
   HandleImpl* prev_to_keep = nullptr;
 #ifndef NDEBUG
-  uint64_t prev_to_keep_next_with_shift = 0;
+  NextWithShift prev_to_keep_next_with_shift{};
 #endif
   // Whether there are entries between h and prev_to_keep that should be
   // purged from the chain.
@@ -2698,20 +2666,17 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data,
           op_data->push_back(h);
           // Entries for eviction become purgeable
           purgeable = true;
-          assert((h->meta.Load() >> ClockHandle::kStateShift) ==
-                 ClockHandle::kStateConstruction);
+          assert(h->meta.Load().IsUnderConstruction());
         }
       } else {
         (void)op_data;
         (void)data;
-        purgeable = ((h->meta.Load() >> ClockHandle::kStateShift) &
-                     ClockHandle::kStateShareableBit) == 0;
+        purgeable = !h->meta.Load().IsShareable();
       }
     }
 
     if (purgeable) {
-      assert((h->meta.Load() >> ClockHandle::kStateShift) ==
-             ClockHandle::kStateConstruction);
+      assert(h->meta.Load().IsUnderConstruction());
       pending_purge = true;
     } else if (pending_purge) {
       if (prev_to_keep) {
@@ -2729,13 +2694,13 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data,
         // update any new entries just inserted in parallel.
         // Can simply restart (GetSavedHead() already updated from CAS failure).
         next_with_shift = rewrite_lock.GetSavedHead();
-        assert(!HandleImpl::IsEnd(next_with_shift));
-        next = GetNextFromNextWithShift(next_with_shift);
+        assert(!next_with_shift.IsEnd());
+        next = next_with_shift.GetNext();
         assert(next < array_.Count());
         h = &arr[next];
         pending_purge = false;
         assert(prev_to_keep == nullptr);
-        assert(GetShiftFromNextWithShift(next_with_shift) == home_shift);
+        assert(next_with_shift.GetShift() == home_shift);
         continue;
       }
       pending_purge = false;
@@ -2757,13 +2722,13 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data,
     }
 #endif
 
-    assert(GetShiftFromNextWithShift(next_with_shift) == home_shift);
+    assert(next_with_shift.GetShift() == home_shift);
 
     // Check for end marker
-    if (HandleImpl::IsEnd(next_with_shift)) {
+    if (next_with_shift.IsEnd()) {
       h = nullptr;
     } else {
-      next = GetNextFromNextWithShift(next_with_shift);
+      next = next_with_shift.GetNext();
       assert(next < array_.Count());
       h = &arr[next];
       assert(h != prev_to_keep);
@@ -2835,7 +2800,7 @@ void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home,
     // Ensure we are at the correct home for the shift in effect for the
     // chain head.
     for (;;) {
-      int shift = GetShiftFromNextWithShift(rewrite_lock.GetSavedHead());
+      int shift = rewrite_lock.GetSavedHead().GetShift();
 
       if (shift > home_shift) {
         // Found a newer shift at candidate head, which must apply to us.
@@ -2871,7 +2836,7 @@ void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home,
 }
 
 AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert(
-    const ClockHandleBasicData& proto, uint64_t initial_countdown,
+    const ClockHandleBasicData& proto, uint32_t initial_countdown,
     bool take_ref, InsertState& state) {
   size_t home;
   int orig_home_shift;
@@ -3031,14 +2996,14 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert(
   }
 
   // Now insert into chain using head pointer
-  uint64_t next_with_shift;
+  NextWithShift next_with_shift;
   int home_shift = orig_home_shift;
 
   // Might need to retry
   for (int i = 0;; ++i) {
     CHECK_TOO_MANY_ITERATIONS(i);
     next_with_shift = arr[home].head_next_with_shift.Load();
-    int shift = GetShiftFromNextWithShift(next_with_shift);
+    int shift = next_with_shift.GetShift();
 
     if (UNLIKELY(shift != home_shift)) {
       // NOTE: shift increases with table growth
@@ -3065,15 +3030,14 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert(
     }
 
     // Values to update to
-    uint64_t head_next_with_shift = MakeNextWithShift(idx, home_shift);
-    uint64_t chain_next_with_shift = next_with_shift;
+    NextWithShift head_next_with_shift = NextWithShift::Make(idx, home_shift);
+    NextWithShift chain_next_with_shift = next_with_shift;
 
     // Preserve the locked state in head, without propagating to chain next
     // where it is meaningless (and not allowed)
-    if (UNLIKELY((next_with_shift & HandleImpl::kNextEndFlags) ==
-                 HandleImpl::kHeadLocked)) {
-      head_next_with_shift |= HandleImpl::kHeadLocked;
-      chain_next_with_shift &= ~HandleImpl::kHeadLocked;
+    if (UNLIKELY(next_with_shift.IsLockedNotEnd())) {
+      head_next_with_shift.Set<NextWithShift::LockedFlag>(true);
+      chain_next_with_shift.Set<NextWithShift::LockedFlag>(false);
     }
 
     arr[idx].chain_next_with_shift.Store(chain_next_with_shift);
@@ -3142,9 +3106,9 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
   // of a loop as possible.
 
   HandleImpl* const arr = array_.Get();
-  uint64_t next_with_shift = arr[home].head_next_with_shift.LoadRelaxed();
-  for (size_t i = 0; !HandleImpl::IsEnd(next_with_shift) && i < 10; ++i) {
-    HandleImpl* h = &arr[GetNextFromNextWithShift(next_with_shift)];
+  NextWithShift next_with_shift = arr[home].head_next_with_shift.LoadRelaxed();
+  for (size_t i = 0; !next_with_shift.IsEnd() && i < 10; ++i) {
+    HandleImpl* h = &arr[next_with_shift.IsEnd()];
     // Attempt cheap key match without acquiring a read ref. This could give a
     // false positive, which is re-checked after acquiring read ref, or false
     // negative, which is re-checked in the full Lookup. Also, this is a
@@ -3157,14 +3121,14 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
 #endif
     if (probably_equal) {
       // Increment acquire counter for definitive check
-      uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement);
+      auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1);
+      SlotMeta old_meta, new_meta;
+      h->meta.Apply(add_acquire, &old_meta, &new_meta);
       // Check if it's a referencable (sharable) entry
-      if (LIKELY(old_meta & (uint64_t{ClockHandle::kStateShareableBit}
-                             << ClockHandle::kStateShift))) {
-        assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0);
+      if (LIKELY(old_meta.IsShareable())) {
+        assert(new_meta.GetRefcount() > 0);
         if (LIKELY(h->hashed_key == hashed_key) &&
-            LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit}
-                               << ClockHandle::kStateShift))) {
+            LIKELY(old_meta.IsVisible())) {
           return h;
         } else {
           Unref(*h);
@@ -3189,7 +3153,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
     // Read head or chain pointer
     next_with_shift = h ? h->chain_next_with_shift.Load()
                         : arr[home].head_next_with_shift.Load();
-    int shift = GetShiftFromNextWithShift(next_with_shift);
+    int shift = next_with_shift.GetShift();
 
     // Make sure it's usable
     size_t effective_home = home;
@@ -3243,10 +3207,10 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
     }
 
     // Check for end marker
-    if (HandleImpl::IsEnd(next_with_shift)) {
+    if (next_with_shift.IsEnd()) {
       // To ensure we didn't miss anything in the chain, the end marker must
       // point back to the correct home.
-      if (LIKELY(GetNextFromNextWithShift(next_with_shift) == effective_home)) {
+      if (LIKELY(next_with_shift.GetNext() == effective_home)) {
         // Complete, clean iteration of the chain, not found.
         // Clean up.
         if (read_ref_on_chain) {
@@ -3262,7 +3226,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
     }
 
     // Follow the next and check for full key match, home match, or neither
-    h = &arr[GetNextFromNextWithShift(next_with_shift)];
+    h = &arr[next_with_shift.GetNext()];
     bool full_match_or_unknown = false;
     if (MatchAndRef(&hashed_key, *h, shift, effective_home,
                     &full_match_or_unknown)) {
@@ -3285,7 +3249,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
         }
         // Update the hit bit
         if (eviction_callback_) {
-          h->meta.FetchOrRelaxed(uint64_t{1} << ClockHandle::kHitBitShift);
+          h->meta.ApplyRelaxed(SlotMeta::HitFlag::SetTransform());
         }
         // All done.
         return h;
@@ -3325,8 +3289,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
 }
 
 void AutoHyperClockTable::Remove(HandleImpl* h) {
-  assert((h->meta.Load() >> ClockHandle::kStateShift) ==
-         ClockHandle::kStateConstruction);
+  assert(h->meta.Load().IsUnderConstruction());
 
   const HandleImpl& c_h = *h;
   PurgeImpl(&c_h.hashed_key);
@@ -3334,26 +3297,23 @@ void AutoHyperClockTable::Remove(HandleImpl* h) {
 
 bool AutoHyperClockTable::TryEraseHandle(HandleImpl* h, bool holding_ref,
                                          bool mark_invisible) {
-  uint64_t meta;
-  if (mark_invisible) {
-    // Set invisible
-    meta = h->meta.FetchAnd(
-        ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift));
-    // To local variable also
-    meta &=
-        ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift);
-  } else {
-    meta = h->meta.Load();
-  }
+  SlotMeta meta = h->meta.Load();
+  assert(!holding_ref || meta.IsShareable());
 
-  // Take ownership if no other refs
+  // Take ownership if no other refs, or set invisible if other refs exist (and
+  // mark_invisible is set).
+  SlotMeta construction_meta;
+  construction_meta.SetUnderConstruction();
   do {
-    if (GetRefcount(meta) != uint64_t{holding_ref}) {
+    if (meta.GetRefcount() != uint32_t{holding_ref}) {
       // Not last ref at some point in time during this call
+      if (mark_invisible) {
+        // Set invisible
+        h->meta.Apply(SlotMeta::VisibleFlag::ClearTransform());
+      }
       return false;
     }
-    if ((meta & (uint64_t{ClockHandle::kStateShareableBit}
-                 << ClockHandle::kStateShift)) == 0) {
+    if (!meta.IsShareable()) {
       // Someone else took ownership
       return false;
     }
@@ -3361,8 +3321,7 @@ bool AutoHyperClockTable::TryEraseHandle(HandleImpl* h, bool holding_ref,
     // another thread replaces this entry with another, reaches zero refs, and
     // then we end up erasing that other entry. That's an acceptable risk /
     // imprecision.
-  } while (!h->meta.CasWeak(meta, uint64_t{ClockHandle::kStateConstruction}
-                                      << ClockHandle::kStateShift));
+  } while (!h->meta.CasWeak(meta, construction_meta));
   // Took ownership
   // TODO? Delay freeing?
   h->FreeData(allocator_);
@@ -3389,27 +3348,24 @@ bool AutoHyperClockTable::Release(HandleImpl* h, bool useful,
   // is needed) and Erase. We do this to avoid an extra atomic read of the
   // variable usage_.
 
-  uint64_t old_meta;
+  SlotMeta old_meta;
   if (useful) {
     // Increment release counter to indicate was used
-    old_meta = h->meta.FetchAdd(ClockHandle::kReleaseIncrement);
+    auto add_release = ReleaseCounter::PlusTransformPromiseNoOverflow(1);
+    h->meta.Apply(add_release, &old_meta);
     // Correct for possible (but rare) overflow
     CorrectNearOverflow(old_meta, h->meta);
   } else {
     // Decrement acquire counter to pretend it never happened
-    old_meta = h->meta.FetchSub(ClockHandle::kAcquireIncrement);
+    auto sub_acquire = AcquireCounter::MinusTransformPromiseNoUnderflow(1);
+    h->meta.Apply(sub_acquire, &old_meta);
   }
 
-  assert((old_meta >> ClockHandle::kStateShift) &
-         ClockHandle::kStateShareableBit);
+  assert(old_meta.IsShareable());
   // No underflow
-  assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
-          ClockHandle::kCounterMask) !=
-         ((old_meta >> ClockHandle::kReleaseCounterShift) &
-          ClockHandle::kCounterMask));
+  assert(old_meta.GetAcquireCounter() != old_meta.GetReleaseCounter());
 
-  if ((erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
-                                     ClockHandle::kStateInvisible))) {
+  if ((erase_if_last_ref || UNLIKELY(old_meta.IsInvisible()))) {
     // FIXME: There's a chance here that another thread could replace this
     // entry and we end up erasing the wrong one.
     return TryEraseHandle(h, /*holding_ref=*/false, /*mark_invisible=*/false);
@@ -3419,7 +3375,7 @@ bool AutoHyperClockTable::Release(HandleImpl* h, bool useful,
 }
 
 #ifndef NDEBUG
-void AutoHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
+void AutoHyperClockTable::TEST_ReleaseN(HandleImpl* h, uint32_t n) {
   if (n > 0) {
     // Do n-1 simple releases first
     TEST_ReleaseNMinus1(h, n);
@@ -3449,27 +3405,26 @@ void AutoHyperClockTable::EraseUnRefEntries() {
   for (size_t i = 0; i < usable_size; i++) {
     HandleImpl& h = array_[i];
 
-    uint64_t old_meta = h.meta.LoadRelaxed();
-    if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
-                    << ClockHandle::kStateShift) &&
-        GetRefcount(old_meta) == 0 &&
-        h.meta.CasStrong(old_meta, uint64_t{ClockHandle::kStateConstruction}
-                                       << ClockHandle::kStateShift)) {
-      // Took ownership
-      h.FreeData(allocator_);
-      usage_.FetchSubRelaxed(h.total_charge);
-      // NOTE: could be more efficient with a dedicated variant of
-      // PurgeImpl, but this is not a common operation
-      Remove(&h);
-      MarkEmpty(h);
-      occupancy_.FetchSub(1U);
+    SlotMeta old_meta = h.meta.LoadRelaxed();
+    if (old_meta.IsShareable() && old_meta.GetRefcount() == 0) {
+      SlotMeta construction_meta;
+      construction_meta.SetUnderConstruction();
+      if (h.meta.CasStrong(old_meta, construction_meta)) {
+        // Took ownership
+        h.FreeData(allocator_);
+        usage_.FetchSubRelaxed(h.total_charge);
+        // NOTE: could be more efficient with a dedicated variant of
+        // PurgeImpl, but this is not a common operation
+        Remove(&h);
+        MarkEmpty(h);
+        occupancy_.FetchSub(1U);
+      }
     }
   }
 }
 
 void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state,
-                                EvictionData* data,
-                                uint32_t eviction_effort_cap) {
+                                EvictionData* data) {
   // precondition
   assert(requested_charge > 0);
 
@@ -3561,7 +3516,7 @@ void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state,
       return;
     }
 
-    if (IsEvictionEffortExceeded(*data, eviction_effort_cap)) {
+    if (IsEvictionEffortExceeded(*data)) {
       eviction_effort_exceeded_count_.FetchAddRelaxed(1);
       return;
     }
@@ -3579,7 +3534,7 @@ size_t AutoHyperClockTable::CalcMaxUsableLength(
   size_t num_slots =
       static_cast<size_t>(capacity / min_avg_slot_charge + 0.999999);
 
-  const size_t slots_per_page = port::kPageSize / sizeof(HandleImpl);
+  const size_t slots_per_page = kPresumedPageSize / sizeof(HandleImpl);
 
   // Round up to page size
   return ((num_slots + slots_per_page - 1) / slots_per_page) * slots_per_page;
@@ -3587,8 +3542,7 @@ size_t AutoHyperClockTable::CalcMaxUsableLength(
 
 namespace {
 bool IsHeadNonempty(const AutoHyperClockTable::HandleImpl& h) {
-  return !AutoHyperClockTable::HandleImpl::IsEnd(
-      h.head_next_with_shift.LoadRelaxed());
+  return !h.head_next_with_shift.LoadRelaxed().IsEnd();
 }
 bool IsEntryAtHome(const AutoHyperClockTable::HandleImpl& h, int shift,
                    size_t home) {
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index 2d5d0d9eef3c..efce8a69e352 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -9,8 +9,6 @@
 
 #pragma once
 
-#include <array>
-#include <atomic>
 #include <climits>
 #include <cstddef>
 #include <cstdint>
@@ -19,14 +17,10 @@
 
 #include "cache/cache_key.h"
 #include "cache/sharded_cache.h"
-#include "port/lang.h"
-#include "port/malloc.h"
 #include "port/mmap.h"
-#include "port/port.h"
 #include "rocksdb/cache.h"
-#include "rocksdb/secondary_cache.h"
 #include "util/atomic.h"
-#include "util/autovector.h"
+#include "util/bit_fields.h"
 #include "util/math.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -323,40 +317,89 @@ struct ClockHandle : public ClockHandleBasicData {
   // | acquire counter      | release counter     | hit bit | state marker |
   // -----------------------------------------------------------------------
 
-  // For reading or updating counters in meta word.
-  static constexpr uint8_t kCounterNumBits = 30;
-  static constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1;
-
-  static constexpr uint8_t kAcquireCounterShift = 0;
-  static constexpr uint64_t kAcquireIncrement = uint64_t{1}
-                                                << kAcquireCounterShift;
-  static constexpr uint8_t kReleaseCounterShift = kCounterNumBits;
-  static constexpr uint64_t kReleaseIncrement = uint64_t{1}
-                                                << kReleaseCounterShift;
-
-  // For setting the hit bit
-  static constexpr uint8_t kHitBitShift = 2U * kCounterNumBits;
-  static constexpr uint64_t kHitBitMask = uint64_t{1} << kHitBitShift;
-
-  // For reading or updating the state marker in meta word
-  static constexpr uint8_t kStateShift = kHitBitShift + 1;
-
-  // Bits contribution to state marker.
-  // Occupied means any state other than empty
-  static constexpr uint8_t kStateOccupiedBit = 0b100;
-  // Shareable means the entry is reference counted (visible or invisible)
-  // (only set if also occupied)
-  static constexpr uint8_t kStateShareableBit = 0b010;
-  // Visible is only set if also shareable
-  static constexpr uint8_t kStateVisibleBit = 0b001;
-
-  // Complete state markers (not shifted into full word)
-  static constexpr uint8_t kStateEmpty = 0b000;
-  static constexpr uint8_t kStateConstruction = kStateOccupiedBit;
-  static constexpr uint8_t kStateInvisible =
-      kStateOccupiedBit | kStateShareableBit;
-  static constexpr uint8_t kStateVisible =
-      kStateOccupiedBit | kStateShareableBit | kStateVisibleBit;
+  struct SlotMeta : public BitFields<uint64_t, SlotMeta> {
+    // For reading or updating counters in meta word.
+    static constexpr uint8_t kCounterNumBits = 30;
+    // Number of times the a reference has been acquired (or attempted)
+    // since last reset by eviction processing
+    using AcquireCounter =
+        UnsignedBitField<SlotMeta, kCounterNumBits, NoPrevBitField>;
+    // Number of times the a reference has been released (or attempted)
+    // since last reset by eviction processing
+    using ReleaseCounter =
+        UnsignedBitField<SlotMeta, kCounterNumBits, AcquireCounter>;
+    // Metadata bit in support of secondary cache
+    using HitFlag = BoolBitField<SlotMeta, ReleaseCounter>;
+    // Occupied means any state other than empty
+    using OccupiedFlag = BoolBitField<SlotMeta, HitFlag>;
+    // Shareable means the entry is reference counted (visible or invisible)
+    // (only set if also occupied)
+    using ShareableFlag = BoolBitField<SlotMeta, OccupiedFlag>;
+    // Visible is only set if also shareable (invisible can't be found by
+    // Lookup)
+    using VisibleFlag = BoolBitField<SlotMeta, ShareableFlag>;
+
+    // Convenience functions
+    uint32_t GetAcquireCounter() const { return Get<AcquireCounter>(); }
+    void SetAcquireCounter(uint32_t val) { Set<AcquireCounter>(val); }
+    uint32_t GetReleaseCounter() const { return Get<ReleaseCounter>(); }
+    void SetReleaseCounter(uint32_t val) { Set<ReleaseCounter>(val); }
+    uint32_t GetRefcount() const {
+      return Get<AcquireCounter>() - Get<ReleaseCounter>();
+    }
+    bool GetHit() const { return Get<HitFlag>(); }
+    void SetHit(bool val) { Set<HitFlag>(val); }
+
+    // Some distinct states for the various state flags
+    bool IsEmpty() const {
+      bool rv = !Get<OccupiedFlag>();
+      if (rv) {
+        assert(!Get<ShareableFlag>());
+        assert(!Get<VisibleFlag>());
+      }
+      return rv;
+    }
+
+    bool IsUnderConstruction() const {
+      bool rv = Get<OccupiedFlag>() && !Get<ShareableFlag>();
+      if (rv) {
+        assert(!Get<VisibleFlag>());
+      }
+      return rv;
+    }
+    void SetUnderConstruction() {
+      Set<OccupiedFlag>(true);
+      Set<ShareableFlag>(false);
+      Set<VisibleFlag>(false);
+    }
+
+    bool IsShareable() const { return Get<ShareableFlag>(); }
+    bool IsInvisible() const {
+      bool rv = Get<ShareableFlag>() && !Get<VisibleFlag>();
+      if (rv) {
+        assert(Get<OccupiedFlag>());
+      }
+      return rv;
+    }
+    void SetInvisible() {
+      Set<OccupiedFlag>(true);
+      Set<ShareableFlag>(true);
+      Set<VisibleFlag>(false);
+    }
+
+    bool IsVisible() const {
+      bool rv = Get<ShareableFlag>() && Get<VisibleFlag>();
+      if (rv) {
+        assert(Get<OccupiedFlag>());
+      }
+      return rv;
+    }
+    void SetVisible() {
+      Set<OccupiedFlag>(true);
+      Set<ShareableFlag>(true);
+      Set<VisibleFlag>(true);
+    }
+  };
 
   // Constants for initializing the countdown clock. (Countdown clock is only
   // in effect with zero refs, acquire counter == release counter, and in that
@@ -370,7 +413,7 @@ struct ClockHandle : public ClockHandleBasicData {
   // TODO: make these coundown values tuning parameters for eviction?
 
   // See above. Mutable for read reference counting.
-  mutable AcqRelAtomic<uint64_t> meta{};
+  mutable BitFieldsAtomic<SlotMeta> meta{};
 };  // struct ClockHandle
 
 class BaseClockTable {
@@ -383,25 +426,20 @@ class BaseClockTable {
     int eviction_effort_cap;
   };
 
-  BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy,
+  BaseClockTable(size_t capacity, bool strict_capacity_limit,
+                 int eviction_effort_cap,
+                 CacheMetadataChargePolicy metadata_charge_policy,
                  MemoryAllocator* allocator,
                  const Cache::EvictionCallback* eviction_callback,
-                 const uint32_t* hash_seed)
-      : metadata_charge_policy_(metadata_charge_policy),
-        allocator_(allocator),
-        eviction_callback_(*eviction_callback),
-        hash_seed_(*hash_seed) {}
+                 const uint32_t* hash_seed);
 
   template <class Table>
   typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto,
-                                               size_t capacity,
-                                               uint32_t eec_and_scl,
                                                bool allow_uncharged);
 
   template <class Table>
   Status Insert(const ClockHandleBasicData& proto,
-                typename Table::HandleImpl** handle, Cache::Priority priority,
-                size_t capacity, uint32_t eec_and_scl);
+                typename Table::HandleImpl** handle, Cache::Priority priority);
 
   void Ref(ClockHandle& handle);
 
@@ -411,6 +449,18 @@ class BaseClockTable {
 
   size_t GetStandaloneUsage() const { return standalone_usage_.LoadRelaxed(); }
 
+  size_t GetCapacity() const { return capacity_.LoadRelaxed(); }
+
+  void SetCapacity(size_t capacity) { capacity_.StoreRelaxed(capacity); }
+
+  void SetStrictCapacityLimit(bool strict_capacity_limit) {
+    if (strict_capacity_limit) {
+      eec_and_scl_.ApplyRelaxed(StrictCapacityLimit::SetTransform());
+    } else {
+      eec_and_scl_.ApplyRelaxed(StrictCapacityLimit::ClearTransform());
+    }
+  }
+
   uint32_t GetHashSeed() const { return hash_seed_; }
 
   uint64_t GetYieldCount() const { return yield_count_.LoadRelaxed(); }
@@ -427,11 +477,12 @@ class BaseClockTable {
 
   void TrackAndReleaseEvictedEntry(ClockHandle* h);
 
+  bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data) const;
 #ifndef NDEBUG
   // Acquire N references
-  void TEST_RefN(ClockHandle& handle, size_t n);
+  void TEST_RefN(ClockHandle& handle, uint32_t n);
   // Helper for TEST_ReleaseN
-  void TEST_ReleaseNMinus1(ClockHandle* handle, size_t n);
+  void TEST_ReleaseNMinus1(ClockHandle* handle, uint32_t n);
 #endif
 
  private:  // fns
@@ -448,9 +499,8 @@ class BaseClockTable {
   // required, and the operation should fail if not possible.
   // NOTE: Otherwise, occupancy_ is not managed in this function
   template <class Table>
-  Status ChargeUsageMaybeEvictStrict(size_t total_charge, size_t capacity,
+  Status ChargeUsageMaybeEvictStrict(size_t total_charge,
                                      bool need_evict_for_occupancy,
-                                     uint32_t eviction_effort_cap,
                                      typename Table::InsertState& state);
 
   // Helper for updating `usage_` for new entry with given `total_charge`
@@ -462,9 +512,8 @@ class BaseClockTable {
   // true, indicating success.
   // NOTE: occupancy_ is not managed in this function
   template <class Table>
-  bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity,
+  bool ChargeUsageMaybeEvictNonStrict(size_t total_charge,
                                       bool need_evict_for_occupancy,
-                                      uint32_t eviction_effort_cap,
                                       typename Table::InsertState& state);
 
  protected:  // data
@@ -489,13 +538,32 @@ class BaseClockTable {
   // TODO: is this separation needed if we don't do background evictions?
   ALIGN_AS(CACHE_LINE_SIZE)
   // Number of elements in the table.
-  AcqRelAtomic<size_t> occupancy_{};
+  Atomic<size_t> occupancy_{};
 
   // Memory usage by entries tracked by the cache (including standalone)
-  AcqRelAtomic<size_t> usage_{};
+  Atomic<size_t> usage_{};
 
   // Part of usage by standalone entries (not in table)
-  AcqRelAtomic<size_t> standalone_usage_{};
+  Atomic<size_t> standalone_usage_{};
+
+  // Maximum total charge of all elements stored in the table.
+  // (Relaxed: eventual consistency/update is OK)
+  RelaxedAtomic<size_t> capacity_;
+
+  // Encodes eviction_effort_cap (bottom 31 bits) and strict_capacity_limit
+  // (top bit). See HyperClockCacheOptions::eviction_effort_cap etc.
+  struct EecAndScl : public BitFields<uint32_t, EecAndScl> {
+    uint32_t GetEffectiveEvictionEffortCap() const {
+      // Because setting strict_capacity_limit is supposed to imply infinite
+      // cap on eviction effort, we can let the bit for strict_capacity_limit
+      // in the upper-most bit position to used as part of the effective cap.
+      return underlying;
+    }
+  };
+  using EvictionEffortCap = UnsignedBitField<EecAndScl, 31, NoPrevBitField>;
+  using StrictCapacityLimit = BoolBitField<EecAndScl, EvictionEffortCap>;
+  // (Relaxed: eventual consistency/update is OK)
+  RelaxedBitFieldsAtomic<EecAndScl> eec_and_scl_;
 
   ALIGN_AS(CACHE_LINE_SIZE)
   const CacheMetadataChargePolicy metadata_charge_policy_;
@@ -551,7 +619,7 @@ class FixedHyperClockTable : public BaseClockTable {
     size_t estimated_value_size;
   };
 
-  FixedHyperClockTable(size_t capacity,
+  FixedHyperClockTable(size_t capacity, bool strict_capacity_limit,
                        CacheMetadataChargePolicy metadata_charge_policy,
                        MemoryAllocator* allocator,
                        const Cache::EvictionCallback* eviction_callback,
@@ -567,14 +635,13 @@ class FixedHyperClockTable : public BaseClockTable {
   bool GrowIfNeeded(size_t new_occupancy, InsertState& state);
 
   HandleImpl* DoInsert(const ClockHandleBasicData& proto,
-                       uint64_t initial_countdown, bool take_ref,
+                       uint32_t initial_countdown, bool take_ref,
                        InsertState& state);
 
   // Runs the clock eviction algorithm trying to reclaim at least
   // requested_charge. Returns how much is evicted, which could be less
   // if it appears impossible to evict the requested amount without blocking.
-  void Evict(size_t requested_charge, InsertState& state, EvictionData* data,
-             uint32_t eviction_effort_cap);
+  void Evict(size_t requested_charge, InsertState& state, EvictionData* data);
 
   HandleImpl* Lookup(const UniqueId64x2& hashed_key);
 
@@ -596,7 +663,7 @@ class FixedHyperClockTable : public BaseClockTable {
   }
 
   // Release N references
-  void TEST_ReleaseN(HandleImpl* handle, size_t n);
+  void TEST_ReleaseN(HandleImpl* handle, uint32_t n);
 #endif
 
   // The load factor p is a real number in (0, 1) such that at all
@@ -757,6 +824,7 @@ class AutoHyperClockTable : public BaseClockTable {
     // chain--specifically the next entry in the chain.
     // * The end of a chain is given a special "end" marker and refers back
     // to the head of the chain.
+    // These decorated pointers use the NextWithShift bit field struct below.
     //
     // Why do we need shift on each pointer? To make Lookup wait-free, we need
     // to be able to query a chain without missing anything, and preferably
@@ -776,47 +844,63 @@ class AutoHyperClockTable : public BaseClockTable {
     // it is normal to see "under construction" entries on the chain, and it
     // is not safe to read their hashed key without either a read reference
     // on the entry or a rewrite lock on the chain.
-
-    // Marker in a "with_shift" head pointer for some thread owning writes
-    // to the chain structure (except for inserts), but only if not an
-    // "end" pointer. Also called the "rewrite lock."
-    static constexpr uint64_t kHeadLocked = uint64_t{1} << 7;
-
-    // Marker in a "with_shift" pointer for the end of a chain. Must also
-    // point back to the head of the chain (with end marker removed).
-    // Also includes the "locked" bit so that attempting to lock an empty
-    // chain has no effect (not needed, as the lock is only needed for
-    // removals).
-    static constexpr uint64_t kNextEndFlags = (uint64_t{1} << 6) | kHeadLocked;
-
-    static inline bool IsEnd(uint64_t next_with_shift) {
-      // Assuming certain values never used, suffices to check this one bit
-      constexpr auto kCheckBit = kNextEndFlags ^ kHeadLocked;
-      return next_with_shift & kCheckBit;
-    }
-
-    // Bottom bits to right shift away to get an array index from a
-    // "with_shift" pointer.
-    static constexpr int kNextShift = 8;
-
-    // A bit mask for the "shift" associated with each "with_shift" pointer.
-    // Always bottommost bits.
-    static constexpr int kShiftMask = 63;
+    struct NextWithShift : public BitFields<uint64_t, NextWithShift> {
+      // The "shift" associated with this decorated pointer (see description
+      // above).
+      using Shift = UnsignedBitField<NextWithShift, 6, NoPrevBitField>;
+      // Marker for the end of a chain. Must also (a) point back to the head of
+      // the chain (with end marker removed), and (b) set the LockedFlag
+      // (below), so that attempting to lock an empty chain has no effect (not
+      // needed, as the lock is only needed for removals).
+      using EndFlag = BoolBitField<NextWithShift, Shift>;
+      // Marker that some thread owning writes to the chain structure (except
+      // for inserts), but only if not an "end" pointer. Also called the
+      // "rewrite lock."
+      using LockedFlag = BoolBitField<NextWithShift, EndFlag>;
+      // The "next" associated with this decorated pointer, which is an index
+      // into the table's array_ (see description above).
+      using Next = UnsignedBitField<NextWithShift, 56, LockedFlag>;
+
+      bool IsLocked() const { return Get<LockedFlag>(); }
+      bool IsEnd() const {
+        // End flag should imply locked flag
+        assert(!Get<EndFlag>() || Get<LockedFlag>());
+        return Get<EndFlag>();
+      }
+      bool IsLockedNotEnd() const {
+        // NOTE: helping GCC to optimize this simpler code:
+        // return IsLocked() && !IsEnd();
+        constexpr U kEndFlag = U{1} << EndFlag::kBitOffset;
+        constexpr U kLockedFlag = U{1} << LockedFlag::kBitOffset;
+        return (underlying & (kEndFlag | kLockedFlag)) == kLockedFlag;
+      }
+      auto GetNext() const { return Get<Next>(); }
+      auto GetShift() const { return Get<Shift>(); }
+
+      static NextWithShift Make(size_t next, int shift) {
+        return NextWithShift{}.With<Next>(next).With<Shift>(
+            static_cast<uint8_t>(shift));
+      }
+
+      static NextWithShift MakeEnd(size_t next, int shift) {
+        return Make(next, shift).With<EndFlag>(true).With<LockedFlag>(true);
+      }
+    };
 
     // A marker for head_next_with_shift that indicates this HandleImpl is
     // heap allocated (standalone) rather than in the table.
-    static constexpr uint64_t kStandaloneMarker = UINT64_MAX;
+    static constexpr NextWithShift kStandaloneMarker{UINT64_MAX};
 
     // A marker for head_next_with_shift indicating the head is not yet part
     // of the usable table, or for chain_next_with_shift indicating that the
     // entry is not present or is not yet part of a chain (must not be
     // "shareable" state).
-    static constexpr uint64_t kUnusedMarker = 0;
+    static constexpr NextWithShift kUnusedMarker{0};
 
     // See above. The head pointer is logically independent of the rest of
     // the entry, including the chain next pointer.
-    AcqRelAtomic<uint64_t> head_next_with_shift{kUnusedMarker};
-    AcqRelAtomic<uint64_t> chain_next_with_shift{kUnusedMarker};
+    BitFieldsAtomic<NextWithShift> head_next_with_shift{kUnusedMarker};
+    BitFieldsAtomic<NextWithShift> chain_next_with_shift{kUnusedMarker};
 
     // For supporting CreateStandalone and some fallback cases.
     inline bool IsStandalone() const {
@@ -841,7 +925,7 @@ class AutoHyperClockTable : public BaseClockTable {
     size_t min_avg_value_size;
   };
 
-  AutoHyperClockTable(size_t capacity,
+  AutoHyperClockTable(size_t capacity, bool strict_capacity_limit,
                       CacheMetadataChargePolicy metadata_charge_policy,
                       MemoryAllocator* allocator,
                       const Cache::EvictionCallback* eviction_callback,
@@ -862,14 +946,13 @@ class AutoHyperClockTable : public BaseClockTable {
   bool GrowIfNeeded(size_t new_occupancy, InsertState& state);
 
   HandleImpl* DoInsert(const ClockHandleBasicData& proto,
-                       uint64_t initial_countdown, bool take_ref,
+                       uint32_t initial_countdown, bool take_ref,
                        InsertState& state);
 
   // Runs the clock eviction algorithm trying to reclaim at least
   // requested_charge. Returns how much is evicted, which could be less
   // if it appears impossible to evict the requested amount without blocking.
-  void Evict(size_t requested_charge, InsertState& state, EvictionData* data,
-             uint32_t eviction_effort_cap);
+  void Evict(size_t requested_charge, InsertState& state, EvictionData* data);
 
   HandleImpl* Lookup(const UniqueId64x2& hashed_key);
 
@@ -891,7 +974,7 @@ class AutoHyperClockTable : public BaseClockTable {
   }
 
   // Release N references
-  void TEST_ReleaseN(HandleImpl* handle, size_t n);
+  void TEST_ReleaseN(HandleImpl* handle, uint32_t n);
 #endif
 
   // Maximum ratio of number of occupied slots to number of usable slots. The
@@ -973,7 +1056,7 @@ class AutoHyperClockTable : public BaseClockTable {
   // To maximize parallelization of Grow() operations, this field is only
   // updated opportunistically after Grow() operations and in DoInsert() where
   // it is found to be out-of-date. See CatchUpLengthInfoNoWait().
-  AcqRelAtomic<uint64_t> length_info_;
+  Atomic<uint64_t> length_info_;
 
   // An already-computed version of the usable length times the max load
   // factor. Could be slightly out of date but GrowIfNeeded()/Grow() handle
@@ -1096,21 +1179,12 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
     return table_.TEST_MutableOccupancyLimit();
   }
   // Acquire/release N references
-  void TEST_RefN(HandleImpl* handle, size_t n);
-  void TEST_ReleaseN(HandleImpl* handle, size_t n);
+  void TEST_RefN(HandleImpl* handle, uint32_t n);
+  void TEST_ReleaseN(HandleImpl* handle, uint32_t n);
 #endif
 
  private:  // data
   Table table_;
-
-  // Maximum total charge of all elements stored in the table.
-  // (Relaxed: eventual consistency/update is OK)
-  RelaxedAtomic<size_t> capacity_;
-
-  // Encodes eviction_effort_cap (bottom 31 bits) and strict_capacity_limit
-  // (top bit). See HyperClockCacheOptions::eviction_effort_cap etc.
-  // (Relaxed: eventual consistency/update is OK)
-  RelaxedAtomic<uint32_t> eec_and_scl_;
 };  // class ClockCacheShard
 
 template <class Table>
diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc
index 4d3d0a2cddf7..d07a099ec872 100644
--- a/cache/compressed_secondary_cache.cc
+++ b/cache/compressed_secondary_cache.cc
@@ -16,6 +16,31 @@
 #include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
+namespace {
+// Format of values in CompressedSecondaryCache:
+// If enable_custom_split_merge:
+//  * A chain of CacheValueChunk representing the sequence of bytes for a tagged
+//    value. The overall length of the tagged value is determined by the chain
+//    of CacheValueChunks.
+// If !enable_custom_split_merge:
+//  * A LengthPrefixedSlice (starts with varint64 size) of a tagged value.
+//
+// A tagged value has a 2-byte header before the "saved" or compressed block
+// data:
+//  * 1 byte for "source" CacheTier indicating which tier is responsible for
+//    compression/decompression.
+//  * 1 byte for compression type which is generated/used by
+//    CompressedSecondaryCache iff source == CacheTier::kVolatileCompressedTier
+//    (original entry passed in was uncompressed). Otherwise, the compression
+//    type is preserved from the entry passed in.
+constexpr uint32_t kTagSize = 2;
+
+// Size of tag + varint size prefix when applicable
+uint32_t GetHeaderSize(size_t data_size, bool enable_split_merge) {
+  return (enable_split_merge ? 0 : VarintLength(kTagSize + data_size)) +
+         kTagSize;
+}
+}  // namespace
 
 CompressedSecondaryCache::CompressedSecondaryCache(
     const CompressedSecondaryCacheOptions& opts)
@@ -24,7 +49,13 @@ CompressedSecondaryCache::CompressedSecondaryCache(
       cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(
           std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
               cache_))),
-      disable_cache_(opts.capacity == 0) {}
+      disable_cache_(opts.capacity == 0) {
+  auto mgr = GetBuiltinV2CompressionManager();
+  compressor_ = mgr->GetCompressor(cache_options_.compression_opts,
+                                   cache_options_.compression_type);
+  decompressor_ =
+      mgr->GetDecompressorOptimizeFor(cache_options_.compression_type);
+}
 
 CompressedSecondaryCache::~CompressedSecondaryCache() = default;
 
@@ -33,13 +64,9 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
     Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
     Statistics* stats, bool& kept_in_sec_cache) {
   assert(helper);
-  // This is a minor optimization. Its ok to skip it in TSAN in order to
-  // avoid a false positive.
-#ifndef __SANITIZE_THREAD__
-  if (disable_cache_) {
+  if (disable_cache_.LoadRelaxed()) {
     return nullptr;
   }
-#endif
 
   std::unique_ptr<SecondaryCacheResultHandle> handle;
   kept_in_sec_cache = false;
@@ -55,75 +82,58 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
     return nullptr;
   }
 
-  CacheAllocationPtr* ptr{nullptr};
-  CacheAllocationPtr merged_value;
-  size_t handle_value_charge{0};
-  const char* data_ptr = nullptr;
-  CacheTier source = CacheTier::kVolatileCompressedTier;
-  CompressionType type = cache_options_.compression_type;
+  std::string merged_value;
+  Slice tagged_data;
   if (cache_options_.enable_custom_split_merge) {
     CacheValueChunk* value_chunk_ptr =
-        reinterpret_cast<CacheValueChunk*>(handle_value);
-    merged_value = MergeChunksIntoValue(value_chunk_ptr, handle_value_charge);
-    ptr = &merged_value;
-    data_ptr = ptr->get();
+        static_cast<CacheValueChunk*>(handle_value);
+    merged_value = MergeChunksIntoValue(value_chunk_ptr);
+    tagged_data = Slice(merged_value);
   } else {
-    uint32_t type_32 = static_cast<uint32_t>(type);
-    uint32_t source_32 = static_cast<uint32_t>(source);
-    ptr = reinterpret_cast<CacheAllocationPtr*>(handle_value);
-    handle_value_charge = cache_->GetCharge(lru_handle);
-    data_ptr = ptr->get();
-    data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1,
-                              static_cast<uint32_t*>(&type_32));
-    type = static_cast<CompressionType>(type_32);
-    data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1,
-                              static_cast<uint32_t*>(&source_32));
-    source = static_cast<CacheTier>(source_32);
-    uint64_t data_size = 0;
-    data_ptr = GetVarint64Ptr(data_ptr, ptr->get() + handle_value_charge,
-                              static_cast<uint64_t*>(&data_size));
-    assert(handle_value_charge > data_size);
-    handle_value_charge = data_size;
+    tagged_data = GetLengthPrefixedSlice(static_cast<char*>(handle_value));
   }
-  MemoryAllocator* allocator = cache_options_.memory_allocator.get();
 
-  Status s;
-  Cache::ObjectPtr value{nullptr};
-  size_t charge{0};
+  auto source = lossless_cast<CacheTier>(tagged_data[0]);
+  auto type = lossless_cast<CompressionType>(tagged_data[1]);
+
+  std::unique_ptr<char[]> uncompressed;
+  Slice saved(tagged_data.data() + kTagSize, tagged_data.size() - kTagSize);
   if (source == CacheTier::kVolatileCompressedTier) {
-    if (cache_options_.compression_type == kNoCompression ||
-        cache_options_.do_not_compress_roles.Contains(helper->role)) {
-      s = helper->create_cb(Slice(data_ptr, handle_value_charge),
-                            kNoCompression, CacheTier::kVolatileTier,
-                            create_context, allocator, &value, &charge);
-    } else {
-      UncompressionContext uncompression_context(
-          cache_options_.compression_type);
-      UncompressionInfo uncompression_info(uncompression_context,
-                                           UncompressionDict::GetEmptyDict(),
-                                           cache_options_.compression_type);
-
-      size_t uncompressed_size{0};
-      CacheAllocationPtr uncompressed =
-          UncompressData(uncompression_info, (char*)data_ptr,
-                         handle_value_charge, &uncompressed_size,
-                         cache_options_.compress_format_version, allocator);
-
-      if (!uncompressed) {
+    if (type != kNoCompression) {
+      // TODO: can we do something to avoid yet another allocation?
+      Decompressor::Args args;
+      args.compressed_data = saved;
+      args.compression_type = type;
+      Status s = decompressor_->ExtractUncompressedSize(args);
+      assert(s.ok());  // in-memory data
+      if (s.ok()) {
+        uncompressed = std::make_unique<char[]>(args.uncompressed_size);
+        s = decompressor_->DecompressBlock(args, uncompressed.get());
+        assert(s.ok());  // in-memory data
+      }
+      if (!s.ok()) {
         cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
         return nullptr;
       }
-      s = helper->create_cb(Slice(uncompressed.get(), uncompressed_size),
-                            kNoCompression, CacheTier::kVolatileTier,
-                            create_context, allocator, &value, &charge);
+      saved = Slice(uncompressed.get(), args.uncompressed_size);
+      type = kNoCompression;
+      // Free temporary compressed data as early as we can. This could matter
+      // for unusually large blocks because we also have
+      // * Another compressed copy above (from lru_cache).
+      // * The uncompressed copy in `uncompressed`.
+      // * Another uncompressed copy in `result_value` below.
+      // Let's try to max out at 3 copies instead of 4.
+      merged_value = std::string();
     }
-  } else {
-    // The item was not compressed by us. Let the helper create_cb
-    // uncompress it
-    s = helper->create_cb(Slice(data_ptr, handle_value_charge), type, source,
-                          create_context, allocator, &value, &charge);
+    // Reduced as if it came from primary cache
+    source = CacheTier::kVolatileTier;
   }
 
+  Cache::ObjectPtr result_value = nullptr;
+  size_t result_charge = 0;
+  Status s = helper->create_cb(saved, type, source, create_context,
+                               cache_options_.memory_allocator.get(),
+                               &result_value, &result_charge);
   if (!s.ok()) {
     cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
     return nullptr;
@@ -141,7 +151,8 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
     kept_in_sec_cache = true;
     cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
   }
-  handle.reset(new CompressedSecondaryCacheResultHandle(value, charge));
+  handle.reset(
+      new CompressedSecondaryCacheResultHandle(result_value, result_charge));
   RecordTick(stats, COMPRESSED_SECONDARY_CACHE_HITS);
   return handle;
 }
@@ -164,88 +175,111 @@ bool CompressedSecondaryCache::MaybeInsertDummy(const Slice& key) {
 
 Status CompressedSecondaryCache::InsertInternal(
     const Slice& key, Cache::ObjectPtr value,
-    const Cache::CacheItemHelper* helper, CompressionType type,
+    const Cache::CacheItemHelper* helper, CompressionType from_type,
     CacheTier source) {
-  if (source != CacheTier::kVolatileCompressedTier &&
-      cache_options_.enable_custom_split_merge) {
-    // We don't support custom split/merge for the tiered case
-    return Status::OK();
-  }
-
-  auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge);
-  char header[20];
-  char* payload = header;
-  payload = EncodeVarint32(payload, static_cast<uint32_t>(type));
-  payload = EncodeVarint32(payload, static_cast<uint32_t>(source));
-  size_t data_size = (*helper->size_cb)(value);
-  char* data_size_ptr = payload;
-  payload = EncodeVarint64(payload, data_size);
-
-  size_t header_size = payload - header;
-  size_t total_size = data_size + header_size;
-  CacheAllocationPtr ptr =
-      AllocateBlock(total_size, cache_options_.memory_allocator.get());
-  char* data_ptr = ptr.get() + header_size;
-
-  Status s = (*helper->saveto_cb)(value, 0, data_size, data_ptr);
+  bool enable_split_merge = cache_options_.enable_custom_split_merge;
+  const Cache::CacheItemHelper* internal_helper = GetHelper(enable_split_merge);
+
+  // TODO: variant of size_cb that also returns a pointer to the data if
+  // already available. Saves an allocation if we keep the compressed version.
+  const size_t data_size_original = (*helper->size_cb)(value);
+
+  // Allocate enough memory for header/tag + original data because (a) we might
+  // not be attempting compression at all, and (b) we might keep the original if
+  // compression is insufficient. But we don't need the length prefix with
+  // enable_split_merge. TODO: be smarter with CacheValueChunk to save an
+  // allocation in the enable_split_merge case.
+  size_t header_size = GetHeaderSize(data_size_original, enable_split_merge);
+  CacheAllocationPtr allocation = AllocateBlock(
+      header_size + data_size_original, cache_options_.memory_allocator.get());
+  char* data_ptr = allocation.get() + header_size;
+  Slice tagged_data(data_ptr - kTagSize, data_size_original + kTagSize);
+  assert(tagged_data.data() >= allocation.get());
+
+  Status s = (*helper->saveto_cb)(value, 0, data_size_original, data_ptr);
   if (!s.ok()) {
     return s;
   }
-  Slice val(data_ptr, data_size);
 
-  std::string compressed_val;
-  if (cache_options_.compression_type != kNoCompression &&
-      type == kNoCompression &&
+  std::unique_ptr<char[]> tagged_compressed_data;
+  CompressionType to_type = kNoCompression;
+  if (compressor_ && from_type == kNoCompression &&
       !cache_options_.do_not_compress_roles.Contains(helper->role)) {
-    PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, data_size);
-    CompressionContext compression_context(cache_options_.compression_type,
-                                           cache_options_.compression_opts);
-    uint64_t sample_for_compression{0};
-    CompressionInfo compression_info(
-        cache_options_.compression_opts, compression_context,
-        CompressionDict::GetEmptyDict(), cache_options_.compression_type,
-        sample_for_compression);
-
-    bool success =
-        CompressData(val, compression_info,
-                     cache_options_.compress_format_version, &compressed_val);
-
-    if (!success) {
-      return Status::Corruption("Error compressing value.");
+    assert(source == CacheTier::kVolatileCompressedTier);
+
+    // TODO: consider malloc sizes for max acceptable compressed size
+    // Or maybe max_compressed_bytes_per_kb
+    size_t data_size_compressed = data_size_original - 1;
+    tagged_compressed_data =
+        std::make_unique<char[]>(data_size_compressed + kTagSize);
+    s = compressor_->CompressBlock(Slice(data_ptr, data_size_original),
+                                   tagged_compressed_data.get() + kTagSize,
+                                   &data_size_compressed, &to_type,
+                                   nullptr /*working_area*/);
+    if (!s.ok()) {
+      return s;
     }
-
-    val = Slice(compressed_val);
-    data_size = compressed_val.size();
-    payload = EncodeVarint64(data_size_ptr, data_size);
-    header_size = payload - header;
-    total_size = header_size + data_size;
-    PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, data_size);
-
-    if (!cache_options_.enable_custom_split_merge) {
-      ptr = AllocateBlock(total_size, cache_options_.memory_allocator.get());
-      data_ptr = ptr.get() + header_size;
-      memcpy(data_ptr, compressed_val.data(), data_size);
+    PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes,
+                     data_size_original);
+    if (to_type == kNoCompression) {
+      // Compression rejected or otherwise aborted/failed
+      to_type = kNoCompression;
+      tagged_compressed_data.reset();
+      // TODO: consider separate counters for rejected compressions
+      PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes,
+                       data_size_original);
+    } else {
+      PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes,
+                       data_size_compressed);
+      if (enable_split_merge) {
+        // Only need tagged_data for copying into CacheValueChunks.
+        tagged_data = Slice(tagged_compressed_data.get(),
+                            data_size_compressed + kTagSize);
+        allocation.reset();
+      } else {
+        // Replace allocation with compressed version, copied from string
+        header_size = GetHeaderSize(data_size_compressed, enable_split_merge);
+        allocation = AllocateBlock(header_size + data_size_compressed,
+                                   cache_options_.memory_allocator.get());
+        data_ptr = allocation.get() + header_size;
+        // Ignore unpopulated tag on tagged_compressed_data; will only be
+        // populated on the new allocation.
+        std::memcpy(data_ptr, tagged_compressed_data.get() + kTagSize,
+                    data_size_compressed);
+        tagged_data =
+            Slice(data_ptr - kTagSize, data_size_compressed + kTagSize);
+        assert(tagged_data.data() >= allocation.get());
+      }
     }
   }
 
   PERF_COUNTER_ADD(compressed_sec_cache_insert_real_count, 1);
-  if (cache_options_.enable_custom_split_merge) {
+
+  // Save the tag fields
+  const_cast<char*>(tagged_data.data())[0] = lossless_cast<char>(source);
+  const_cast<char*>(tagged_data.data())[1] = lossless_cast<char>(
+      source == CacheTier::kVolatileCompressedTier ? to_type : from_type);
+
+  if (enable_split_merge) {
     size_t split_charge{0};
-    CacheValueChunk* value_chunks_head = SplitValueIntoChunks(
-        val, cache_options_.compression_type, split_charge);
-    return cache_->Insert(key, value_chunks_head, internal_helper,
-                          split_charge);
+    CacheValueChunk* value_chunks_head =
+        SplitValueIntoChunks(tagged_data, split_charge);
+    s = cache_->Insert(key, value_chunks_head, internal_helper, split_charge);
+    assert(s.ok());  // LRUCache::Insert() with handle==nullptr always OK
   } else {
+    // Save the size prefix
+    char* ptr = allocation.get();
+    ptr = EncodeVarint64(ptr, tagged_data.size());
+    assert(ptr == tagged_data.data());
 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
-    size_t charge = malloc_usable_size(ptr.get());
+    size_t charge = malloc_usable_size(allocation.get());
 #else
-    size_t charge = total_size;
+    size_t charge = tagged_data.size();
 #endif
-    std::memcpy(ptr.get(), header, header_size);
-    CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr));
-    charge += sizeof(CacheAllocationPtr);
-    return cache_->Insert(key, buf, internal_helper, charge);
+    s = cache_->Insert(key, allocation.release(), internal_helper, charge);
+    assert(s.ok());  // LRUCache::Insert() with handle==nullptr always OK
   }
+  return Status::OK();
 }
 
 Status CompressedSecondaryCache::Insert(const Slice& key,
@@ -267,7 +301,17 @@ Status CompressedSecondaryCache::Insert(const Slice& key,
 Status CompressedSecondaryCache::InsertSaved(
     const Slice& key, const Slice& saved, CompressionType type = kNoCompression,
     CacheTier source = CacheTier::kVolatileTier) {
+  if (source == CacheTier::kVolatileCompressedTier) {
+    // Unexpected, would violate InsertInternal preconditions
+    assert(source != CacheTier::kVolatileCompressedTier);
+    return Status::OK();
+  }
   if (type == kNoCompression) {
+    // Not currently supported (why?)
+    return Status::OK();
+  }
+  if (cache_options_.enable_custom_split_merge) {
+    // We don't support custom split/merge for the tiered case (why?)
     return Status::OK();
   }
 
@@ -287,7 +331,7 @@ Status CompressedSecondaryCache::SetCapacity(size_t capacity) {
   MutexLock l(&capacity_mutex_);
   cache_options_.capacity = capacity;
   cache_->SetCapacity(capacity);
-  disable_cache_ = capacity == 0;
+  disable_cache_.StoreRelaxed(capacity == 0);
   return Status::OK();
 }
 
@@ -311,15 +355,17 @@ std::string CompressedSecondaryCache::GetPrintableOptions() const {
                const_cast<CompressionOptions&>(cache_options_.compression_opts))
                .c_str());
   ret.append(buffer);
-  snprintf(buffer, kBufferSize, "    compress_format_version : %d\n",
-           cache_options_.compress_format_version);
-  ret.append(buffer);
   return ret;
 }
 
+// FIXME: this could use a lot of attention, including:
+// * Use allocator
+// * We shouldn't be worse than non-split; be more pro-actively aware of
+// internal fragmentation
+// * Consider a unified object/chunk structure that may or may not split
+// * Optimize size overhead of chunks
 CompressedSecondaryCache::CacheValueChunk*
 CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value,
-                                               CompressionType compression_type,
                                                size_t& charge) {
   assert(!value.empty());
   const char* src_ptr = value.data();
@@ -340,15 +386,14 @@ CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value,
     // size, or there is no compression.
     if (upper == malloc_bin_sizes_.begin() ||
         upper == malloc_bin_sizes_.end() ||
-        *upper - predicted_chunk_size < malloc_bin_sizes_.front() ||
-        compression_type == kNoCompression) {
+        *upper - predicted_chunk_size < malloc_bin_sizes_.front()) {
       tmp_size = predicted_chunk_size;
     } else {
       tmp_size = *(--upper);
     }
 
     CacheValueChunk* new_chunk =
-        reinterpret_cast<CacheValueChunk*>(new char[tmp_size]);
+        static_cast<CacheValueChunk*>(static_cast<void*>(new char[tmp_size]));
     current_chunk->next = new_chunk;
     current_chunk = current_chunk->next;
     actual_chunk_size = tmp_size - sizeof(CacheValueChunk) + 1;
@@ -363,28 +408,24 @@ CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value,
   return dummy_head.next;
 }
 
-CacheAllocationPtr CompressedSecondaryCache::MergeChunksIntoValue(
-    const void* chunks_head, size_t& charge) {
-  const CacheValueChunk* head =
-      reinterpret_cast<const CacheValueChunk*>(chunks_head);
+std::string CompressedSecondaryCache::MergeChunksIntoValue(
+    const CacheValueChunk* head) {
   const CacheValueChunk* current_chunk = head;
-  charge = 0;
+  size_t total_size = 0;
   while (current_chunk != nullptr) {
-    charge += current_chunk->size;
+    total_size += current_chunk->size;
     current_chunk = current_chunk->next;
   }
 
-  CacheAllocationPtr ptr =
-      AllocateBlock(charge, cache_options_.memory_allocator.get());
+  std::string result;
+  result.reserve(total_size);
   current_chunk = head;
-  size_t pos{0};
   while (current_chunk != nullptr) {
-    memcpy(ptr.get() + pos, current_chunk->data, current_chunk->size);
-    pos += current_chunk->size;
+    result.append(current_chunk->data, current_chunk->size);
     current_chunk = current_chunk->next;
   }
-
-  return ptr;
+  assert(result.size() == total_size);
+  return result;
 }
 
 const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(
@@ -398,16 +439,16 @@ const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(
             CacheValueChunk* tmp_chunk = chunks_head;
             chunks_head = chunks_head->next;
             tmp_chunk->Free();
-            obj = nullptr;
           }
         }};
     return &kHelper;
   } else {
     static const Cache::CacheItemHelper kHelper{
         CacheEntryRole::kMisc,
-        [](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {
-          delete static_cast<CacheAllocationPtr*>(obj);
-          obj = nullptr;
+        [](Cache::ObjectPtr obj, MemoryAllocator* alloc) {
+          if (obj != nullptr) {
+            CacheAllocationDeleter{alloc}(static_cast<char*>(obj));
+          }
         }};
     return &kHelper;
   }
@@ -418,12 +459,7 @@ size_t CompressedSecondaryCache::TEST_GetCharge(const Slice& key) {
   if (lru_handle == nullptr) {
     return 0;
   }
-
   size_t charge = cache_->GetCharge(lru_handle);
-  if (cache_->Value(lru_handle) != nullptr &&
-      !cache_options_.enable_custom_split_merge) {
-    charge -= 10;
-  }
   cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
   return charge;
 }
diff --git a/cache/compressed_secondary_cache.h b/cache/compressed_secondary_cache.h
index 45eab656e44f..52b3d84b6dda 100644
--- a/cache/compressed_secondary_cache.h
+++ b/cache/compressed_secondary_cache.h
@@ -10,13 +10,12 @@
 #include <memory>
 
 #include "cache/cache_reservation_manager.h"
-#include "cache/lru_cache.h"
 #include "memory/memory_allocator_impl.h"
+#include "rocksdb/advanced_compression.h"
 #include "rocksdb/secondary_cache.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
-#include "util/compression.h"
-#include "util/mutexlock.h"
+#include "util/atomic.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -124,14 +123,9 @@ class CompressedSecondaryCache : public SecondaryCache {
   // Split value into chunks to better fit into jemalloc bins. The chunks
   // are stored in CacheValueChunk and extra charge is needed for each chunk,
   // so the cache charge is recalculated here.
-  CacheValueChunk* SplitValueIntoChunks(const Slice& value,
-                                        CompressionType compression_type,
-                                        size_t& charge);
+  CacheValueChunk* SplitValueIntoChunks(const Slice& value, size_t& charge);
 
-  // After merging chunks, the extra charge for each chunk is removed, so
-  // the charge is recalculated.
-  CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head,
-                                          size_t& charge);
+  std::string MergeChunksIntoValue(const CacheValueChunk* head);
 
   bool MaybeInsertDummy(const Slice& key);
 
@@ -145,9 +139,11 @@ class CompressedSecondaryCache : public SecondaryCache {
   const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const;
   std::shared_ptr<Cache> cache_;
   CompressedSecondaryCacheOptions cache_options_;
+  std::unique_ptr<Compressor> compressor_;
+  std::shared_ptr<Decompressor> decompressor_;
   mutable port::Mutex capacity_mutex_;
   std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
-  bool disable_cache_;
+  RelaxedAtomic<bool> disable_cache_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/cache/compressed_secondary_cache_test.cc b/cache/compressed_secondary_cache_test.cc
index df319390eedb..845df62f72c0 100644
--- a/cache/compressed_secondary_cache_test.cc
+++ b/cache/compressed_secondary_cache_test.cc
@@ -24,6 +24,14 @@ namespace ROCKSDB_NAMESPACE {
 using secondary_cache_test_util::GetTestingCacheTypes;
 using secondary_cache_test_util::WithCacheType;
 
+// Read and reset a statistic
+template <typename T>
+T Pop(T& var) {
+  T ret = var;
+  var = T();
+  return ret;
+}
+
 // 16 bytes for HCC compatibility
 const std::string key0 = "____    ____key0";
 const std::string key1 = "____    ____key1";
@@ -51,7 +59,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
 
     Random rnd(301);
     // Insert and Lookup the item k1 for the first time.
-    std::string str1(rnd.RandomString(1000));
+    std::string str1 = test::CompressibleString(&rnd, 0.5, 1000);
     TestItem item1(str1.data(), str1.length());
     // A dummy handle is inserted if the item is inserted for the first time.
     ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false));
@@ -68,7 +76,14 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
 
-    ASSERT_GT(comp_sec_cache->TEST_GetCharge(key1), 1000);
+    if (sec_cache_is_compressed) {
+      ASSERT_GT(comp_sec_cache->TEST_GetCharge(key1), str1.length() / 4);
+      ASSERT_LT(comp_sec_cache->TEST_GetCharge(key1), str1.length() * 3 / 4);
+    } else {
+      ASSERT_GE(comp_sec_cache->TEST_GetCharge(key1), str1.length());
+      // NOTE: split-merge is worse (1048 vs. 1024)
+      ASSERT_LE(comp_sec_cache->TEST_GetCharge(key1), 1048U);
+    }
 
     std::unique_ptr<SecondaryCacheResultHandle> handle1_2 =
         sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true,
@@ -76,10 +91,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_NE(handle1_2, nullptr);
     ASSERT_FALSE(kept_in_sec_cache);
     if (sec_cache_is_compressed) {
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
-                1000);
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
-                1007);
+      ASSERT_EQ(
+          Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes),
+          str1.length());
+      ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                str1.length() * 3 / 4);
+      ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes),
+                str1.length() / 4);
     } else {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@@ -97,7 +115,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_EQ(handle1_3, nullptr);
 
     // Insert and Lookup the item k2.
-    std::string str2(rnd.RandomString(1000));
+    std::string str2 = test::CompressibleString(&rnd, 0.5, 1017);
     TestItem item2(str2.data(), str2.length());
     ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
@@ -109,10 +127,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
     if (sec_cache_is_compressed) {
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
-                2000);
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
-                2014);
+      ASSERT_EQ(
+          Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes),
+          str2.length());
+      ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                str2.length() * 3 / 4);
+      ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes),
+                str2.length() / 4);
     } else {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@@ -126,9 +147,48 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_NE(val2, nullptr);
     ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);
 
+    // Release handles
     std::vector<SecondaryCacheResultHandle*> handles = {handle1_2.get(),
                                                         handle2_2.get()};
     sec_cache->WaitAll(handles);
+    handle1_2.reset();
+    handle2_2.reset();
+
+    // Insert and Lookup a non-compressible item k3.
+    std::string str3 = rnd.RandomBinaryString(480);
+    TestItem item3(str3.data(), str3.length());
+    ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelper(), false));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
+    std::unique_ptr<SecondaryCacheResultHandle> handle3_1 =
+        sec_cache->Lookup(key3, GetHelper(), this, true, /*advise_erase=*/false,
+                          /*stats=*/nullptr, kept_in_sec_cache);
+    ASSERT_EQ(handle3_1, nullptr);
+
+    ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelper(), false));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 3);
+    if (sec_cache_is_compressed) {
+      // TODO: consider a compression rejected stat?
+      ASSERT_EQ(
+          Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes),
+          str3.length());
+      ASSERT_EQ(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes),
+                str3.length());
+    } else {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+    }
+
+    std::unique_ptr<SecondaryCacheResultHandle> handle3_2 =
+        sec_cache->Lookup(key3, GetHelper(), this, true, /*advise_erase=*/false,
+                          /*stats=*/nullptr, kept_in_sec_cache);
+    ASSERT_NE(handle3_2, nullptr);
+    std::unique_ptr<TestItem> val3 =
+        std::unique_ptr<TestItem>(static_cast<TestItem*>(handle3_2->Value()));
+    ASSERT_NE(val3, nullptr);
+    ASSERT_EQ(memcmp(val3->Buf(), item3.Buf(), item3.Size()), 0);
+
+    EXPECT_GE(comp_sec_cache->TEST_GetCharge(key3), str3.length());
+    EXPECT_LE(comp_sec_cache->TEST_GetCharge(key3), 512);
 
     sec_cache.reset();
   }
@@ -178,8 +238,9 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
       secondary_cache_opts.compression_type = CompressionType::kNoCompression;
     }
 
-    secondary_cache_opts.capacity = 1100;
+    secondary_cache_opts.capacity = 1400;
     secondary_cache_opts.num_shard_bits = 0;
+    secondary_cache_opts.strict_capacity_limit = true;
     std::shared_ptr<SecondaryCache> sec_cache =
         NewCompressedSecondaryCache(secondary_cache_opts);
 
@@ -193,7 +254,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false));
 
     // Insert and Lookup the second item.
-    std::string str2(rnd.RandomString(200));
+    std::string str2(rnd.RandomString(500));
     TestItem item2(str2.data(), str2.length());
     // Insert a dummy handle, k1 is not evicted.
     ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false));
@@ -201,16 +262,23 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     std::unique_ptr<SecondaryCacheResultHandle> handle1 =
         sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
                           /*stats=*/nullptr, kept_in_sec_cache);
-    ASSERT_EQ(handle1, nullptr);
+    ASSERT_NE(handle1, nullptr);
+    std::unique_ptr<TestItem> val1{static_cast<TestItem*>(handle1->Value())};
+    ASSERT_NE(val1, nullptr);
+    ASSERT_EQ(val1->ToString(), str1);
+    handle1.reset();
 
     // Insert k2 and k1 is evicted.
     ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false));
+    handle1 =
+        sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
+                          /*stats=*/nullptr, kept_in_sec_cache);
+    ASSERT_EQ(handle1, nullptr);
     std::unique_ptr<SecondaryCacheResultHandle> handle2 =
         sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false,
                           /*stats=*/nullptr, kept_in_sec_cache);
     ASSERT_NE(handle2, nullptr);
-    std::unique_ptr<TestItem> val2 =
-        std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2->Value()));
+    std::unique_ptr<TestItem> val2{static_cast<TestItem*>(handle2->Value())};
     ASSERT_NE(val2, nullptr);
     ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);
 
@@ -232,7 +300,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     // Save Fails.
     std::string str3 = rnd.RandomString(10);
     TestItem item3(str3.data(), str3.length());
-    // The Status is OK because a dummy handle is inserted.
+    // The first Status is OK because a dummy handle is inserted.
     ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelperFail(), false));
     ASSERT_NOK(sec_cache->Insert(key3, &item3, GetHelperFail(), false));
 
@@ -265,11 +333,11 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
 
     get_perf_context()->Reset();
     Random rnd(301);
-    std::string str1 = rnd.RandomString(1001);
+    std::string str1 = test::CompressibleString(&rnd, 0.5, 1001);
     auto item1_1 = new TestItem(str1.data(), str1.length());
     ASSERT_OK(cache->Insert(key1, item1_1, GetHelper(), str1.length()));
 
-    std::string str2 = rnd.RandomString(1012);
+    std::string str2 = test::CompressibleString(&rnd, 0.5, 1012);
     auto item2_1 = new TestItem(str2.data(), str2.length());
     // After this Insert, primary cache contains k2 and secondary cache contains
     // k1's dummy item.
@@ -278,7 +346,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
 
-    std::string str3 = rnd.RandomString(1024);
+    std::string str3 = test::CompressibleString(&rnd, 0.5, 1024);
     auto item3_1 = new TestItem(str3.data(), str3.length());
     // After this Insert, primary cache contains k3 and secondary cache contains
     // k1's dummy item and k2's dummy item.
@@ -297,10 +365,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_OK(cache->Insert(key2, item2_2, GetHelper(), str2.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
     if (sec_cache_is_compressed) {
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
+      ASSERT_EQ(
+          Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes),
+          str1.length());
+      ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes,
                 str1.length());
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
-                1008);
+      ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes),
+                str1.length() / 10);
     } else {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@@ -312,10 +383,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_OK(cache->Insert(key3, item3_2, GetHelper(), str3.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
     if (sec_cache_is_compressed) {
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
-                str1.length() + str2.length());
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
-                2027);
+      ASSERT_EQ(
+          Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes),
+          str2.length());
+      ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                str2.length());
+      ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes),
+                str2.length() / 10);
     } else {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@@ -641,8 +715,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     size_t str_size{8500};
     std::string str = rnd.RandomString(static_cast<int>(str_size));
     size_t charge{0};
-    CacheValueChunk* chunks_head =
-        sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge);
+    CacheValueChunk* chunks_head = sec_cache->SplitValueIntoChunks(str, charge);
     ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1));
 
     CacheValueChunk* current_chunk = chunks_head;
@@ -688,12 +761,9 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     std::unique_ptr<CompressedSecondaryCache> sec_cache =
         std::make_unique<CompressedSecondaryCache>(
             CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0));
-    size_t charge{0};
-    CacheAllocationPtr value =
-        sec_cache->MergeChunksIntoValue(chunks_head, charge);
-    ASSERT_EQ(charge, size1 + size2 + size3);
-    std::string value_str{value.get(), charge};
-    ASSERT_EQ(strcmp(value_str.data(), str.data()), 0);
+    std::string value_str = sec_cache->MergeChunksIntoValue(chunks_head);
+    ASSERT_EQ(value_str.size(), size1 + size2 + size3);
+    ASSERT_EQ(value_str, str);
 
     while (chunks_head != nullptr) {
       CacheValueChunk* tmp_chunk = chunks_head;
@@ -725,15 +795,12 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     size_t str_size{8500};
     std::string str = rnd.RandomString(static_cast<int>(str_size));
     size_t charge{0};
-    CacheValueChunk* chunks_head =
-        sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge);
+    CacheValueChunk* chunks_head = sec_cache->SplitValueIntoChunks(str, charge);
     ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1));
 
-    CacheAllocationPtr value =
-        sec_cache->MergeChunksIntoValue(chunks_head, charge);
-    ASSERT_EQ(charge, str_size);
-    std::string value_str{value.get(), charge};
-    ASSERT_EQ(strcmp(value_str.data(), str.data()), 0);
+    std::string value_str = sec_cache->MergeChunksIntoValue(chunks_head);
+    ASSERT_EQ(value_str.size(), str_size);
+    ASSERT_EQ(value_str, str);
 
     sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr);
   }
@@ -789,8 +856,7 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, BasicTestFromString) {
     if (LZ4_Supported()) {
       sec_cache_uri =
           "compressed_secondary_cache://"
-          "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;"
-          "compress_format_version=2";
+          "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression";
     } else {
       ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
       sec_cache_uri =
@@ -821,7 +887,7 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
       sec_cache_uri =
           "compressed_secondary_cache://"
           "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;"
-          "compress_format_version=2;enable_custom_split_merge=true";
+          "enable_custom_split_merge=true";
     } else {
       ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
       sec_cache_uri =
@@ -896,8 +962,8 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, EntryRoles) {
 
   std::shared_ptr<SecondaryCache> sec_cache = NewCompressedSecondaryCache(opts);
 
-  // Fixed seed to ensure consistent compressibility (doesn't compress)
-  std::string junk(Random(301).RandomString(1000));
+  Random rnd(301);
+  std::string junk = test::CompressibleString(&rnd, 0.5, 1000);
 
   for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
     CacheEntryRole role = static_cast<CacheEntryRole>(i);
@@ -930,9 +996,11 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, EntryRoles) {
         sec_cache_is_compressed_ && !do_not_compress.Contains(role);
     if (compressed) {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
-                1000);
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
-                1007);
+                junk.length());
+      ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                junk.length() * 3 / 4);
+      ASSERT_GT(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                junk.length() / 4);
     } else {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc
index 7a1f18ed6f53..c9b4393dd274 100644
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -1405,9 +1405,9 @@ TEST_P(BasicSecondaryCacheTest, SaveFailTest) {
   TestItem* item1 = new TestItem(str1.data(), str1.length());
   ASSERT_OK(cache->Insert(k1.AsSlice(), item1, GetHelperFail(), str1.length()));
   std::string str2 = rnd.RandomString(1020);
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
   TestItem* item2 = new TestItem(str2.data(), str2.length());
   // k1 should be demoted to NVM
-  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
   ASSERT_OK(cache->Insert(k2.AsSlice(), item2, GetHelperFail(), str2.length()));
   ASSERT_EQ(secondary_cache->num_inserts(), 1u);
 
@@ -1503,7 +1503,7 @@ TEST_P(BasicSecondaryCacheTest, FullCapacityTest) {
         /*context*/ this, Cache::Priority::LOW);
     ASSERT_EQ(handle1, nullptr);
 
-    // k1 promotion can fail with strict_capacit_limit=true, but Lookup still
+    // k1 promotion can fail with strict_capacity_limit=true, but Lookup still
     // succeeds using a standalone handle
     handle1 = cache->Lookup(k1.AsSlice(), GetHelper(),
                             /*context*/ this, Cache::Priority::LOW);
@@ -1680,7 +1680,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) {
   // After Flush is successful, RocksDB will do the paranoid check for the new
   // SST file. Meta blocks are always cached in the block cache and they
   // will not be evicted. When block_2 is cache miss and read out, it is
-  // inserted to the block cache. Thefore, block_1 is evicted from block
+  // inserted to the block cache. Therefore, block_1 is evicted from block
   // cache and successfully inserted to the secondary cache. Here are 2
   // lookups in the secondary cache for block_1 and block_2.
   ASSERT_EQ(secondary_cache->num_inserts(), 1u);
@@ -1721,7 +1721,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) {
   v = Get(Key(0));
   ASSERT_EQ(1007, v.size());
   // This Get needs to access block_1, since block_1 is not in block cache
-  // there is one econdary cache lookup. Then, block_1 is cached in the
+  // there is one secondary cache lookup. Then, block_1 is cached in the
   // block cache.
   ASSERT_EQ(secondary_cache->num_inserts(), 2u);
   ASSERT_EQ(secondary_cache->num_lookups(), 5u);
@@ -1785,7 +1785,7 @@ TEST_P(DBSecondaryCacheTest, NoSecondaryCacheInsertion) {
   std::string v = Get(Key(0));
   ASSERT_EQ(1000, v.size());
   // Since the block cache is large enough, all the blocks are cached. we
-  // do not need to lookup the seondary cache.
+  // do not need to lookup the secondary cache.
   ASSERT_EQ(secondary_cache->num_inserts(), 0u);
   ASSERT_EQ(secondary_cache->num_lookups(), 2u);
 
@@ -2150,7 +2150,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) {
   ASSERT_OK(Flush());
   Compact("a", "z");
 
-  // do th eread for all the key value pairs, so all the blocks should be in
+  // do the read for all the key value pairs, so all the blocks should be in
   // cache
   uint32_t start_insert = cache->GetInsertCount();
   uint32_t start_lookup = cache->GetLookupcount();
@@ -2179,7 +2179,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) {
                             &cache_dumper);
   ASSERT_OK(s);
   std::vector<DB*> db_list;
-  db_list.push_back(db_);
+  db_list.push_back(db_.get());
   s = cache_dumper->SetDumpFilter(db_list);
   ASSERT_OK(s);
   s = cache_dumper->DumpCacheEntriesToWriter();
@@ -2263,11 +2263,11 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) {
   options.env = fault_env_.get();
   std::string dbname1 = test::PerThreadDBPath("db_1");
   ASSERT_OK(DestroyDB(dbname1, options));
-  DB* db1 = nullptr;
+  std::unique_ptr<DB> db1;
   ASSERT_OK(DB::Open(options, dbname1, &db1));
   std::string dbname2 = test::PerThreadDBPath("db_2");
   ASSERT_OK(DestroyDB(dbname2, options));
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   ASSERT_OK(DB::Open(options, dbname2, &db2));
   fault_fs_->SetFailGetUniqueId(true);
 
@@ -2335,7 +2335,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) {
                             &cache_dumper);
   ASSERT_OK(s);
   std::vector<DB*> db_list;
-  db_list.push_back(db1);
+  db_list.push_back(db1.get());
   s = cache_dumper->SetDumpFilter(db_list);
   ASSERT_OK(s);
   s = cache_dumper->DumpCacheEntriesToWriter();
@@ -2377,7 +2377,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) {
   ASSERT_OK(s);
 
   ASSERT_OK(db1->Close());
-  delete db1;
+  db1.reset();
   ASSERT_OK(DB::Open(options, dbname1, &db1));
 
   // After load, we do the Get again. To validate the cache, we do not allow any
@@ -2406,8 +2406,8 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) {
   ASSERT_EQ(256, static_cast<int>(block_lookup));
   fault_fs_->SetFailGetUniqueId(false);
   fault_fs_->SetFilesystemActive(true);
-  delete db1;
-  delete db2;
+  db1.reset();
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname1, options));
   ASSERT_OK(DestroyDB(dbname2, options));
 }
@@ -2464,7 +2464,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) {
   std::string v = Get(Key(0));
   ASSERT_EQ(1007, v.size());
 
-  // Check the data in first block. Cache miss, direclty read from SST file.
+  // Check the data in first block. Cache miss, directly read from SST file.
   ASSERT_EQ(secondary_cache->num_inserts(), 0u);
   ASSERT_EQ(secondary_cache->num_lookups(), 0u);
 
@@ -2598,7 +2598,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) {
 }
 
 // Two DB test. We create 2 DBs sharing the same block cache and secondary
-// cache. We diable the secondary cache option for DB2.
+// cache. We disable the secondary cache option for DB2.
 TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) {
   if (IsHyperClock()) {
     ROCKSDB_GTEST_BYPASS("Test depends on LRUCache-specific behaviors");
@@ -2619,11 +2619,11 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) {
   options.paranoid_file_checks = true;
   std::string dbname1 = test::PerThreadDBPath("db_t_1");
   ASSERT_OK(DestroyDB(dbname1, options));
-  DB* db1 = nullptr;
+  std::unique_ptr<DB> db1;
   ASSERT_OK(DB::Open(options, dbname1, &db1));
   std::string dbname2 = test::PerThreadDBPath("db_t_2");
   ASSERT_OK(DestroyDB(dbname2, options));
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   Options options2 = options;
   options2.lowest_used_cache_tier = CacheTier::kVolatileTier;
   ASSERT_OK(DB::Open(options2, dbname2, &db2));
@@ -2700,8 +2700,8 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) {
 
   fault_fs_->SetFailGetUniqueId(false);
   fault_fs_->SetFilesystemActive(true);
-  delete db1;
-  delete db2;
+  db1.reset();
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname1, options));
   ASSERT_OK(DestroyDB(dbname2, options));
 }
diff --git a/cache/secondary_cache_adapter.cc b/cache/secondary_cache_adapter.cc
index 57a77bc7fcb0..c02e31227308 100644
--- a/cache/secondary_cache_adapter.cc
+++ b/cache/secondary_cache_adapter.cc
@@ -33,7 +33,7 @@ const char* kTieredCacheName = "TieredCache";
 // proportionally across the primary/secondary caches.
 //
 // The primary block cache is initially sized to the sum of the primary cache
-// budget + teh secondary cache budget, as follows -
+// budget + the secondary cache budget, as follows -
 //   |---------    Primary Cache Configured Capacity  -----------|
 //   |---Secondary Cache Budget----|----Primary Cache Budget-----|
 //
@@ -51,7 +51,7 @@ const char* kTieredCacheName = "TieredCache";
 // placeholder is counted against the primary cache. To compensate and count
 // a portion of it against the secondary cache, the secondary cache Deflate()
 // method is called to shrink it. Since the Deflate() causes the secondary
-// actual usage to shrink, it is refelcted here by releasing an equal amount
+// actual usage to shrink, it is reflected here by releasing an equal amount
 // from the pri_cache_res_ reservation. The Deflate() in the secondary cache
 // can be, but is not required to be, implemented using its own cache
 // reservation manager.
@@ -72,7 +72,7 @@ const char* kTieredCacheName = "TieredCache";
 // reservation is increased by an equal amount.
 //
 // Another way of implementing this would have been to simply split the user
-// reservation into primary and seconary components. However, this would
+// reservation into primary and secondary components. However, this would
 // require allocating a structure to track the associated secondary cache
 // reservation, which adds some complexity and overhead.
 //
@@ -121,7 +121,14 @@ CacheWithSecondaryAdapter::~CacheWithSecondaryAdapter() {
     assert(s.ok());
     assert(placeholder_usage_ == 0);
     assert(reserved_usage_ == 0);
-    assert(pri_cache_res_->GetTotalMemoryUsed() == sec_capacity);
+    if (pri_cache_res_->GetTotalMemoryUsed() != sec_capacity) {
+      fprintf(stdout,
+              "~CacheWithSecondaryAdapter: Primary cache reservation: "
+              "%zu, Secondary cache capacity: %zu, "
+              "Secondary cache reserved: %zu\n",
+              pri_cache_res_->GetTotalMemoryUsed(), sec_capacity,
+              sec_reserved_);
+    }
   }
 #endif  // NDEBUG
 }
@@ -479,12 +486,10 @@ const char* CacheWithSecondaryAdapter::Name() const {
 // as well. At the moment, we don't have a good way of handling the case
 // where the new capacity < total cache reservations.
 void CacheWithSecondaryAdapter::SetCapacity(size_t capacity) {
-  size_t sec_capacity = static_cast<size_t>(
-      capacity * (distribute_cache_res_ ? sec_cache_res_ratio_ : 0.0));
-  size_t old_sec_capacity = 0;
-
   if (distribute_cache_res_) {
     MutexLock m(&cache_res_mutex_);
+    size_t sec_capacity = static_cast<size_t>(capacity * sec_cache_res_ratio_);
+    size_t old_sec_capacity = 0;
 
     Status s = secondary_cache_->GetCapacity(old_sec_capacity);
     if (!s.ok()) {
@@ -579,7 +584,7 @@ Status CacheWithSecondaryAdapter::UpdateCacheReservationRatio(
   size_t pri_capacity = target_->GetCapacity();
   size_t sec_capacity =
       static_cast<size_t>(pri_capacity * compressed_secondary_ratio);
-  size_t old_sec_capacity;
+  size_t old_sec_capacity = 0;
   Status s = secondary_cache_->GetCapacity(old_sec_capacity);
   if (!s.ok()) {
     return s;
@@ -603,6 +608,7 @@ Status CacheWithSecondaryAdapter::UpdateCacheReservationRatio(
     //    cache utilization (increase in capacity - increase in share of cache
     //    reservation)
     // 3. Increase secondary cache capacity
+    assert(new_sec_reserved >= sec_reserved_);
     s = secondary_cache_->Deflate(new_sec_reserved - sec_reserved_);
     assert(s.ok());
     s = pri_cache_res_->UpdateCacheReservation(
@@ -615,7 +621,7 @@ Status CacheWithSecondaryAdapter::UpdateCacheReservationRatio(
   } else {
     // We're shrinking the ratio. Try to avoid unnecessary evictions -
     // 1. Lower the secondary cache capacity
-    // 2. Decrease pri_cache_res_ reservation to relect lower secondary
+    // 2. Decrease pri_cache_res_ reservation to reflect lower secondary
     //    cache utilization (decrease in capacity - decrease in share of cache
     //    reservations)
     // 3. Inflate the secondary cache to give it back the reduction in its
diff --git a/ccache_msvc_compiler.bat b/ccache_msvc_compiler.bat
new file mode 100644
index 000000000000..9501ec592bc4
--- /dev/null
+++ b/ccache_msvc_compiler.bat
@@ -0,0 +1 @@
+ccache.exe cl.exe %*
diff --git a/claude_md/add_option.md b/claude_md/add_option.md
new file mode 100644
index 000000000000..77caa1dbeeeb
--- /dev/null
+++ b/claude_md/add_option.md
@@ -0,0 +1,512 @@
+# Adding New Options to RocksDB Public API
+
+This document provides guidance on how to add new options to RocksDB's public API. There are two main categories of options:
+
+1. **Standard Column Family Options** (Options/DBOptions/AdvancedColumnFamilyOptions)
+2. **BlockBasedTableOptions** (options specific to block-based table format)
+
+## Overview of Files to Modify
+
+### For Standard Column Family Options
+
+| File | Purpose |
+|------|---------|
+| `include/rocksdb/advanced_options.h` | Define the option with documentation |
+| `include/rocksdb/options.h` | Add reference in related option groups if needed |
+| `options/cf_options.h` | Add to `MutableCFOptions` or `ImmutableCFOptions` struct |
+| `options/cf_options.cc` | Register option for serialization/deserialization and logging |
+| `options/options_helper.cc` | Add to `UpdateColumnFamilyOptions()` for mutable options |
+| `options/options_settable_test.cc` | Add to test string for option parsing |
+| `db_stress_tool/db_stress_common.h` | Declare gflag |
+| `db_stress_tool/db_stress_gflags.cc` | Define gflag with default value |
+| `db_stress_tool/db_stress_test_base.cc` | Apply flag to options |
+| `tools/db_bench_tool.cc` | Add flag definition and apply to options |
+| `tools/db_crashtest.py` | Add randomized values for stress testing |
+| `unreleased_history/new_features/` | Add release note markdown file |
+
+### For BlockBasedTableOptions
+
+| File | Purpose |
+|------|---------|
+| `include/rocksdb/table.h` | Define the option in `BlockBasedTableOptions` struct |
+| `table/block_based/block_based_table_factory.cc` | Register for serialization, validation, and printing |
+| `options/options_settable_test.cc` | Add to `BlockBasedTableOptionsAllFieldsSettable` test |
+| `options/options_test.cc` | Add to `MutableCFOptions` test if applicable |
+| `db_stress_tool/db_stress_common.h` | Declare gflag |
+| `db_stress_tool/db_stress_gflags.cc` | Define gflag |
+| `db_stress_tool/db_stress_test_base.cc` | Apply flag to `block_based_options` |
+| `tools/db_bench_tool.cc` | Add flag definition and apply to `block_based_options` |
+| `tools/db_crashtest.py` | Add randomized values |
+| `java/src/main/java/org/rocksdb/BlockBasedTableConfig.java` | Java API |
+| `java/rocksjni/portal.h` | JNI portal for Java bindings |
+| `java/rocksjni/table.cc` | JNI implementation |
+| `java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java` | Java unit test |
+
+---
+
+## Pattern 1: Adding a Standard Column Family Option
+
+Example reference: commit `94e65a2e0b4f817aa4bfa4c96cdf867e7980d7bc` (memtable_veirfy_per_key_checksum_on_seek)
+
+### Step 1: Define the Option in Public Header
+
+**File: `include/rocksdb/advanced_options.h`**
+
+Add the option with documentation in `AdvancedColumnFamilyOptions` struct:
+
+```cpp
+// Enables additional integrity checks during seek.
+// Specifically, for skiplist-based memtables, key checksum validation could
+// be enabled during seek optionally. This is helpful to detect corrupted
+// memtable keys during reads. Enabling this feature incurs a performance
+// overhead due to additional key checksum validation during memtable seek
+// operation.
+// This option depends on memtable_protection_bytes_per_key to be non zero.
+// If memtable_protection_bytes_per_key is zero, no validation is performed.
+bool memtable_veirfy_per_key_checksum_on_seek = false;
+```
+
+### Step 2: Add to Internal Options Structs
+
+**File: `options/cf_options.h`**
+
+Add to `MutableCFOptions` struct (or `ImmutableCFOptions` for immutable options):
+
+```cpp
+// In MutableCFOptions constructor from Options:
+memtable_veirfy_per_key_checksum_on_seek(
+    options.memtable_veirfy_per_key_checksum_on_seek),
+
+// In MutableCFOptions default constructor:
+memtable_veirfy_per_key_checksum_on_seek(false),
+
+// In MutableCFOptions struct member declarations:
+bool memtable_veirfy_per_key_checksum_on_seek;
+```
+
+### Step 3: Register for Serialization/Deserialization
+
+**File: `options/cf_options.cc`**
+
+Add to the options type info map for serialization:
+
+```cpp
+{"memtable_veirfy_per_key_checksum_on_seek",
+ {offsetof(struct MutableCFOptions,
+           memtable_veirfy_per_key_checksum_on_seek),
+  OptionType::kBoolean, OptionVerificationType::kNormal,
+  OptionTypeFlags::kMutable}},
+```
+
+Add logging in `MutableCFOptions::Dump()`:
+
+```cpp
+ROCKS_LOG_INFO(log, "memtable_veirfy_per_key_checksum_on_seek: %d",
+               memtable_veirfy_per_key_checksum_on_seek);
+```
+
+### Step 4: Update Options Helper
+
+**File: `options/options_helper.cc`**
+
+Add to `UpdateColumnFamilyOptions()`:
+
+```cpp
+cf_opts->memtable_veirfy_per_key_checksum_on_seek =
+    moptions.memtable_veirfy_per_key_checksum_on_seek;
+```
+
+### Step 5: Add to Options Settable Test
+
+**File: `options/options_settable_test.cc`**
+
+Add to the test string in `ColumnFamilyOptionsAllFieldsSettable`:
+
+```cpp
+"memtable_veirfy_per_key_checksum_on_seek=1;"
+```
+
+### Step 6: Add db_stress Support
+
+**File: `db_stress_tool/db_stress_common.h`**
+
+```cpp
+DECLARE_bool(memtable_veirfy_per_key_checksum_on_seek);
+```
+
+**File: `db_stress_tool/db_stress_gflags.cc`**
+
+```cpp
+DEFINE_bool(
+    memtable_veirfy_per_key_checksum_on_seek,
+    ROCKSDB_NAMESPACE::Options().memtable_veirfy_per_key_checksum_on_seek,
+    "Sets CF option memtable_veirfy_per_key_checksum_on_seek.");
+```
+
+**File: `db_stress_tool/db_stress_test_base.cc`**
+
+```cpp
+options.memtable_veirfy_per_key_checksum_on_seek =
+    FLAGS_memtable_veirfy_per_key_checksum_on_seek;
+```
+
+### Step 7: Add db_bench Support
+
+**File: `tools/db_bench_tool.cc`**
+
+```cpp
+// Flag definition (near related flags):
+DEFINE_bool(memtable_veirfy_per_key_checksum_on_seek, false,
+            "Sets CF option memtable_veirfy_per_key_checksum_on_seek");
+
+// Apply flag to options (in InitializeOptionsFromFlags or similar):
+options.memtable_veirfy_per_key_checksum_on_seek =
+    FLAGS_memtable_veirfy_per_key_checksum_on_seek;
+```
+
+### Step 8: Add Crash Test Support
+
+**File: `tools/db_crashtest.py`**
+
+```python
+"memtable_veirfy_per_key_checksum_on_seek": lambda: random.choice([0] * 7 + [1]),
+```
+
+Also add constraint handling in `finalize_and_sanitize()` if needed:
+
+```python
+# only skip list memtable representation supports paranoid memory checks
+if dest_params.get("memtablerep") != "skip_list":
+    dest_params["memtable_veirfy_per_key_checksum_on_seek"] = 0
+```
+
+### Step 9: Add Release Note
+
+**File: `unreleased_history/new_features/<descriptive_name>.md`**
+
+```markdown
+A new flag memtable_veirfy_per_key_checksum_on_seek is added to AdvancedColumnFamilyOptions. When it is enabled, it will validate key checksum along the binary search path on skiplist based memtable during seek operation.
+```
+
+---
+
+## Pattern 2: Adding a BlockBasedTableOptions Option
+
+Example reference: commit `742741b175c5f238374c1714f9db3340d49de569` (super_block_alignment_size)
+
+### Step 1: Define the Option in Public Header
+
+**File: `include/rocksdb/table.h`**
+
+Add to `BlockBasedTableOptions` struct with documentation:
+
+```cpp
+// Align data blocks on super block alignment. Avoid a data block split across
+// super block boundaries. Works with/without compression.
+//
+// Here a "super block" refers to an aligned unit of underlying Filesystem
+// storage for which there is an extra cost when a random read involves two
+// such super blocks instead of just one. Configuring that size here suggests
+// inserting padding in the SST file to avoid a single SST block splitting
+// across two super blocks. Only power-of-two sizes are supported. See also
+// super_block_alignment_space_overhead_ratio. Default to 0, which means super
+// block alignment is disabled.
+size_t super_block_alignment_size = 0;
+
+// This option controls the storage space overhead of super block alignment.
+// It is used to calculate the max padding size allowed for super block
+// alignment. It is calculated in this way. If super_block_alignment_size is
+// 2MB, and super_block_alignment_overhead_ratio is 128, then the max padding
+// size allowed for super block alignment is 2MB / 128 = 16KB.
+// Note that, when it is set to 0, super block alignment is disabled.
+size_t super_block_alignment_space_overhead_ratio = 128;
+```
+
+### Step 2: Register for Serialization in Table Factory
+
+**File: `table/block_based/block_based_table_factory.cc`**
+
+Add to the type info map:
+
+```cpp
+{"super_block_alignment_size",
+ {offsetof(struct BlockBasedTableOptions, super_block_alignment_size),
+  OptionType::kSizeT, OptionVerificationType::kNormal}},
+{"super_block_alignment_space_overhead_ratio",
+ {offsetof(struct BlockBasedTableOptions,
+           super_block_alignment_space_overhead_ratio),
+  OptionType::kSizeT, OptionVerificationType::kNormal}},
+```
+
+Add validation in `ValidateOptions()`:
+
+```cpp
+if ((table_options_.super_block_alignment_size &
+     (table_options_.super_block_alignment_size - 1))) {
+  return Status::InvalidArgument(
+      "Super Block alignment requested but super block alignment size is not "
+      "a power of 2");
+}
+if (table_options_.super_block_alignment_size >
+    std::numeric_limits<uint32_t>::max()) {
+  return Status::InvalidArgument(
+      "Super block alignment size exceeds maximum number (4GiB) allowed");
+}
+```
+
+Add printing in `GetPrintableOptions()`:
+
+```cpp
+snprintf(buffer, kBufferSize,
+         "  super_block_alignment_size: %" ROCKSDB_PRIszt "\n",
+         table_options_.super_block_alignment_size);
+ret.append(buffer);
+```
+
+### Step 3: Add to Options Settable Test
+
+**File: `options/options_settable_test.cc`**
+
+Add to `BlockBasedTableOptionsAllFieldsSettable` test:
+
+```cpp
+"super_block_alignment_size=65536;"
+"super_block_alignment_space_overhead_ratio=4096;"
+```
+
+### Step 4: Add to Options Test
+
+**File: `options/options_test.cc`**
+
+```cpp
+ASSERT_OK(GetColumnFamilyOptionsFromString(
+    config_options, cf_opts,
+    "block_based_table_factory.super_block_alignment_size=65536; "
+    "block_based_table_factory.super_block_alignment_space_overhead_ratio=4096;",
+    &cf_opts));
+ASSERT_EQ(bbto->super_block_alignment_size, 65536);
+ASSERT_EQ(bbto->super_block_alignment_space_overhead_ratio, 4096);
+```
+
+### Step 5: Add db_stress Support
+
+**File: `db_stress_tool/db_stress_common.h`**
+
+```cpp
+DECLARE_uint64(super_block_alignment_size);
+DECLARE_uint64(super_block_alignment_space_overhead_ratio);
+```
+
+**File: `db_stress_tool/db_stress_gflags.cc`**
+
+```cpp
+DEFINE_uint64(
+    super_block_alignment_size,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().super_block_alignment_size,
+    "BlockBasedTableOptions.super_block_alignment_size");
+
+DEFINE_uint64(
+    super_block_alignment_space_overhead_ratio,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+        .super_block_alignment_space_overhead_ratio,
+    "BlockBasedTableOptions.super_block_alignment_space_overhead_ratio");
+```
+
+**File: `db_stress_tool/db_stress_test_base.cc`**
+
+```cpp
+block_based_options.super_block_alignment_size =
+    fLU64::FLAGS_super_block_alignment_size;
+block_based_options.super_block_alignment_space_overhead_ratio =
+    fLU64::FLAGS_super_block_alignment_space_overhead_ratio;
+```
+
+### Step 6: Add db_bench Support
+
+**File: `tools/db_bench_tool.cc`**
+
+```cpp
+// Flag definitions:
+DEFINE_uint64(
+    super_block_alignment_size,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().super_block_alignment_size,
+    "Configure super block size");
+
+DEFINE_uint64(super_block_alignment_space_overhead_ratio,
+              ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+                  .super_block_alignment_space_overhead_ratio,
+              "Configure space overhead for super block alignment");
+
+// Apply to block_based_options (in the block where other options are set):
+block_based_options.super_block_alignment_size = FLAGS_super_block_alignment_size;
+block_based_options.super_block_alignment_space_overhead_ratio =
+    FLAGS_super_block_alignment_space_overhead_ratio;
+```
+
+### Step 7: Add Crash Test Support
+
+**File: `tools/db_crashtest.py`**
+
+```python
+"super_block_alignment_size": lambda: random.choice(
+    [0, 128 * 1024, 512 * 1024, 2 * 1024 * 1024]
+),
+"super_block_alignment_space_overhead_ratio": lambda: random.choice([0, 32, 4096]),
+```
+
+### Step 8: Add Java API Support
+
+**File: `java/src/main/java/org/rocksdb/BlockBasedTableConfig.java`**
+
+Add getter and setter methods:
+
+```java
+/**
+ * Get the super block alignment size.
+ *
+ * @return the super block alignment size.
+ */
+public long superBlockAlignmentSize() {
+  return superBlockAlignmentSize;
+}
+
+/**
+ * Set the super block alignment size.
+ * When set to 0, super block alignment is disabled.
+ *
+ * @param superBlockAlignmentSize the super block alignment size.
+ *
+ * @return the reference to the current option.
+ */
+public BlockBasedTableConfig setSuperBlockAlignmentSize(final long superBlockAlignmentSize) {
+  this.superBlockAlignmentSize = superBlockAlignmentSize;
+  return this;
+}
+```
+
+Add member variable:
+
+```java
+private long superBlockAlignmentSize;
+```
+
+Update constructor and native method signature.
+
+**File: `java/rocksjni/portal.h`**
+
+Update `GetMethodID` signature and add fields to Java object construction.
+
+**File: `java/rocksjni/table.cc`**
+
+Add parameters to JNI function and apply to options.
+
+**File: `java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java`**
+
+Add unit tests:
+
+```java
+@Test
+public void superBlockAlignmentSize() {
+  final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+  blockBasedTableConfig.setSuperBlockAlignmentSize(1024 * 1024);
+  assertThat(blockBasedTableConfig.superBlockAlignmentSize()).isEqualTo(1024 * 1024);
+}
+```
+
+---
+
+## Pattern 3: Adding C API for Existing Option
+
+Example reference: commit `429b36c22d76403d275dd0e6877b08d4cea2bc90` (block_align C API)
+
+If an option already exists but needs C API support:
+
+**File: `db/c.cc`**
+
+```cpp
+void rocksdb_block_based_options_set_block_align(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.block_align = v;
+}
+```
+
+**File: `include/rocksdb/c.h`**
+
+```cpp
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_align(
+    rocksdb_block_based_table_options_t*, unsigned char);
+```
+
+---
+
+## Unit Testing Guidelines
+
+### For Standard Options
+
+Add tests in appropriate test files (e.g., `db/db_memtable_test.cc`, `db/db_options_test.cc`):
+
+```cpp
+TEST_F(DBMemTableTest, YourOptionTest) {
+  Options options;
+  options.your_new_option = true;
+  Reopen(options);
+  // Test the behavior
+}
+```
+
+### For BlockBasedTableOptions
+
+Add tests in `db/db_flush_test.cc`, `table/block_based/block_based_table_reader_test.cc`, or `table/table_test.cc`:
+
+```cpp
+TEST_P(DBFlushYourFeatureTest, YourFeature) {
+  Options options;
+  BlockBasedTableOptions block_options;
+  block_options.your_new_option = some_value;
+  options.table_factory.reset(NewBlockBasedTableFactory(block_options));
+
+  ASSERT_OK(options.table_factory->ValidateOptions(
+      DBOptions(options), ColumnFamilyOptions(options)));
+
+  Reopen(options);
+  // Test the behavior
+}
+```
+
+---
+
+## Option Type Reference
+
+Common option types used in serialization:
+
+| OptionType | C++ Type | Example |
+|------------|----------|---------|
+| `kBoolean` | `bool` | `paranoid_memory_checks` |
+| `kInt` | `int` | `max_write_buffer_number` |
+| `kInt32T` | `int32_t` | `level0_file_num_compaction_trigger` |
+| `kUInt32T` | `uint32_t` | `memtable_protection_bytes_per_key` |
+| `kUInt64T` | `uint64_t` | `target_file_size_base` |
+| `kSizeT` | `size_t` | `block_size` |
+| `kDouble` | `double` | `compression_ratio` |
+| `kString` | `std::string` | `db_log_dir` |
+
+---
+
+## Checklist Summary
+
+- [ ] Public header file with option definition and documentation
+- [ ] Internal options struct (MutableCFOptions or ImmutableCFOptions)
+- [ ] Options serialization/deserialization registration
+- [ ] Options logging in Dump() method
+- [ ] UpdateColumnFamilyOptions() for mutable options
+- [ ] options_settable_test.cc
+- [ ] db_stress_common.h (DECLARE)
+- [ ] db_stress_gflags.cc (DEFINE)
+- [ ] db_stress_test_base.cc (apply flag)
+- [ ] db_bench_tool.cc (DEFINE and apply)
+- [ ] db_crashtest.py (randomized values)
+- [ ] Unit tests
+- [ ] unreleased_history markdown file
+- [ ] Java API (for BlockBasedTableOptions)
+- [ ] C API (if needed)
+
diff --git a/claude_md/add_public_api.md b/claude_md/add_public_api.md
new file mode 100644
index 000000000000..684b89faeba5
--- /dev/null
+++ b/claude_md/add_public_api.md
@@ -0,0 +1,504 @@
+# RocksDB API Development Guide
+
+This document provides guidance for adding new public APIs to RocksDB, following the established patterns used by existing APIs like `CompactRange`.
+
+## API Layer Architecture
+
+RocksDB exposes public APIs through multiple layers. Users can access RocksDB through any of the three public APIs: C++ headers, C headers, or Java bindings.
+
+Here is an example for public header db.h:
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                     Level 1: Public APIs (User Entry Points)                │
+├───────────────────────┬─────────────────────────┬───────────────────────────┤
+│   C++ Public API      │     C API Bindings      │       Java/JNI API        │
+│ include/rocksdb/db.h  │   include/rocksdb/c.h   │ java/src/.../RocksDB.java │
+│ include/rocksdb/*.h   │                         │ java/src/.../*.java       │
+└───────────────────────┴────────────┬────────────┴───────────────────────────┘
+                                     ↓
+┌─────────────────────────────────────────────────────────────────────────────┐
+│              Level 2: C++ Implementation (Internal Core)                    │
+│              db/db_impl/db_impl*.cc, db/c.cc, java/rocksjni/*.cc            │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+## Step-by-Step Guide: Adding a New Public API
+
+### Step 1: Define the C++ Public Interface
+
+**File:** `include/rocksdb/db.h`
+
+Add the virtual method declaration in the `DB` class:
+
+\`\`\`cpp
+// Pure virtual - must be implemented by DBImpl
+virtual Status YourNewAPI(const YourAPIOptions& options,
+                          ColumnFamilyHandle* column_family,
+                          /* other params */) = 0;
+
+// Convenience overload for default column family
+virtual Status YourNewAPI(const YourAPIOptions& options,
+                          /* other params */) {
+  return YourNewAPI(options, DefaultColumnFamily(), /* other params */);
+}
+\`\`\`
+
+**Key Patterns:**
+- Use `Status` return type for error handling
+- Use `OptSlice` to avoid unnecessary levels of indirection and use of raw pointers.
+- Use `ColumnFamilyHandle*` for column family support
+- Provide convenience overloads for the default column family
+
+### Step 2: Define Options Struct (If Needed)
+
+**File:** `include/rocksdb/options.h`
+
+If your API has multiple configuration options, define an options struct:
+
+\`\`\`cpp
+struct YourAPIOptions {
+  // Document each option with clear comments
+  bool some_boolean_option = false;
+
+  // Default value explanation
+  int some_int_option = -1;
+
+  // Pointer options require careful lifetime management
+  std::atomic<bool>* canceled = nullptr;
+
+  // Enum options for multi-choice settings
+  YourEnumType some_enum = YourEnumType::kDefault;
+};
+\`\`\`
+
+**Key Patterns:**
+- Use sensible default values specified inline (e.g., `= false`, `= -1`)
+- Do NOT redundantly document the default value in comments; instead, document the rationale (why this default), historical context, and how different values are interpreted
+- Group related options logically
+- Consider thread-safety for pointer options
+
+### Step 3: Implement in DBImpl
+
+**Header:** `db/db_impl/db_impl.h`
+
+\`\`\`cpp
+using DB::YourNewAPI;
+Status YourNewAPI(const YourAPIOptions& options,
+                  ColumnFamilyHandle* column_family,
+                  /* other params */) override;
+
+// Private internal implementation if needed
+Status YourNewAPIInternal(const YourAPIOptions& options,
+                          ColumnFamilyHandle* column_family,
+                          /* other params */);
+\`\`\`
+
+**Implementation:** `db/db_impl/db_impl_<category>.cc`
+
+Choose the appropriate implementation file based on functionality:
+- `db_impl_compaction_flush.cc` - Compaction and flush operations
+- `db_impl_write.cc` - Write operations
+- `db_impl_open.cc` - DB opening/closing
+- `db_impl_files.cc` - File operations
+- `db_impl.cc` - General operations
+
+\`\`\`cpp
+Status DBImpl::YourNewAPI(const YourAPIOptions& options,
+                          ColumnFamilyHandle* column_family,
+                          /* other params */) {
+  // 1. Input validation
+  if (/* invalid input */) {
+    return Status::InvalidArgument("Error message");
+  }
+
+  // 2. Check for cancellation/abort conditions
+  if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  // 3. Get column family data
+  auto cfh = static_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+
+  // 4. Core implementation logic
+  // ...
+
+  return Status::OK();
+}
+\`\`\`
+
+### Step 4: Handle Special DB Types
+
+**StackableDB (Wrapper DBs):**
+**File:** `include/rocksdb/utilities/stackable_db.h`
+
+\`\`\`cpp
+using DB::YourNewAPI;
+Status YourNewAPI(const YourAPIOptions& options,
+                  ColumnFamilyHandle* column_family,
+                  /* other params */) override {
+  return db_->YourNewAPI(options, column_family, /* other params */);
+}
+\`\`\`
+
+**Secondary DB (Read-Only):**
+**File:** `db/db_impl/db_impl_secondary.h`
+
+\`\`\`cpp
+using DBImpl::YourNewAPI;
+Status YourNewAPI(const YourAPIOptions& /*options*/,
+                  ColumnFamilyHandle* /*column_family*/,
+                  /* other params */) override {
+  return Status::NotSupported("Not supported in secondary DB");
+}
+\`\`\`
+
+**CompactedDB (Read-Only):**
+**File:** `db/db_impl/compacted_db_impl.h`
+
+\`\`\`cpp
+using DBImpl::YourNewAPI;
+Status YourNewAPI(const YourAPIOptions& /*options*/,
+                  ColumnFamilyHandle* /*column_family*/,
+                  /* other params */) override {
+  return Status::NotSupported("Not supported for read-only DB");
+}
+\`\`\`
+
+### Step 5: Add C API Bindings
+
+**Header:** `include/rocksdb/c.h`
+
+\`\`\`c
+// Basic version
+extern ROCKSDB_LIBRARY_API void rocksdb_your_new_api(
+    rocksdb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
+// Column family version
+extern ROCKSDB_LIBRARY_API void rocksdb_your_new_api_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
+// With options and error handling
+extern ROCKSDB_LIBRARY_API void rocksdb_your_new_api_opt(
+    rocksdb_t* db, rocksdb_your_api_options_t* opt,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len,
+    char** errptr);
+\`\`\`
+
+**Implementation:** `db/c.cc`
+
+\`\`\`cpp
+void rocksdb_your_new_api(rocksdb_t* db, const char* start_key,
+                          size_t start_key_len, const char* limit_key,
+                          size_t limit_key_len) {
+  Slice a, b;
+  db->rep->YourNewAPI(
+      YourAPIOptions(),  // Default options
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_your_new_api_cf(rocksdb_t* db,
+                             rocksdb_column_family_handle_t* column_family,
+                             const char* start_key, size_t start_key_len,
+                             const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->YourNewAPI(
+      YourAPIOptions(),
+      column_family->rep,
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+\`\`\`
+
+**If you have options, also add:**
+
+\`\`\`cpp
+// Options struct wrapper
+struct rocksdb_your_api_options_t {
+  YourAPIOptions rep;
+};
+
+rocksdb_your_api_options_t* rocksdb_your_api_options_create() {
+  return new rocksdb_your_api_options_t;
+}
+
+void rocksdb_your_api_options_destroy(rocksdb_your_api_options_t* opt) {
+  delete opt;
+}
+
+void rocksdb_your_api_options_set_some_option(
+    rocksdb_your_api_options_t* opt, unsigned char value) {
+  opt->rep.some_boolean_option = value;
+}
+\`\`\`
+
+### Step 6: Add Java Bindings
+
+**Java API:** `java/src/main/java/org/rocksdb/RocksDB.java`
+
+\`\`\`java
+// Basic version
+public void yourNewAPI() throws RocksDBException {
+  yourNewAPI(null);
+}
+
+// Column family version
+public void yourNewAPI(ColumnFamilyHandle columnFamilyHandle)
+    throws RocksDBException {
+  yourNewAPI(nativeHandle_, null, -1, null, -1, 0,
+      columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+}
+
+// Range version
+public void yourNewAPI(final byte[] begin, final byte[] end)
+    throws RocksDBException {
+  yourNewAPI(null, begin, end);
+}
+
+// Full-featured version with options
+public void yourNewAPI(ColumnFamilyHandle columnFamilyHandle,
+                       final byte[] begin, final byte[] end,
+                       final YourAPIOptions options)
+    throws RocksDBException {
+  yourNewAPI(nativeHandle_,
+      begin, begin == null ? -1 : begin.length,
+      end, end == null ? -1 : end.length,
+      options.nativeHandle_,
+      columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+}
+
+// Native method declaration
+private static native void yourNewAPI(final long handle,
+    /* @Nullable */ final byte[] begin, final int beginLen,
+    /* @Nullable */ final byte[] end, final int endLen,
+    final long optionsHandle,
+    final long cfHandle);
+\`\`\`
+
+**Options Class:** `java/src/main/java/org/rocksdb/YourAPIOptions.java`
+
+\`\`\`java
+public class YourAPIOptions extends RocksObject {
+
+  public YourAPIOptions() {
+    super(newYourAPIOptions());
+  }
+
+  // Builder pattern setters
+  public YourAPIOptions setSomeBooleanOption(boolean value) {
+    setSomeBooleanOption(nativeHandle_, value);
+    return this;
+  }
+
+  // Getters
+  public boolean someBooleanOption() {
+    return someBooleanOption(nativeHandle_);
+  }
+
+  // Native method declarations
+  private static native long newYourAPIOptions();
+  private static native void disposeInternalJni(long handle);
+  private static native void setSomeBooleanOption(long handle, boolean value);
+  private static native boolean someBooleanOption(long handle);
+
+  @Override
+  protected final void disposeInternal(final long handle) {
+    disposeInternalJni(handle);
+  }
+}
+\`\`\`
+
+**JNI Implementation:** `java/rocksjni/rocksjni.cc`
+
+\`\`\`cpp
+void Java_org_rocksdb_RocksDB_yourNewAPI(
+    JNIEnv* env, jclass,
+    jlong jdb_handle, jbyteArray jbegin, jint jbegin_len,
+    jbyteArray jend, jint jend_len,
+    jlong joptions_handle, jlong jcf_handle) {
+
+  // 1. Convert Java byte arrays to C++ strings
+  jboolean has_exception = JNI_FALSE;
+  std::string str_begin;
+  if (jbegin_len > 0) {
+    str_begin = ROCKSDB_NAMESPACE::JniUtil::byteString<std::string>(
+        env, jbegin, jbegin_len,
+        [](const char* str, const size_t len) { return std::string(str, len); },
+        &has_exception);
+    if (has_exception == JNI_TRUE) return;
+  }
+
+  std::string str_end;
+  if (jend_len > 0) {
+    str_end = ROCKSDB_NAMESPACE::JniUtil::byteString<std::string>(
+        env, jend, jend_len,
+        [](const char* str, const size_t len) { return std::string(str, len); },
+        &has_exception);
+    if (has_exception == JNI_TRUE) return;
+  }
+
+  // 2. Get or create options
+  ROCKSDB_NAMESPACE::YourAPIOptions* options = nullptr;
+  if (joptions_handle == 0) {
+    options = new ROCKSDB_NAMESPACE::YourAPIOptions();
+  } else {
+    options = reinterpret_cast<ROCKSDB_NAMESPACE::YourAPIOptions*>(joptions_handle);
+  }
+
+  // 3. Unwrap handles
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle =
+      jcf_handle == 0 ? db->DefaultColumnFamily()
+                      : reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+
+  // 4. Create Slices
+  std::unique_ptr<ROCKSDB_NAMESPACE::Slice> begin;
+  std::unique_ptr<ROCKSDB_NAMESPACE::Slice> end;
+  if (jbegin_len > 0) begin.reset(new ROCKSDB_NAMESPACE::Slice(str_begin));
+  if (jend_len > 0) end.reset(new ROCKSDB_NAMESPACE::Slice(str_end));
+
+  // 5. Call C++ API
+  ROCKSDB_NAMESPACE::Status s = db->YourNewAPI(*options, cf_handle, begin.get(), end.get());
+
+  // 6. Cleanup if we created options
+  if (joptions_handle == 0) delete options;
+
+  // 7. Throw Java exception on error
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+\`\`\`
+
+**Options JNI:** `java/rocksjni/your_api_options.cc`
+
+\`\`\`cpp
+jlong Java_org_rocksdb_YourAPIOptions_newYourAPIOptions(JNIEnv*, jclass) {
+  auto* options = new ROCKSDB_NAMESPACE::YourAPIOptions();
+  return GET_CPLUSPLUS_POINTER(options);
+}
+
+void Java_org_rocksdb_YourAPIOptions_disposeInternalJni(JNIEnv*, jclass, jlong jhandle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::YourAPIOptions*>(jhandle);
+  delete options;
+}
+
+void Java_org_rocksdb_YourAPIOptions_setSomeBooleanOption(
+    JNIEnv*, jclass, jlong jhandle, jboolean value) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::YourAPIOptions*>(jhandle);
+  options->some_boolean_option = static_cast<bool>(value);
+}
+
+jboolean Java_org_rocksdb_YourAPIOptions_someBooleanOption(JNIEnv*, jclass, jlong jhandle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::YourAPIOptions*>(jhandle);
+  return static_cast<jboolean>(options->some_boolean_option);
+}
+\`\`\`
+
+### Step 7: Update Build Files
+
+**Java CMakeLists.txt:** `java/CMakeLists.txt`
+
+Add your new Java source files:
+\`\`\`cmake
+src/main/java/org/rocksdb/YourAPIOptions.java
+src/test/java/org/rocksdb/YourAPIOptionsTest.java
+\`\`\`
+
+### Step 8: Add Release Notes
+
+**Directory:** `unreleased_history/`
+
+RocksDB uses individual files in the `unreleased_history/` directory rather than directly editing `HISTORY.md`. This avoids merge conflicts and ensures changes are attributed to the correct release version.
+
+Add a file to the appropriate subdirectory:
+- `unreleased_history/new_features/` - For new functionality
+- `unreleased_history/public_api_changes/` - For API changes
+- `unreleased_history/behavior_changes/` - For behavior modifications
+- `unreleased_history/bug_fixes/` - For bug fixes
+
+**Example:** `unreleased_history/new_features/your_new_api.md`
+
+\`\`\`markdown
+Added `YourNewAPI()` to support [describe functionality]. See `YourAPIOptions` for configuration.
+\`\`\`
+
+**Example:** `unreleased_history/public_api_changes/your_api_options.md`
+
+**Note:** Files should contain one line of markdown. The "* " prefix is automatically added if not included. These files are compiled into `HISTORY.md` during the release process.
+
+### Step 9: Add Tests
+
+**C++ Unit Tests:** `db/db_your_api_test.cc` or add to existing test file
+
+\`\`\`cpp
+TEST_F(DBTest, YourNewAPIBasic) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Setup test data
+  ASSERT_OK(Put(1, "key1", "value1"));
+  ASSERT_OK(Put(1, "key2", "value2"));
+
+  // Test your API
+  YourAPIOptions api_options;
+  api_options.some_boolean_option = true;
+  ASSERT_OK(db_->YourNewAPI(api_options, handles_[1], nullptr, nullptr));
+
+  // Verify results
+  // ...
+}
+\`\`\`
+
+**Java Tests:** `java/src/test/java/org/rocksdb/YourAPIOptionsTest.java`
+
+\`\`\`java
+public class YourAPIOptionsTest {
+  @Test
+  public void yourAPIOptions() {
+    try (final YourAPIOptions options = new YourAPIOptions()) {
+      assertFalse(options.someBooleanOption());
+      options.setSomeBooleanOption(true);
+      assertTrue(options.someBooleanOption());
+    }
+  }
+}
+\`\`\`
+
+## File Summary Checklist
+
+
+| Component | File(s) | Required |
+|-----------|---------|----------|
+| C++ Public Interface | `include/rocksdb/db.h` | ✓ |
+| Options Struct | `include/rocksdb/options.h` | If needed |
+| DBImpl Declaration | `db/db_impl/db_impl.h` | ✓ |
+| DBImpl Implementation | `db/db_impl/db_impl_*.cc` | ✓ |
+| StackableDB | `include/rocksdb/utilities/stackable_db.h` | ✓ |
+| Secondary DB | `db/db_impl/db_impl_secondary.h` | If not supported |
+| Compacted DB | `db/db_impl/compacted_db_impl.h` | If not supported |
+| C API Header | `include/rocksdb/c.h` | ✓ |
+| C API Implementation | `db/c.cc` | ✓ |
+| Java API | `java/src/main/java/org/rocksdb/RocksDB.java` | ✓ |
+| Java Options | `java/src/main/java/org/rocksdb/YourAPIOptions.java` | If needed |
+| JNI Implementation | `java/rocksjni/rocksjni.cc` | ✓ |
+| JNI Options | `java/rocksjni/your_api_options.cc` | If needed |
+| Java CMake | `java/CMakeLists.txt` | If new files |
+| Changelog | `unreleased_history/*.md` | ✓ |
+| C++ Tests | `db/db_*_test.cc` | ✓ |
+| Java Tests | `java/src/test/java/org/rocksdb/*Test.java` | ✓ |
+
+## Best Practices
+
+1. **Error Handling**: Always return `Status` objects in C++, throw exceptions in Java
+2. **Default Values**: Provide sensible defaults for all options
+3. **Documentation**: Add clear comments for all public methods and options
+4. **Column Family Support**: Always support column family operations
+5. **Thread Safety**: Document thread-safety guarantees
+6. **Backward Compatibility**: Avoid breaking existing API contracts
+7. **Testing**: Add comprehensive unit tests for all code paths
diff --git a/crash_test.mk b/crash_test.mk
index a71a55c15c73..02e15a862aae 100644
--- a/crash_test.mk
+++ b/crash_test.mk
@@ -8,21 +8,33 @@ DB_STRESS_CMD?=./db_stress
 include common.mk
 
 CRASHTEST_MAKE=$(MAKE) -f crash_test.mk
-CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)'
+CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)' --destroy_db_initially=1
 
 .PHONY: crash_test crash_test_with_atomic_flush crash_test_with_txn \
+	crash_test_with_wc_txn crash_test_with_wp_txn crash_test_with_wup_txn \
 	crash_test_with_best_efforts_recovery crash_test_with_ts \
+	crash_test_with_multiops_wc_txn \
+	crash_test_with_multiops_wp_txn \
+	crash_test_with_multiops_wup_txn \
+	crash_test_with_optimistic_txn \
+	crash_test_with_tiered_storage \
 	blackbox_crash_test blackbox_crash_test_with_atomic_flush \
+	blackbox_crash_test_with_wc_txn blackbox_crash_test_with_wp_txn \
+	blackbox_crash_test_with_wup_txn \
 	blackbox_crash_test_with_txn blackbox_crash_test_with_ts \
 	blackbox_crash_test_with_best_efforts_recovery \
-	whitebox_crash_test whitebox_crash_test_with_atomic_flush \
-	whitebox_crash_test_with_txn whitebox_crash_test_with_ts \
 	blackbox_crash_test_with_multiops_wc_txn \
 	blackbox_crash_test_with_multiops_wp_txn \
-	crash_test_with_tiered_storage blackbox_crash_test_with_tiered_storage \
-	whitebox_crash_test_with_tiered_storage \
-	whitebox_crash_test_with_optimistic_txn \
+	blackbox_crash_test_with_multiops_wup_txn \
 	blackbox_crash_test_with_optimistic_txn \
+	blackbox_crash_test_with_tiered_storage \
+	whitebox_crash_test whitebox_crash_test_with_atomic_flush \
+	whitebox_crash_test_with_wc_txn whitebox_crash_test_with_wp_txn \
+	whitebox_crash_test_with_wup_txn \
+	whitebox_crash_test_with_txn whitebox_crash_test_with_ts \
+	whitebox_crash_test_with_optimistic_txn \
+	whitebox_crash_test_with_tiered_storage \
+	crash_test_db_cleanup \
 
 crash_test: $(DB_STRESS_CMD)
 # Do not parallelize
@@ -34,10 +46,20 @@ crash_test_with_atomic_flush: $(DB_STRESS_CMD)
 	$(CRASHTEST_MAKE) whitebox_crash_test_with_atomic_flush
 	$(CRASHTEST_MAKE) blackbox_crash_test_with_atomic_flush
 
-crash_test_with_txn: $(DB_STRESS_CMD)
+crash_test_with_wc_txn: $(DB_STRESS_CMD)
 # Do not parallelize
-	$(CRASHTEST_MAKE) whitebox_crash_test_with_txn
-	$(CRASHTEST_MAKE) blackbox_crash_test_with_txn
+	$(CRASHTEST_MAKE) whitebox_crash_test_with_wc_txn
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_wc_txn
+
+crash_test_with_wp_txn: $(DB_STRESS_CMD)
+# Do not parallelize
+	$(CRASHTEST_MAKE) whitebox_crash_test_with_wp_txn
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_wp_txn
+
+crash_test_with_wup_txn: $(DB_STRESS_CMD)
+# Do not parallelize
+	$(CRASHTEST_MAKE) whitebox_crash_test_with_wup_txn
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_wup_txn
 
 crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
 # Do not parallelize
@@ -62,6 +84,9 @@ crash_test_with_multiops_wc_txn: $(DB_STRESS_CMD)
 crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD)
 	$(CRASHTEST_MAKE) blackbox_crash_test_with_multiops_wp_txn
 
+crash_test_with_multiops_wup_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_multiops_wup_txn
+
 blackbox_crash_test: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --simple blackbox $(CRASH_TEST_EXT_ARGS)
 	$(CRASHTEST_PY) blackbox $(CRASH_TEST_EXT_ARGS)
@@ -69,8 +94,14 @@ blackbox_crash_test: $(DB_STRESS_CMD)
 blackbox_crash_test_with_atomic_flush: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --cf_consistency blackbox $(CRASH_TEST_EXT_ARGS)
 
-blackbox_crash_test_with_txn: $(DB_STRESS_CMD)
-	$(CRASHTEST_PY) --txn blackbox $(CRASH_TEST_EXT_ARGS)
+blackbox_crash_test_with_wc_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn blackbox --txn_write_policy 0 $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_wp_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn blackbox --txn_write_policy 1 $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_wup_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn blackbox --txn_write_policy 2 $(CRASH_TEST_EXT_ARGS)
 
 blackbox_crash_test_with_best_efforts_recovery: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --test_best_efforts_recovery blackbox $(CRASH_TEST_EXT_ARGS)
@@ -79,10 +110,13 @@ blackbox_crash_test_with_ts: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --enable_ts blackbox $(CRASH_TEST_EXT_ARGS)
 
 blackbox_crash_test_with_multiops_wc_txn: $(DB_STRESS_CMD)
-	$(CRASHTEST_PY) --test_multiops_txn --write_policy write_committed blackbox $(CRASH_TEST_EXT_ARGS)
+	$(CRASHTEST_PY) --test_multiops_txn --txn_write_policy 0 blackbox $(CRASH_TEST_EXT_ARGS)
 
 blackbox_crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD)
-	$(CRASHTEST_PY) --test_multiops_txn --write_policy write_prepared blackbox $(CRASH_TEST_EXT_ARGS)
+	$(CRASHTEST_PY) --test_multiops_txn --txn_write_policy 1 blackbox $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_multiops_wup_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --test_multiops_txn --txn_write_policy 2 blackbox $(CRASH_TEST_EXT_ARGS)
 
 blackbox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --test_tiered_storage blackbox $(CRASH_TEST_EXT_ARGS)
@@ -104,9 +138,17 @@ whitebox_crash_test_with_atomic_flush: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --cf_consistency whitebox  --random_kill_odd \
       $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
 
-whitebox_crash_test_with_txn: $(DB_STRESS_CMD)
-	$(CRASHTEST_PY) --txn whitebox --random_kill_odd \
-      $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+whitebox_crash_test_with_wc_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn whitebox --txn_write_policy 0 \
+	  --random_kill_odd $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+
+whitebox_crash_test_with_wp_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn whitebox --txn_write_policy 1 \
+      --random_kill_odd $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+
+whitebox_crash_test_with_wup_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn whitebox --txn_write_policy 2 \
+      --random_kill_odd $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
 
 whitebox_crash_test_with_ts: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --enable_ts whitebox --random_kill_odd \
@@ -119,3 +161,11 @@ whitebox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
 whitebox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --optimistic_txn whitebox --random_kill_odd \
       $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+
+crash_test_db_cleanup: $(DB_STRESS_CMD)
+	$(DB_STRESS_CMD) --delete_dir_and_exit=$(TEST_TMPDIR)
+
+# Old names DEPRECATED
+crash_test_with_txn: crash_test_with_wc_txn
+whitebox_crash_test_with_txn: whitebox_crash_test_with_wc_txn
+blackbox_crash_test_with_txn: blackbox_crash_test_with_wc_txn
diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc
index 21fb15504061..96441d5d303e 100644
--- a/db/arena_wrapped_db_iter.cc
+++ b/db/arena_wrapped_db_iter.cc
@@ -42,9 +42,9 @@ Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
 void ArenaWrappedDBIter::Init(
     Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
     const MutableCFOptions& mutable_cf_options, const Version* version,
-    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
-    uint64_t version_number, ReadCallback* read_callback,
-    ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) {
+    const SequenceNumber& sequence, uint64_t version_number,
+    ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh,
+    bool expose_blob_index, bool allow_refresh, ReadOnlyMemTable* active_mem) {
   read_options_ = read_options;
   if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
                              FSSupportedOps::kAsyncIO)) {
@@ -52,15 +52,14 @@ void ArenaWrappedDBIter::Init(
   }
   read_options_.total_order_seek |= ioptions.prefix_seek_opt_in_only;
 
-  auto mem = arena_.AllocateAligned(sizeof(DBIter));
-  db_iter_ = new (mem) DBIter(env, read_options_, ioptions, mutable_cf_options,
-                              ioptions.user_comparator,
-                              /* iter */ nullptr, version, sequence, true,
-                              max_sequential_skip_in_iteration, read_callback,
-                              cfh, expose_blob_index);
+  db_iter_ = DBIter::NewIter(
+      env, read_options_, ioptions, mutable_cf_options,
+      ioptions.user_comparator, /*internal_iter=*/nullptr, version, sequence,
+      read_callback, active_mem, cfh, expose_blob_index, &arena_);
 
   sv_number_ = version_number;
   allow_refresh_ = allow_refresh;
+  allow_mark_memtable_for_flush_ = active_mem;
   memtable_range_tombstone_iter_ = nullptr;
 }
 
@@ -166,9 +165,8 @@ void ArenaWrappedDBIter::DoRefresh(const Snapshot* snapshot,
     read_callback_->Refresh(read_seq);
   }
   Init(env, read_options_, cfd->ioptions(), sv->mutable_cf_options, sv->current,
-       read_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations,
-       sv->version_number, read_callback_, cfh_, expose_blob_index_,
-       allow_refresh_);
+       read_seq, sv->version_number, read_callback_, cfh_, expose_blob_index_,
+       allow_refresh_, allow_mark_memtable_for_flush_ ? sv->mem : nullptr);
 
   InternalIterator* internal_iter = db_impl->NewInternalIterator(
       read_options_, cfd, sv, &arena_, read_seq,
@@ -253,20 +251,26 @@ Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) {
 }
 
 ArenaWrappedDBIter* NewArenaWrappedDbIterator(
-    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
-    const MutableCFOptions& mutable_cf_options, const Version* version,
-    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
-    uint64_t version_number, ReadCallback* read_callback,
-    ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) {
-  ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
-  iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence,
-             max_sequential_skip_in_iterations, version_number, read_callback,
-             cfh, expose_blob_index, allow_refresh);
+    Env* env, const ReadOptions& read_options, ColumnFamilyHandleImpl* cfh,
+    SuperVersion* sv, const SequenceNumber& sequence,
+    ReadCallback* read_callback, DBImpl* db_impl, bool expose_blob_index,
+    bool allow_refresh, bool allow_mark_memtable_for_flush) {
+  ArenaWrappedDBIter* db_iter = new ArenaWrappedDBIter();
+  db_iter->Init(env, read_options, cfh->cfd()->ioptions(),
+                sv->mutable_cf_options, sv->current, sequence,
+                sv->version_number, read_callback, cfh, expose_blob_index,
+                allow_refresh,
+                allow_mark_memtable_for_flush ? sv->mem : nullptr);
   if (cfh != nullptr && allow_refresh) {
-    iter->StoreRefreshInfo(cfh, read_callback, expose_blob_index);
+    db_iter->StoreRefreshInfo(cfh, read_callback, expose_blob_index);
   }
 
-  return iter;
+  InternalIterator* internal_iter = db_impl->NewInternalIterator(
+      db_iter->GetReadOptions(), cfh->cfd(), sv, db_iter->GetArena(), sequence,
+      /*allow_unprepared_value=*/true, db_iter);
+  db_iter->SetIterUnderDBIter(internal_iter);
+
+  return db_iter;
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h
index 801988bfca7b..26062497a0b7 100644
--- a/db/arena_wrapped_db_iter.h
+++ b/db/arena_wrapped_db_iter.h
@@ -19,7 +19,6 @@
 #include "options/cf_options.h"
 #include "rocksdb/db.h"
 #include "rocksdb/iterator.h"
-#include "util/autovector.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -99,13 +98,19 @@ class ArenaWrappedDBIter : public Iterator {
 
   bool PrepareValue() override { return db_iter_->PrepareValue(); }
 
+  void Prepare(const MultiScanArgs& scan_opts) override {
+    db_iter_->Prepare(scan_opts);
+  }
+
+  // FIXME: we could just pass SV in for mutable cf option, version and version
+  // number, but this is used by SstFileReader which does not have a SV.
   void Init(Env* env, const ReadOptions& read_options,
             const ImmutableOptions& ioptions,
             const MutableCFOptions& mutable_cf_options, const Version* version,
-            const SequenceNumber& sequence,
-            uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+            const SequenceNumber& sequence, uint64_t version_number,
             ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh,
-            bool expose_blob_index, bool allow_refresh);
+            bool expose_blob_index, bool allow_refresh,
+            ReadOnlyMemTable* active_mem);
 
   // Store some parameters so we can refresh the iterator at a later point
   // with these same params
@@ -128,20 +133,16 @@ class ArenaWrappedDBIter : public Iterator {
   ReadCallback* read_callback_;
   bool expose_blob_index_ = false;
   bool allow_refresh_ = true;
+  bool allow_mark_memtable_for_flush_ = true;
   // If this is nullptr, it means the mutable memtable does not contain range
   // tombstone when added under this DBIter.
   std::unique_ptr<TruncatedRangeDelIterator>* memtable_range_tombstone_iter_ =
       nullptr;
 };
 
-// Generate the arena wrapped iterator class.
-// `cfh` is used for reneweal. If left null, renewal will not
-// be supported.
 ArenaWrappedDBIter* NewArenaWrappedDbIterator(
-    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
-    const MutableCFOptions& mutable_cf_options, const Version* version,
-    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
-    uint64_t version_number, ReadCallback* read_callback,
-    ColumnFamilyHandleImpl* cfh = nullptr, bool expose_blob_index = false,
-    bool allow_refresh = true);
+    Env* env, const ReadOptions& read_options, ColumnFamilyHandleImpl* cfh,
+    SuperVersion* sv, const SequenceNumber& sequence,
+    ReadCallback* read_callback, DBImpl* db_impl, bool expose_blob_index,
+    bool allow_refresh, bool allow_mark_memtable_for_flush);
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc
index dceb90cee57a..5e71c8a38236 100644
--- a/db/blob/blob_file_builder.cc
+++ b/db/blob/blob_file_builder.cc
@@ -67,6 +67,16 @@ BlobFileBuilder::BlobFileBuilder(
       min_blob_size_(mutable_cf_options->min_blob_size),
       blob_file_size_(mutable_cf_options->blob_file_size),
       blob_compression_type_(mutable_cf_options->blob_compression_type),
+      // TODO: support most CompressionOptions with a new CF option
+      // blob_compression_opts
+      // TODO with schema change: support custom compression manager and options
+      // such as max_compressed_bytes_per_kb
+      // NOTE: returns nullptr for kNoCompression
+      blob_compressor_(GetBuiltinV2CompressionManager()->GetCompressor(
+          CompressionOptions{}, blob_compression_type_)),
+      blob_compressor_wa_(blob_compressor_
+                              ? blob_compressor_->ObtainWorkingArea()
+                              : Compressor::ManagedWorkingArea{}),
       prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache),
       file_options_(file_options),
       write_options_(write_options),
@@ -113,7 +123,7 @@ Status BlobFileBuilder::Add(const Slice& key, const Slice& value,
   }
 
   Slice blob = value;
-  std::string compressed_blob;
+  GrowableBuffer compressed_blob;
 
   {
     const Status s = CompressBlobIfNeeded(&blob, &compressed_blob);
@@ -188,10 +198,12 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() {
   }
 
   std::unique_ptr<FSWritableFile> file;
-
+  FileOptions fo_copy;
   {
     assert(file_options_);
-    Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_);
+    fo_copy = *file_options_;
+    fo_copy.write_hint = write_hint_;
+    Status s = NewWritableFile(fs_, blob_file_path, &file, fo_copy);
 
     TEST_SYNC_POINT_CALLBACK(
         "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s);
@@ -209,7 +221,9 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() {
 
   assert(file);
   file->SetIOPriority(write_options_->rate_limiter_priority);
-  file->SetWriteLifeTimeHint(write_hint_);
+  // Subsequent attempts to override the hint via SetWriteLifeTimeHint
+  // with the very same value will be ignored by the fs.
+  file->SetWriteLifeTimeHint(fo_copy.write_hint);
   FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types;
   Statistics* const statistics = immutable_options_->stats;
   std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
@@ -250,37 +264,27 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() {
 }
 
 Status BlobFileBuilder::CompressBlobIfNeeded(
-    Slice* blob, std::string* compressed_blob) const {
+    Slice* blob, GrowableBuffer* compressed_blob) const {
   assert(blob);
   assert(compressed_blob);
   assert(compressed_blob->empty());
   assert(immutable_options_);
 
-  if (blob_compression_type_ == kNoCompression) {
+  if (!blob_compressor_) {
+    assert(blob_compression_type_ == kNoCompression);
     return Status::OK();
   }
+  assert(blob_compression_type_ != kNoCompression);
 
-  // TODO: allow user CompressionOptions, including max_compressed_bytes_per_kb
-  CompressionOptions opts;
-  CompressionContext context(blob_compression_type_, opts);
-  constexpr uint64_t sample_for_compression = 0;
-
-  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                       blob_compression_type_, sample_for_compression);
-
-  constexpr uint32_t compression_format_version = 2;
+  // WART: always stored as compressed even when that increases the size.
 
-  bool success = false;
-
-  {
-    StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats,
-                         BLOB_DB_COMPRESSION_MICROS);
-    success =
-        CompressData(*blob, info, compression_format_version, compressed_blob);
-  }
-
-  if (!success) {
-    return Status::Corruption("Error compressing blob");
+  Status s;
+  StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats,
+                       BLOB_DB_COMPRESSION_MICROS);
+  s = LegacyForceBuiltinCompression(*blob_compressor_, &blob_compressor_wa_,
+                                    *blob, compressed_blob);
+  if (!s.ok()) {
+    return s;
   }
 
   *blob = Slice(*compressed_blob);
diff --git a/db/blob/blob_file_builder.h b/db/blob/blob_file_builder.h
index 6ba7181aa09f..95d55f6bd9b6 100644
--- a/db/blob/blob_file_builder.h
+++ b/db/blob/blob_file_builder.h
@@ -10,12 +10,14 @@
 #include <string>
 #include <vector>
 
+#include "rocksdb/advanced_compression.h"
 #include "rocksdb/advanced_options.h"
 #include "rocksdb/compression_type.h"
 #include "rocksdb/env.h"
 #include "rocksdb/options.h"
 #include "rocksdb/rocksdb_namespace.h"
 #include "rocksdb/types.h"
+#include "util/aligned_buffer.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -76,7 +78,8 @@ class BlobFileBuilder {
  private:
   bool IsBlobFileOpen() const;
   Status OpenBlobFileIfNeeded();
-  Status CompressBlobIfNeeded(Slice* blob, std::string* compressed_blob) const;
+  Status CompressBlobIfNeeded(Slice* blob,
+                              GrowableBuffer* compressed_blob) const;
   Status WriteBlobToFile(const Slice& key, const Slice& blob,
                          uint64_t* blob_file_number, uint64_t* blob_offset);
   Status CloseBlobFile();
@@ -91,6 +94,8 @@ class BlobFileBuilder {
   uint64_t min_blob_size_;
   uint64_t blob_file_size_;
   CompressionType blob_compression_type_;
+  std::unique_ptr<Compressor> blob_compressor_;
+  mutable Compressor::ManagedWorkingArea blob_compressor_wa_;
   PrepopulateBlobCache prepopulate_blob_cache_;
   const FileOptions* file_options_;
   const WriteOptions* write_options_;
diff --git a/db/blob/blob_file_builder_test.cc b/db/blob/blob_file_builder_test.cc
index 8a2ecff13a74..ad09238e2f4f 100644
--- a/db/blob/blob_file_builder_test.cc
+++ b/db/blob/blob_file_builder_test.cc
@@ -403,23 +403,19 @@ TEST_F(BlobFileBuilderTest, Compression) {
   ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
   ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
 
-  CompressionOptions opts;
-  CompressionContext context(kSnappyCompression, opts);
-  constexpr uint64_t sample_for_compression = 0;
-
-  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                       kSnappyCompression, sample_for_compression);
-
-  std::string compressed_value;
-  ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(),
-                              uncompressed_value.size(), &compressed_value));
+  auto compressor =
+      GetBuiltinV2CompressionManager()->GetCompressor({}, kSnappyCompression);
+  GrowableBuffer compressed_value;
+  ASSERT_OK(LegacyForceBuiltinCompression(*compressor, /*working_area=*/nullptr,
+                                          uncompressed_value,
+                                          &compressed_value));
 
   ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
             BlobLogRecord::kHeaderSize + key_size + compressed_value.size());
 
   // Verify the contents of the new blob file as well as the blob reference
   std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
-      {key, compressed_value}};
+      {key, compressed_value.AsSlice().ToString()}};
   std::vector<std::string> blob_indexes{blob_index};
 
   VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
@@ -458,11 +454,12 @@ TEST_F(BlobFileBuilderTest, CompressionError) {
       nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
       BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
 
-  SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue",
-                                        [](void* arg) {
-                                          bool* ret = static_cast<bool*>(arg);
-                                          *ret = false;
-                                        });
+  SyncPoint::GetInstance()->SetCallBack(
+      "LegacyForceBuiltinCompression:TamperWithStatus", [](void* arg) {
+        Status* ret = static_cast<Status*>(arg);
+        ASSERT_OK(*ret);
+        *ret = Status::Corruption("Tampered result");
+      });
   SyncPoint::GetInstance()->EnableProcessing();
 
   constexpr char key[] = "1";
@@ -470,7 +467,7 @@ TEST_F(BlobFileBuilderTest, CompressionError) {
 
   std::string blob_index;
 
-  ASSERT_TRUE(builder.Add(key, value, &blob_index).IsCorruption());
+  ASSERT_EQ(builder.Add(key, value, &blob_index).code(), Status::kCorruption);
 
   SyncPoint::GetInstance()->DisableProcessing();
   SyncPoint::GetInstance()->ClearAllCallBacks();
diff --git a/db/blob/blob_file_meta.h b/db/blob/blob_file_meta.h
index d7c8a124336d..2e47726f8d11 100644
--- a/db/blob/blob_file_meta.h
+++ b/db/blob/blob_file_meta.h
@@ -6,6 +6,7 @@
 #pragma once
 
 #include <cassert>
+#include <cstdint>
 #include <iosfwd>
 #include <memory>
 #include <string>
diff --git a/db/blob/blob_file_reader.cc b/db/blob/blob_file_reader.cc
index 0c30efbc119f..3f419c5a0814 100644
--- a/db/blob/blob_file_reader.cc
+++ b/db/blob/blob_file_reader.cc
@@ -17,10 +17,10 @@
 #include "rocksdb/file_system.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
+#include "table/format.h"
 #include "table/multiget_context.h"
 #include "test_util/sync_point.h"
 #include "util/compression.h"
-#include "util/crc32c.h"
 #include "util/stop_watch.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -69,9 +69,16 @@ Status BlobFileReader::Create(
     }
   }
 
-  blob_file_reader->reset(
-      new BlobFileReader(std::move(file_reader), file_size, compression_type,
-                         immutable_options.clock, statistics));
+  std::shared_ptr<Decompressor> decompressor;
+  if (compression_type != kNoCompression) {
+    // The blob format has always used compression format 2
+    decompressor = GetBuiltinV2CompressionManager()->GetDecompressorOptimizeFor(
+        compression_type);
+  }
+
+  blob_file_reader->reset(new BlobFileReader(
+      std::move(file_reader), file_size, compression_type,
+      std::move(decompressor), immutable_options.clock, statistics));
 
   return Status::OK();
 }
@@ -250,7 +257,8 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
   Status s;
 
   IOOptions io_options;
-  s = file_reader->PrepareIOOptions(read_options, io_options);
+  IODebugContext dbg;
+  s = file_reader->PrepareIOOptions(read_options, io_options, &dbg);
   if (!s.ok()) {
     return s;
   }
@@ -259,13 +267,13 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
     constexpr char* scratch = nullptr;
 
     s = file_reader->Read(io_options, read_offset, read_size, slice, scratch,
-                          aligned_buf);
+                          aligned_buf, &dbg);
   } else {
     buf->reset(new char[read_size]);
     constexpr AlignedBuf* aligned_scratch = nullptr;
 
     s = file_reader->Read(io_options, read_offset, read_size, slice, buf->get(),
-                          aligned_scratch);
+                          aligned_scratch, &dbg);
   }
 
   if (!s.ok()) {
@@ -281,11 +289,13 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
 
 BlobFileReader::BlobFileReader(
     std::unique_ptr<RandomAccessFileReader>&& file_reader, uint64_t file_size,
-    CompressionType compression_type, SystemClock* clock,
+    CompressionType compression_type,
+    std::shared_ptr<Decompressor> decompressor, SystemClock* clock,
     Statistics* statistics)
     : file_reader_(std::move(file_reader)),
       file_size_(file_size),
       compression_type_(compression_type),
+      decompressor_(std::move(decompressor)),
       clock_(clock),
       statistics_(statistics) {
   assert(file_reader_);
@@ -334,7 +344,8 @@ Status BlobFileReader::GetBlob(
     constexpr bool for_compaction = true;
 
     IOOptions io_options;
-    s = file_reader_->PrepareIOOptions(read_options, io_options);
+    IODebugContext dbg;
+    s = file_reader_->PrepareIOOptions(read_options, io_options, &dbg);
     if (!s.ok()) {
       return s;
     }
@@ -373,8 +384,9 @@ Status BlobFileReader::GetBlob(
   const Slice value_slice(record_slice.data() + adjustment, value_size);
 
   {
-    const Status s = UncompressBlobIfNeeded(
-        value_slice, compression_type, allocator, clock_, statistics_, result);
+    const Status s = UncompressBlobIfNeeded(value_slice, compression_type,
+                                            decompressor_.get(), allocator,
+                                            clock_, statistics_, result);
     if (!s.ok()) {
       return s;
     }
@@ -463,10 +475,11 @@ void BlobFileReader::MultiGetBlob(
   PERF_COUNTER_ADD(blob_read_count, num_blobs);
   PERF_COUNTER_ADD(blob_read_byte, total_len);
   IOOptions opts;
-  s = file_reader_->PrepareIOOptions(read_options, opts);
+  IODebugContext dbg;
+  s = file_reader_->PrepareIOOptions(read_options, opts, &dbg);
   if (s.ok()) {
     s = file_reader_->MultiRead(opts, read_reqs.data(), read_reqs.size(),
-                                direct_io ? &aligned_buf : nullptr);
+                                direct_io ? &aligned_buf : nullptr, &dbg);
   }
   if (!s.ok()) {
     for (auto& req : read_reqs) {
@@ -521,9 +534,9 @@ void BlobFileReader::MultiGetBlob(
 
     // Uncompress blob if needed
     Slice value_slice(record_slice.data() + adjustments[i], req->len);
-    *req->status =
-        UncompressBlobIfNeeded(value_slice, compression_type_, allocator,
-                               clock_, statistics_, &blob_reqs[i].second);
+    *req->status = UncompressBlobIfNeeded(
+        value_slice, compression_type_, decompressor_.get(), allocator, clock_,
+        statistics_, &blob_reqs[i].second);
     if (req->status->ok()) {
       total_bytes += record_slice.size();
     }
@@ -580,8 +593,8 @@ Status BlobFileReader::VerifyBlob(const Slice& record_slice,
 
 Status BlobFileReader::UncompressBlobIfNeeded(
     const Slice& value_slice, CompressionType compression_type,
-    MemoryAllocator* allocator, SystemClock* clock, Statistics* statistics,
-    std::unique_ptr<BlobContents>* result) {
+    Decompressor* decompressor, MemoryAllocator* allocator, SystemClock* clock,
+    Statistics* statistics, std::unique_ptr<BlobContents>* result) {
   assert(result);
 
   if (compression_type == kNoCompression) {
@@ -590,31 +603,33 @@ Status BlobFileReader::UncompressBlobIfNeeded(
     return Status::OK();
   }
 
-  UncompressionContext context(compression_type);
-  UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
-                         compression_type);
+  assert(decompressor);
+
+  Decompressor::Args args;
+  args.compression_type = compression_type;
+  args.compressed_data = value_slice;
 
-  size_t uncompressed_size = 0;
-  constexpr uint32_t compression_format_version = 2;
+  Status s = decompressor->ExtractUncompressedSize(args);
+  if (!s.ok()) {
+    return Status::Corruption(s.ToString());
+  }
 
-  CacheAllocationPtr output;
+  CacheAllocationPtr output = AllocateBlock(args.uncompressed_size, allocator);
 
   {
     PERF_TIMER_GUARD(blob_decompress_time);
     StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS);
-    output = UncompressData(info, value_slice.data(), value_slice.size(),
-                            &uncompressed_size, compression_format_version,
-                            allocator);
+    s = decompressor->DecompressBlock(args, output.get());
   }
 
   TEST_SYNC_POINT_CALLBACK(
-      "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output);
+      "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &s);
 
-  if (!output) {
-    return Status::Corruption("Unable to uncompress blob");
+  if (!s.ok()) {
+    return Status::Corruption(s.ToString());
   }
 
-  result->reset(new BlobContents(std::move(output), uncompressed_size));
+  result->reset(new BlobContents(std::move(output), args.uncompressed_size));
 
   return Status::OK();
 }
diff --git a/db/blob/blob_file_reader.h b/db/blob/blob_file_reader.h
index fa8aa501d45f..e13e3380302a 100644
--- a/db/blob/blob_file_reader.h
+++ b/db/blob/blob_file_reader.h
@@ -10,6 +10,7 @@
 
 #include "db/blob/blob_read_request.h"
 #include "file/random_access_file_reader.h"
+#include "rocksdb/advanced_compression.h"
 #include "rocksdb/compression_type.h"
 #include "rocksdb/rocksdb_namespace.h"
 #include "util/autovector.h"
@@ -64,7 +65,8 @@ class BlobFileReader {
  private:
   BlobFileReader(std::unique_ptr<RandomAccessFileReader>&& file_reader,
                  uint64_t file_size, CompressionType compression_type,
-                 SystemClock* clock, Statistics* statistics);
+                 std::shared_ptr<Decompressor> decompressor, SystemClock* clock,
+                 Statistics* statistics);
 
   static Status OpenFile(const ImmutableOptions& immutable_options,
                          const FileOptions& file_opts,
@@ -96,6 +98,7 @@ class BlobFileReader {
 
   static Status UncompressBlobIfNeeded(const Slice& value_slice,
                                        CompressionType compression_type,
+                                       Decompressor* decompressor,
                                        MemoryAllocator* allocator,
                                        SystemClock* clock,
                                        Statistics* statistics,
@@ -104,6 +107,7 @@ class BlobFileReader {
   std::unique_ptr<RandomAccessFileReader> file_reader_;
   uint64_t file_size_;
   CompressionType compression_type_;
+  std::shared_ptr<Decompressor> decompressor_;
   SystemClock* clock_;
   Statistics* statistics_;
 };
diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc
index 676cbed41e85..0e98d2619b02 100644
--- a/db/blob/blob_file_reader_test.cc
+++ b/db/blob/blob_file_reader_test.cc
@@ -65,7 +65,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
 
   ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header));
 
-  std::vector<std::string> compressed_blobs(num);
+  std::vector<GrowableBuffer> compressed_blobs(num);
   std::vector<Slice> blobs_to_write(num);
   if (kNoCompression == compression) {
     for (size_t i = 0; i < num; ++i) {
@@ -73,17 +73,13 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
       blob_sizes[i] = blobs[i].size();
     }
   } else {
-    CompressionOptions opts;
-    CompressionContext context(compression, opts);
-    constexpr uint64_t sample_for_compression = 0;
-    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                         compression, sample_for_compression);
-
-    constexpr uint32_t compression_format_version = 2;
+    auto compressor =
+        GetBuiltinV2CompressionManager()->GetCompressor({}, compression);
 
     for (size_t i = 0; i < num; ++i) {
-      ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
-                               &compressed_blobs[i]));
+      ASSERT_OK(LegacyForceBuiltinCompression(*compressor,
+                                              /*working_area=*/nullptr,
+                                              blobs[i], &compressed_blobs[i]));
       blobs_to_write[i] = compressed_blobs[i];
       blob_sizes[i] = compressed_blobs[i].size();
     }
@@ -810,11 +806,10 @@ TEST_F(BlobFileReaderTest, UncompressionError) {
 
   SyncPoint::GetInstance()->SetCallBack(
       "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) {
-        CacheAllocationPtr* const output =
-            static_cast<CacheAllocationPtr*>(arg);
-        assert(output);
+        auto* result = static_cast<Status*>(arg);
+        assert(result);
 
-        output->reset();
+        *result = Status::Corruption("Injected result");
       });
 
   SyncPoint::GetInstance()->EnableProcessing();
@@ -825,11 +820,12 @@ TEST_F(BlobFileReaderTest, UncompressionError) {
   std::unique_ptr<BlobContents> value;
   uint64_t bytes_read = 0;
 
-  ASSERT_TRUE(reader
-                  ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
-                            kSnappyCompression, prefetch_buffer, allocator,
-                            &value, &bytes_read)
-                  .IsCorruption());
+  ASSERT_EQ(reader
+                ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+                          kSnappyCompression, prefetch_buffer, allocator,
+                          &value, &bytes_read)
+                .code(),
+            Status::Code::kCorruption);
   ASSERT_EQ(value, nullptr);
   ASSERT_EQ(bytes_read, 0);
 
diff --git a/db/blob/blob_index.h b/db/blob/blob_index.h
index e9944d78448b..fda6f946a672 100644
--- a/db/blob/blob_index.h
+++ b/db/blob/blob_index.h
@@ -137,6 +137,18 @@ class BlobIndex {
     return oss.str();
   }
 
+  // Encode this blob index into dst based on its type.
+  void EncodeTo(std::string* dst) const {
+    if (IsInlined()) {
+      EncodeInlinedTTL(dst, expiration_, value_);
+    } else if (HasTTL()) {
+      EncodeBlobTTL(dst, expiration_, file_number_, offset_, size_,
+                    compression_);
+    } else {
+      EncodeBlob(dst, file_number_, offset_, size_, compression_);
+    }
+  }
+
   static void EncodeInlinedTTL(std::string* dst, uint64_t expiration,
                                const Slice& value) {
     assert(dst != nullptr);
diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc
index d0e9def7d8b8..07c47ee50256 100644
--- a/db/blob/blob_source_test.cc
+++ b/db/blob/blob_source_test.cc
@@ -67,7 +67,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
 
   ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header));
 
-  std::vector<std::string> compressed_blobs(num);
+  std::vector<GrowableBuffer> compressed_blobs(num);
   std::vector<Slice> blobs_to_write(num);
   if (kNoCompression == compression) {
     for (size_t i = 0; i < num; ++i) {
@@ -75,17 +75,13 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
       blob_sizes[i] = blobs[i].size();
     }
   } else {
-    CompressionOptions opts;
-    CompressionContext context(compression, opts);
-    constexpr uint64_t sample_for_compression = 0;
-    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                         compression, sample_for_compression);
-
-    constexpr uint32_t compression_format_version = 2;
+    auto compressor =
+        GetBuiltinV2CompressionManager()->GetCompressor({}, compression);
 
     for (size_t i = 0; i < num; ++i) {
-      ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
-                               &compressed_blobs[i]));
+      ASSERT_OK(LegacyForceBuiltinCompression(*compressor,
+                                              /*working_area=*/nullptr,
+                                              blobs[i], &compressed_blobs[i]));
       blobs_to_write[i] = compressed_blobs[i];
       blob_sizes[i] = compressed_blobs[i].size();
     }
diff --git a/db/builder.cc b/db/builder.cc
index 08a9fecc7278..0ca00a45bd5f 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -56,6 +56,18 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
   return tboptions.moptions.table_factory->NewTableBuilder(tboptions, file);
 }
 
+void ExtractTimestampFromTableProperties(const TableProperties& tp,
+                                         FileMetaData* meta) {
+  auto min_ts_iter = tp.user_collected_properties.find("rocksdb.timestamp_min");
+  if (min_ts_iter != tp.user_collected_properties.end()) {
+    meta->min_timestamp = min_ts_iter->second;
+  }
+  auto max_ts_iter = tp.user_collected_properties.find("rocksdb.timestamp_max");
+  if (max_ts_iter != tp.user_collected_properties.end()) {
+    meta->max_timestamp = max_ts_iter->second;
+  }
+}
+
 Status BuildTable(
     const std::string& dbname, VersionSet* versions,
     const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
@@ -74,8 +86,8 @@ Status BuildTable(
     EventLogger* event_logger, int job_id, TableProperties* table_properties,
     Env::WriteLifeTimeHint write_hint, const std::string* full_history_ts_low,
     BlobFileCompletionCallback* blob_callback, Version* version,
-    uint64_t* num_input_entries, uint64_t* memtable_payload_bytes,
-    uint64_t* memtable_garbage_bytes) {
+    uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes,
+    InternalStats::CompactionStats* flush_stats) {
   assert((tboptions.column_family_id ==
           TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
          tboptions.column_family_name.empty());
@@ -145,7 +157,9 @@ Status BuildTable(
       bool use_direct_writes = file_options.use_direct_writes;
       TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
 #endif  // !NDEBUG
-      IOStatus io_s = NewWritableFile(fs, fname, &file, file_options);
+      FileOptions fo_copy = file_options;
+      fo_copy.write_hint = write_hint;
+      IOStatus io_s = NewWritableFile(fs, fname, &file, fo_copy);
       assert(s.ok());
       s = io_s;
       if (io_status->ok()) {
@@ -163,7 +177,9 @@ Status BuildTable(
       table_file_created = true;
       FileTypeSet tmp_set = ioptions.checksum_handoff_file_types;
       file->SetIOPriority(tboptions.write_options.rate_limiter_priority);
-      file->SetWriteLifeTimeHint(write_hint);
+      // Subsequent attempts to override the hint via SetWriteLifeTimeHint
+      // with the very same value will be ignored by the fs.
+      file->SetWriteLifeTimeHint(fo_copy.write_hint);
       file_writer.reset(new WritableFileWriter(
           std::move(file), fname, file_options, ioptions.clock, io_tracer,
           ioptions.stats, Histograms::SST_WRITE_MICROS, ioptions.listeners,
@@ -197,8 +213,7 @@ Status BuildTable(
     CompactionIterator c_iter(
         iter, ucmp, &merge, kMaxSequenceNumber, &snapshots, earliest_snapshot,
         earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
-        ShouldReportDetailedTime(env, ioptions.stats),
-        true /* internal key corruption is not ok */, range_del_agg.get(),
+        ShouldReportDetailedTime(env, ioptions.stats), range_del_agg.get(),
         blob_file_builder.get(), ioptions.allow_data_in_errors,
         ioptions.enforce_single_del_contracts,
         /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
@@ -214,8 +229,7 @@ Status BuildTable(
       const Slice& key = c_iter.key();
       const Slice& value = c_iter.value();
       ParsedInternalKey ikey = c_iter.ikey();
-      key_after_flush_buf.assign(key.data(), key.size());
-      Slice key_after_flush = key_after_flush_buf;
+      Slice key_after_flush = key;
       Slice value_after_flush = value;
 
       if (ikey.type == kTypeValuePreferredSeqno) {
@@ -233,6 +247,7 @@ Status BuildTable(
               std::min(smallest_preferred_seqno, preferred_seqno);
         } else {
           // Cannot get a useful preferred seqno, convert it to a kTypeValue.
+          key_after_flush_buf.assign(key.data(), key.size());
           UpdateInternalKey(&key_after_flush_buf, ikey.sequence, kTypeValue);
           ikey = ParsedInternalKey(ikey.user_key, ikey.sequence, kTypeValue);
           key_after_flush = key_after_flush_buf;
@@ -249,6 +264,10 @@ Status BuildTable(
       }
       builder->Add(key_after_flush, value_after_flush);
 
+      if (flush_stats) {
+        flush_stats->num_output_records++;
+      }
+
       s = meta->UpdateBoundaries(key_after_flush, value_after_flush,
                                  ikey.sequence, ikey.type);
       if (!s.ok()) {
@@ -280,6 +299,9 @@ Status BuildTable(
         auto tombstone = range_del_it->Tombstone();
         std::pair<InternalKey, Slice> kv = tombstone.Serialize();
         builder->Add(kv.first.Encode(), kv.second);
+        if (flush_stats) {
+          flush_stats->num_output_records++;
+        }
         InternalKey tombstone_end = tombstone.SerializeEndKey();
         meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_,
                                        tboptions.internal_comparator);
@@ -301,9 +323,9 @@ Status BuildTable(
 
     TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
     const bool empty = builder->IsEmpty();
-    if (num_input_entries != nullptr) {
+    if (flush_stats) {
       assert(c_iter.HasNumInputEntryScanned());
-      *num_input_entries =
+      flush_stats->num_input_records =
           c_iter.NumInputEntryScanned() + num_unfragmented_tombstones;
     }
     if (!s.ok() || empty) {
@@ -330,6 +352,12 @@ Status BuildTable(
     }
 
     if (s.ok() && !empty) {
+      if (flush_stats) {
+        flush_stats->bytes_written_pre_comp = builder->PreCompressionSize();
+        // Add worker CPU micros here. Caller needs to add CPU micros from
+        // calling thread.
+        flush_stats->cpu_micros += builder->GetWorkerCPUMicros();
+      }
       uint64_t file_size = builder->FileSize();
       meta->fd.file_size = file_size;
       meta->tail_size = builder->GetTailSize();
@@ -339,6 +367,7 @@ Status BuildTable(
       assert(meta->fd.GetFileSize() > 0);
       tp = builder
                ->GetTableProperties();  // refresh now that builder is finished
+      ExtractTimestampFromTableProperties(tp, meta);
       if (memtable_payload_bytes != nullptr &&
           memtable_garbage_bytes != nullptr) {
         const CompactionIterationStats& ci_stats = c_iter.iter_stats();
diff --git a/db/builder.h b/db/builder.h
index 08dd5fcab001..9f83a6f5dc16 100644
--- a/db/builder.h
+++ b/db/builder.h
@@ -10,6 +10,7 @@
 #include <utility>
 #include <vector>
 
+#include "db/internal_stats.h"
 #include "db/range_tombstone_fragmenter.h"
 #include "db/seqno_to_time_mapping.h"
 #include "db/table_properties_collector.h"
@@ -34,13 +35,19 @@ class SnapshotChecker;
 class TableCache;
 class TableBuilder;
 class WritableFileWriter;
-class InternalStats;
 class BlobFileCompletionCallback;
 
 // Convenience function for NewTableBuilder on the embedded table_factory.
 TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
                               WritableFileWriter* file);
 
+// Extract min/max timestamps from table properties and populate FileMetaData.
+// This is used by both flush (BuildTable) and compaction (CompactionOutputs)
+// to populate timestamp range in FileMetaData from the TimestampTableProperties
+// collector output.
+void ExtractTimestampFromTableProperties(const TableProperties& tp,
+                                         FileMetaData* meta);
+
 // Build a Table file from the contents of *iter.  The generated file
 // will be named according to number specified in meta. On success, the rest of
 // *meta will be filled with metadata about the generated table.
@@ -49,6 +56,7 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
 //
 // @param column_family_name Name of the column family that is also identified
 //    by column_family_id, or empty string if unknown.
+// @param flush_stats treat flush as level 0 compaction in internal stats
 Status BuildTable(
     const std::string& dbname, VersionSet* versions,
     const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
@@ -69,8 +77,8 @@ Status BuildTable(
     Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
     const std::string* full_history_ts_low = nullptr,
     BlobFileCompletionCallback* blob_callback = nullptr,
-    Version* version = nullptr, uint64_t* num_input_entries = nullptr,
-    uint64_t* memtable_payload_bytes = nullptr,
-    uint64_t* memtable_garbage_bytes = nullptr);
+    Version* version = nullptr, uint64_t* memtable_payload_bytes = nullptr,
+    uint64_t* memtable_garbage_bytes = nullptr,
+    InternalStats::CompactionStats* flush_stats = nullptr);
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/c.cc b/db/c.cc
index b101540ffa1b..6e00a0761cf6 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -11,6 +11,7 @@
 
 #include <cstdlib>
 #include <map>
+#include <memory>
 #include <unordered_set>
 #include <vector>
 
@@ -24,12 +25,14 @@
 #include "rocksdb/experimental.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/iterator.h"
+#include "rocksdb/listener.h"
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/perf_context.h"
 #include "rocksdb/rate_limiter.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/sst_file_manager.h"
 #include "rocksdb/statistics.h"
 #include "rocksdb/status.h"
 #include "rocksdb/table.h"
@@ -49,6 +52,7 @@
 #include "util/stderr_logger.h"
 #include "utilities/merge_operators.h"
 
+using ROCKSDB_NAMESPACE::BackgroundErrorReason;
 using ROCKSDB_NAMESPACE::BackupEngine;
 using ROCKSDB_NAMESPACE::BackupEngineOptions;
 using ROCKSDB_NAMESPACE::BackupID;
@@ -65,7 +69,14 @@ using ROCKSDB_NAMESPACE::ColumnFamilyMetaData;
 using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
 using ROCKSDB_NAMESPACE::CompactionFilter;
 using ROCKSDB_NAMESPACE::CompactionFilterFactory;
+using ROCKSDB_NAMESPACE::CompactionJobInfo;
 using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
+using ROCKSDB_NAMESPACE::CompactionReason;
+using ROCKSDB_NAMESPACE::CompactionService;
+using ROCKSDB_NAMESPACE::CompactionServiceJobInfo;
+using ROCKSDB_NAMESPACE::CompactionServiceJobStatus;
+using ROCKSDB_NAMESPACE::CompactionServiceOptionsOverride;
+using ROCKSDB_NAMESPACE::CompactionServiceScheduleResponse;
 using ROCKSDB_NAMESPACE::CompactRangeOptions;
 using ROCKSDB_NAMESPACE::Comparator;
 using ROCKSDB_NAMESPACE::CompressionType;
@@ -76,11 +87,18 @@ using ROCKSDB_NAMESPACE::DBOptions;
 using ROCKSDB_NAMESPACE::DbPath;
 using ROCKSDB_NAMESPACE::Env;
 using ROCKSDB_NAMESPACE::EnvOptions;
+using ROCKSDB_NAMESPACE::EventListener;
+using ROCKSDB_NAMESPACE::ExportImportFilesMetaData;
+using ROCKSDB_NAMESPACE::ExternalFileIngestionInfo;
+using ROCKSDB_NAMESPACE::FileChecksumGenFactory;
 using ROCKSDB_NAMESPACE::FileLock;
 using ROCKSDB_NAMESPACE::FilterPolicy;
+using ROCKSDB_NAMESPACE::FlushJobInfo;
 using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory;
 using ROCKSDB_NAMESPACE::HistogramData;
 using ROCKSDB_NAMESPACE::HyperClockCacheOptions;
+using ROCKSDB_NAMESPACE::ImportColumnFamilyOptions;
 using ROCKSDB_NAMESPACE::InfoLogLevel;
 using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
 using ROCKSDB_NAMESPACE::Iterator;
@@ -90,12 +108,15 @@ using ROCKSDB_NAMESPACE::Logger;
 using ROCKSDB_NAMESPACE::LRUCacheOptions;
 using ROCKSDB_NAMESPACE::MemoryAllocator;
 using ROCKSDB_NAMESPACE::MemoryUtil;
+using ROCKSDB_NAMESPACE::MemTableInfo;
 using ROCKSDB_NAMESPACE::MergeOperator;
 using ROCKSDB_NAMESPACE::NewBloomFilterPolicy;
 using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory;
 using ROCKSDB_NAMESPACE::NewGenericRateLimiter;
 using ROCKSDB_NAMESPACE::NewLRUCache;
 using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy;
+using ROCKSDB_NAMESPACE::NewSstPartitionerFixedPrefixFactory;
+using ROCKSDB_NAMESPACE::OpenAndCompactOptions;
 using ROCKSDB_NAMESPACE::OptimisticTransactionDB;
 using ROCKSDB_NAMESPACE::OptimisticTransactionOptions;
 using ROCKSDB_NAMESPACE::Options;
@@ -113,10 +134,14 @@ using ROCKSDB_NAMESPACE::Slice;
 using ROCKSDB_NAMESPACE::SliceParts;
 using ROCKSDB_NAMESPACE::SliceTransform;
 using ROCKSDB_NAMESPACE::Snapshot;
+using ROCKSDB_NAMESPACE::SstFileManager;
 using ROCKSDB_NAMESPACE::SstFileMetaData;
 using ROCKSDB_NAMESPACE::SstFileWriter;
+using ROCKSDB_NAMESPACE::SstPartitionerFactory;
 using ROCKSDB_NAMESPACE::Status;
 using ROCKSDB_NAMESPACE::StderrLogger;
+using ROCKSDB_NAMESPACE::SubcompactionJobInfo;
+using ROCKSDB_NAMESPACE::TableFactory;
 using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory;
 using ROCKSDB_NAMESPACE::Transaction;
 using ROCKSDB_NAMESPACE::TransactionDB;
@@ -130,6 +155,8 @@ using ROCKSDB_NAMESPACE::WriteBatch;
 using ROCKSDB_NAMESPACE::WriteBatchWithIndex;
 using ROCKSDB_NAMESPACE::WriteBufferManager;
 using ROCKSDB_NAMESPACE::WriteOptions;
+using ROCKSDB_NAMESPACE::WriteStallCondition;
+using ROCKSDB_NAMESPACE::WriteStallInfo;
 
 using std::unordered_set;
 using std::vector;
@@ -139,6 +166,9 @@ extern "C" {
 struct rocksdb_t {
   DB* rep;
 };
+struct rocksdb_status_ptr_t {
+  Status* rep;
+};
 struct rocksdb_backup_engine_t {
   BackupEngine* rep;
 };
@@ -211,6 +241,15 @@ struct rocksdb_filelock_t {
 struct rocksdb_logger_t {
   std::shared_ptr<Logger> rep;
 };
+struct rocksdb_file_checksum_gen_factory_t {
+  std::shared_ptr<FileChecksumGenFactory> rep;
+};
+struct rocksdb_sst_partitioner_factory_t {
+  std::shared_ptr<SstPartitionerFactory> rep;
+};
+struct rocksdb_table_properties_collector_factory_t {
+  std::shared_ptr<TablePropertiesCollectorFactory> rep;
+};
 struct rocksdb_lru_cache_options_t {
   LRUCacheOptions rep;
 };
@@ -226,6 +265,12 @@ struct rocksdb_cache_t {
 struct rocksdb_write_buffer_manager_t {
   std::shared_ptr<WriteBufferManager> rep;
 };
+struct rocksdb_sst_file_manager_t {
+  std::shared_ptr<SstFileManager> rep;
+};
+struct rocksdb_livefile_t {
+  LiveFileMetaData rep;
+};
 struct rocksdb_livefiles_t {
   std::vector<LiveFileMetaData> rep;
 };
@@ -236,6 +281,12 @@ struct rocksdb_column_family_handle_t {
 struct rocksdb_column_family_metadata_t {
   ColumnFamilyMetaData rep;
 };
+struct rocksdb_export_import_files_metadata_t {
+  ExportImportFilesMetaData* rep;
+};
+struct rocksdb_import_column_family_options_t {
+  ImportColumnFamilyOptions rep;
+};
 struct rocksdb_level_metadata_t {
   const LevelMetaData* rep;
 };
@@ -292,11 +343,49 @@ struct rocksdb_compactionfiltercontext_t {
   CompactionFilter::Context rep;
 };
 
+struct rocksdb_flushjobinfo_t {
+  FlushJobInfo rep;
+};
+struct rocksdb_writestallcondition_t {
+  WriteStallCondition rep;
+};
+struct rocksdb_writestallinfo_t {
+  WriteStallInfo rep;
+};
+struct rocksdb_memtableinfo_t {
+  MemTableInfo rep;
+};
+struct rocksdb_compactionjobinfo_t {
+  CompactionJobInfo rep;
+};
+struct rocksdb_subcompactionjobinfo_t {
+  SubcompactionJobInfo rep;
+};
+struct rocksdb_externalfileingestioninfo_t {
+  ExternalFileIngestionInfo rep;
+};
+
 struct rocksdb_statistics_histogram_data_t {
   rocksdb_statistics_histogram_data_t() : rep() {}
   HistogramData rep;
 };
 
+struct rocksdb_compactionservice_scheduleresponse_t {
+  CompactionServiceScheduleResponse rep;
+};
+
+struct rocksdb_compactionservice_jobinfo_t {
+  CompactionServiceJobInfo rep;
+};
+
+struct rocksdb_compaction_service_options_override_t {
+  CompactionServiceOptionsOverride rep;
+};
+
+struct rocksdb_open_and_compact_options_t {
+  OpenAndCompactOptions rep;
+};
+
 struct rocksdb_compactionfilter_t : public CompactionFilter {
   void* state_;
   void (*destructor_)(void*);
@@ -507,7 +596,6 @@ struct rocksdb_slicetransform_t : public SliceTransform {
   char* (*transform_)(void*, const char* key, size_t length,
                       size_t* dst_length);
   unsigned char (*in_domain_)(void*, const char* key, size_t length);
-  unsigned char (*in_range_)(void*, const char* key, size_t length);
 
   ~rocksdb_slicetransform_t() override { (*destructor_)(state_); }
 
@@ -522,10 +610,6 @@ struct rocksdb_slicetransform_t : public SliceTransform {
   bool InDomain(const Slice& src) const override {
     return (*in_domain_)(state_, src.data(), src.size());
   }
-
-  bool InRange(const Slice& src) const override {
-    return (*in_range_)(state_, src.data(), src.size());
-  }
 };
 
 struct rocksdb_universal_compaction_options_t {
@@ -583,21 +667,563 @@ static bool SaveError(char** errptr, const Status& s) {
   return true;
 }
 
-// Copies str to a new malloc()-ed buffer. The buffer is not NUL terminated.
-static char* CopyString(const std::string& str) {
-  char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
-  memcpy(result, str.data(), sizeof(char) * str.size());
+// Helper function to copy string data to a malloc'd buffer
+// Works with std::string, Slice, and PinnableSlice through implicit conversion
+static inline char* CopyString(const Slice& slice) {
+  char* result = reinterpret_cast<char*>(malloc(slice.size()));
+  memcpy(result, slice.data(), slice.size());
+  return result;
+}
+
+const char* rocksdb_compactionservice_jobinfo_t_get_db_name(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len) {
+  *len = info->rep.db_name.size();
+  return info->rep.db_name.data();
+}
+
+const char* rocksdb_compactionservice_jobinfo_t_get_db_id(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len) {
+  *len = info->rep.db_id.size();
+  return info->rep.db_id.data();
+}
+
+const char* rocksdb_compactionservice_jobinfo_t_get_db_session_id(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len) {
+  *len = info->rep.db_session_id.size();
+  return info->rep.db_session_id.data();
+}
+
+const char* rocksdb_compactionservice_jobinfo_t_get_cf_name(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len) {
+  *len = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+uint32_t rocksdb_compactionservice_jobinfo_t_get_cf_id(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.cf_id;
+}
+
+uint64_t rocksdb_compactionservice_jobinfo_t_get_job_id(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.job_id;
+}
+
+int rocksdb_compactionservice_jobinfo_t_get_priority(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return static_cast<int>(info->rep.priority);
+}
+
+int rocksdb_compactionservice_jobinfo_t_get_compaction_reason(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return static_cast<int>(info->rep.compaction_reason);
+}
+
+int rocksdb_compactionservice_jobinfo_t_get_base_input_level(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.base_input_level;
+}
+
+int rocksdb_compactionservice_jobinfo_t_get_output_level(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.output_level;
+}
+
+unsigned char rocksdb_compactionservice_jobinfo_t_is_full_compaction(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.is_full_compaction;
+}
+
+unsigned char rocksdb_compactionservice_jobinfo_t_is_manual_compaction(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.is_manual_compaction;
+}
+
+unsigned char rocksdb_compactionservice_jobinfo_t_is_bottommost_level(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.bottommost_level;
+}
+
+// Helper function to validate compaction service job status
+static inline bool IsValidCompactionServiceJobStatus(int status) {
+  return status >= rocksdb_compactionservice_jobstatus_success &&
+         status <= rocksdb_compactionservice_jobstatus_use_local;
+}
+
+rocksdb_compactionservice_scheduleresponse_t*
+rocksdb_compactionservice_scheduleresponse_create(const char* scheduled_job_id,
+                                                  int status, char** errptr) {
+  // Validate status is in range [success=0, failure=1, aborted=2, use_local=3]
+  if (!IsValidCompactionServiceJobStatus(status)) {
+    SaveError(errptr,
+              Status::InvalidArgument("Invalid status value. Must be 0-3."));
+    return nullptr;
+  }
+
+  rocksdb_compactionservice_scheduleresponse_t* response =
+      new rocksdb_compactionservice_scheduleresponse_t{
+          CompactionServiceScheduleResponse(
+              scheduled_job_id ? std::string(scheduled_job_id) : "",
+              static_cast<CompactionServiceJobStatus>(status))};
+  return response;
+}
+
+rocksdb_compactionservice_scheduleresponse_t*
+rocksdb_compactionservice_scheduleresponse_create_with_status(int status,
+                                                              char** errptr) {
+  // Validate status is in range [success=0, failure=1, aborted=2, use_local=3]
+  if (!IsValidCompactionServiceJobStatus(status)) {
+    SaveError(errptr,
+              Status::InvalidArgument("Invalid status value. Must be 0-3."));
+    return nullptr;
+  }
+
+  rocksdb_compactionservice_scheduleresponse_t* response =
+      new rocksdb_compactionservice_scheduleresponse_t{
+          CompactionServiceScheduleResponse(
+              static_cast<CompactionServiceJobStatus>(status))};
+  return response;
+}
+
+void rocksdb_compactionservice_scheduleresponse_t_destroy(
+    rocksdb_compactionservice_scheduleresponse_t* response) {
+  if (response) {
+    delete response;
+  }
+}
+
+int rocksdb_compactionservice_scheduleresponse_getstatus(
+    const rocksdb_compactionservice_scheduleresponse_t* response) {
+  if (!response) {
+    return rocksdb_compactionservice_jobstatus_failure;
+  }
+  return static_cast<int>(response->rep.status);
+}
+
+const char* rocksdb_compactionservice_scheduleresponse_get_scheduled_job_id(
+    const rocksdb_compactionservice_scheduleresponse_t* response, size_t* len) {
+  if (!response || !len) {
+    if (len) {
+      *len = 0;
+    }
+    return "";
+  }
+  *len = response->rep.scheduled_job_id.size();
+  return response->rep.scheduled_job_id.data();
+}
+
+struct rocksdb_compactionservice_t : public CompactionService {
+  void* state_;
+  void (*destructor_)(void*);
+  rocksdb_compaction_service_schedule_cb schedule_;
+  std::string name_;
+  rocksdb_compaction_service_wait_cb wait_;
+  rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs_;
+  rocksdb_compaction_service_on_installation_cb on_installation_;
+
+  rocksdb_compactionservice_t(
+      void* state, void (*destructor)(void*),
+      rocksdb_compaction_service_schedule_cb
+          rocksdb_compaction_service_schedule_ptr,
+      const char* name, rocksdb_compaction_service_wait_cb wait,
+      rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs,
+      rocksdb_compaction_service_on_installation_cb on_installation)
+      : state_(state),
+        destructor_(destructor),
+        schedule_(rocksdb_compaction_service_schedule_ptr),
+        name_(name ? name : "CompactionService"),
+        wait_(wait),
+        cancel_awaiting_jobs_(cancel_awaiting_jobs),
+        on_installation_(on_installation) {}
+
+  ~rocksdb_compactionservice_t() override {
+    if (destructor_) {
+      (*destructor_)(state_);
+    }
+  }
+
+  const char* Name() const override { return name_.c_str(); }
+
+  CompactionServiceScheduleResponse Schedule(
+      const CompactionServiceJobInfo& info,
+      const std::string& compaction_service_input) override {
+    if (schedule_ == nullptr) {
+      return CompactionServiceScheduleResponse(
+          CompactionServiceJobStatus::kUseLocal);
+    }
+
+    rocksdb_compactionservice_scheduleresponse_t* c_response = (*schedule_)(
+        state_,
+        reinterpret_cast<const rocksdb_compactionservice_jobinfo_t*>(&info),
+        compaction_service_input.data(), compaction_service_input.size());
+
+    if (c_response == nullptr) {
+      return CompactionServiceScheduleResponse(
+          CompactionServiceJobStatus::kFailure);
+    }
+
+    CompactionServiceScheduleResponse response = std::move(c_response->rep);
+    delete c_response;
+    return response;
+  }
+
+  CompactionServiceJobStatus Wait(const std::string& scheduled_job_id,
+                                  std::string* result) override {
+    if (wait_ == nullptr) {
+      return CompactionServiceJobStatus::kUseLocal;
+    }
+
+    char* c_result = nullptr;
+    size_t result_len = 0;
+
+    int status =
+        (*wait_)(state_, scheduled_job_id.c_str(), &c_result, &result_len);
+
+    if (c_result != nullptr) {
+      if (result != nullptr) {
+        result->assign(c_result, result_len);
+      }
+      free(c_result);
+    }
+
+    return static_cast<CompactionServiceJobStatus>(status);
+  }
+
+  void CancelAwaitingJobs() override {
+    if (cancel_awaiting_jobs_ != nullptr) {
+      (*cancel_awaiting_jobs_)(state_);
+    }
+  }
+
+  void OnInstallation(const std::string& scheduled_job_id,
+                      CompactionServiceJobStatus status) override {
+    if (on_installation_ != nullptr) {
+      (*on_installation_)(state_, scheduled_job_id.c_str(),
+                          static_cast<int>(status));
+    }
+  }
+};
+
+rocksdb_compactionservice_t* rocksdb_compactionservice_create(
+    void* state, void (*destructor)(void*),
+    rocksdb_compaction_service_schedule_cb schedule, const char* name,
+    rocksdb_compaction_service_wait_cb wait,
+    rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs,
+    rocksdb_compaction_service_on_installation_cb on_installation) {
+  return new rocksdb_compactionservice_t(state, destructor, schedule, name,
+                                         wait, cancel_awaiting_jobs,
+                                         on_installation);
+}
+
+void rocksdb_options_set_compaction_service(
+    rocksdb_options_t* opt, rocksdb_compactionservice_t* service) {
+  if (!opt || !service) {
+    return;
+  }
+
+  opt->rep.compaction_service = std::shared_ptr<CompactionService>(service);
+}
+
+// CompactionServiceOptionsOverride functions
+rocksdb_compaction_service_options_override_t*
+rocksdb_compaction_service_options_override_create() {
+  return new rocksdb_compaction_service_options_override_t;
+}
+
+rocksdb_compaction_service_options_override_t*
+rocksdb_compaction_service_options_override_create_from_options(
+    rocksdb_options_t* options) {
+  if (!options) {
+    return nullptr;
+  }
+
+  rocksdb_compaction_service_options_override_t* override_opts =
+      new rocksdb_compaction_service_options_override_t;
+
+  // Copy all relevant options from rocksdb_options_t
+  override_opts->rep.env = options->rep.env;
+  override_opts->rep.file_checksum_gen_factory =
+      options->rep.file_checksum_gen_factory;
+  override_opts->rep.comparator = options->rep.comparator;
+  override_opts->rep.merge_operator = options->rep.merge_operator;
+  override_opts->rep.compaction_filter = options->rep.compaction_filter;
+  override_opts->rep.compaction_filter_factory =
+      options->rep.compaction_filter_factory;
+  override_opts->rep.prefix_extractor = options->rep.prefix_extractor;
+  override_opts->rep.table_factory = options->rep.table_factory;
+  override_opts->rep.sst_partitioner_factory =
+      options->rep.sst_partitioner_factory;
+  override_opts->rep.listeners = options->rep.listeners;
+  override_opts->rep.statistics = options->rep.statistics;
+  override_opts->rep.info_log = options->rep.info_log;
+  override_opts->rep.table_properties_collector_factories =
+      options->rep.table_properties_collector_factories;
+
+  return override_opts;
+}
+
+void rocksdb_compaction_service_options_override_destroy(
+    rocksdb_compaction_service_options_override_t* override_options) {
+  if (override_options) {
+    delete override_options;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_env(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_env_t* env) {
+  if (override_options && env) {
+    override_options->rep.env = env->rep;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_comparator(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_comparator_t* comparator) {
+  if (override_options && comparator) {
+    override_options->rep.comparator = comparator;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_merge_operator(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_mergeoperator_t* merge_operator) {
+  if (override_options && merge_operator) {
+    override_options->rep.merge_operator =
+        std::shared_ptr<MergeOperator>(merge_operator);
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_compaction_filter(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_compactionfilter_t* compaction_filter) {
+  if (override_options && compaction_filter) {
+    override_options->rep.compaction_filter = compaction_filter;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_compaction_filter_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_compactionfilterfactory_t* compaction_filter_factory) {
+  if (override_options && compaction_filter_factory) {
+    override_options->rep.compaction_filter_factory =
+        std::shared_ptr<CompactionFilterFactory>(compaction_filter_factory);
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_prefix_extractor(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_slicetransform_t* prefix_extractor) {
+  if (override_options && prefix_extractor) {
+    override_options->rep.prefix_extractor =
+        std::shared_ptr<const SliceTransform>(prefix_extractor);
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_block_based_table_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_block_based_table_options_t* table_options) {
+  if (override_options && table_options) {
+    override_options->rep.table_factory = std::shared_ptr<TableFactory>(
+        NewBlockBasedTableFactory(table_options->rep));
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_cuckoo_table_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_cuckoo_table_options_t* table_options) {
+  if (override_options && table_options) {
+    override_options->rep.table_factory = std::shared_ptr<TableFactory>(
+        NewCuckooTableFactory(table_options->rep));
+  }
+}
+
+// Note: add_event_listener is defined later after rocksdb_eventlistener_t
+// struct
+
+void rocksdb_compaction_service_options_override_set_statistics(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_options_t* options) {
+  if (override_options && options) {
+    override_options->rep.statistics = options->rep.statistics;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_info_log(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_logger_t* logger) {
+  if (override_options && logger) {
+    override_options->rep.info_log = logger->rep;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_option(
+    rocksdb_compaction_service_options_override_t* override_options,
+    const char* key, const char* value) {
+  if (override_options && key && value) {
+    override_options->rep.options_map[std::string(key)] = std::string(value);
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_file_checksum_gen_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_file_checksum_gen_factory_t* factory) {
+  if (override_options && factory) {
+    override_options->rep.file_checksum_gen_factory = factory->rep;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_sst_partitioner_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_sst_partitioner_factory_t* factory) {
+  if (override_options && factory) {
+    override_options->rep.sst_partitioner_factory = factory->rep;
+  }
+}
+
+void rocksdb_compaction_service_options_override_add_table_properties_collector_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_table_properties_collector_factory_t* factory) {
+  if (override_options && factory) {
+    override_options->rep.table_properties_collector_factories.push_back(
+        factory->rep);
+  }
+}
+
+// Atomic bool management for cancellation
+unsigned char* rocksdb_open_and_compact_canceled_create() {
+  return reinterpret_cast<unsigned char*>(new std::atomic<bool>(false));
+}
+
+void rocksdb_open_and_compact_canceled_destroy(unsigned char* canceled) {
+  if (canceled) {
+    delete reinterpret_cast<std::atomic<bool>*>(canceled);
+  }
+}
+
+void rocksdb_open_and_compact_canceled_set(unsigned char* canceled,
+                                           unsigned char value) {
+  if (canceled) {
+    reinterpret_cast<std::atomic<bool>*>(canceled)->store(value != 0);
+  }
+}
+
+// OpenAndCompactOptions functions
+rocksdb_open_and_compact_options_t* rocksdb_open_and_compact_options_create() {
+  return new rocksdb_open_and_compact_options_t;
+}
+
+void rocksdb_open_and_compact_options_destroy(
+    rocksdb_open_and_compact_options_t* options) {
+  if (options) {
+    delete options;
+  }
+}
+
+void rocksdb_open_and_compact_options_set_canceled(
+    rocksdb_open_and_compact_options_t* options, unsigned char* canceled) {
+  if (options && canceled) {
+    options->rep.canceled = reinterpret_cast<std::atomic<bool>*>(canceled);
+  }
+}
+
+void rocksdb_open_and_compact_options_set_allow_resumption(
+    rocksdb_open_and_compact_options_t* options,
+    unsigned char allow_resumption) {
+  if (options) {
+    options->rep.allow_resumption = allow_resumption != 0;
+  }
+}
+
+// OpenAndCompact functions
+char* rocksdb_open_and_compact(
+    const char* db_path, const char* output_directory, const char* input,
+    size_t input_len, size_t* output_len,
+    const rocksdb_compaction_service_options_override_t* override_options,
+    char** errptr) {
+  if (!db_path || !output_directory || !input || !override_options) {
+    SaveError(errptr, Status::InvalidArgument("Invalid arguments"));
+    return nullptr;
+  }
+
+  std::string input_str(input, input_len);
+  std::string output_str;
+
+  Status s = DB::OpenAndCompact(db_path, output_directory, input_str,
+                                &output_str, override_options->rep);
+
+  if (!s.ok()) {
+    SaveError(errptr, s);
+    return nullptr;
+  }
+
+  // Allocate +1 for null terminator
+  char* result = static_cast<char*>(malloc(output_str.size() + 1));
+  if (!result) {
+    SaveError(errptr, Status::MemoryLimit("Failed to allocate output buffer"));
+    return nullptr;
+  }
+
+  memcpy(result, output_str.data(), output_str.size());
+  result[output_str.size()] = '\0';
+
+  // Only set output_len after successful allocation
+  if (output_len) {
+    *output_len = output_str.size();
+  }
+
+  return result;
+}
+
+char* rocksdb_open_and_compact_with_options(
+    const rocksdb_open_and_compact_options_t* options, const char* db_path,
+    const char* output_directory, const char* input, size_t input_len,
+    size_t* output_len,
+    const rocksdb_compaction_service_options_override_t* override_options,
+    char** errptr) {
+  if (!options || !db_path || !output_directory || !input ||
+      !override_options) {
+    SaveError(errptr, Status::InvalidArgument("Invalid arguments"));
+    return nullptr;
+  }
+
+  std::string input_str(input, input_len);
+  std::string output_str;
+
+  Status s = DB::OpenAndCompact(options->rep, db_path, output_directory,
+                                input_str, &output_str, override_options->rep);
+
+  if (!s.ok()) {
+    SaveError(errptr, s);
+    return nullptr;
+  }
+
+  // Allocate +1 for null terminator
+  char* result = static_cast<char*>(malloc(output_str.size() + 1));
+  if (!result) {
+    SaveError(errptr, Status::MemoryLimit("Failed to allocate output buffer"));
+    return nullptr;
+  }
+
+  memcpy(result, output_str.data(), output_str.size());
+  result[output_str.size()] = '\0';  // Null terminate
+
+  // Only set output_len after successful allocation
+  if (output_len) {
+    *output_len = output_str.size();
+  }
+
   return result;
 }
 
 rocksdb_t* rocksdb_open(const rocksdb_options_t* options, const char* name,
                         char** errptr) {
-  DB* db;
-  if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+  std::unique_ptr<DB> dbptr;
+  if (SaveError(errptr, DB::Open(options->rep, std::string(name), &dbptr))) {
     return nullptr;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -617,13 +1243,14 @@ rocksdb_t* rocksdb_open_for_read_only(const rocksdb_options_t* options,
                                       const char* name,
                                       unsigned char error_if_wal_file_exists,
                                       char** errptr) {
-  DB* db;
-  if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name),
-                                            &db, error_if_wal_file_exists))) {
+  std::unique_ptr<DB> dbptr;
+  if (SaveError(errptr,
+                DB::OpenForReadOnly(options->rep, std::string(name), &dbptr,
+                                    error_if_wal_file_exists))) {
     return nullptr;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -631,14 +1258,14 @@ rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options,
                                      const char* name,
                                      const char* secondary_path,
                                      char** errptr) {
-  DB* db;
+  std::unique_ptr<DB> dbptr;
   if (SaveError(errptr,
                 DB::OpenAsSecondary(options->rep, std::string(name),
-                                    std::string(secondary_path), &db))) {
+                                    std::string(secondary_path), &dbptr))) {
     return nullptr;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -884,6 +1511,10 @@ void rocksdb_backup_engine_options_destroy(
   delete options;
 }
 
+void rocksdb_status_ptr_get_error(rocksdb_status_ptr_t* status, char** errptr) {
+  SaveError(errptr, *(status->rep));
+}
+
 rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db,
                                                        char** errptr) {
   Checkpoint* checkpoint;
@@ -902,6 +1533,22 @@ void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint,
                         std::string(checkpoint_dir), log_size_for_flush));
 }
 
+rocksdb_export_import_files_metadata_t* rocksdb_checkpoint_export_column_family(
+    rocksdb_checkpoint_t* checkpoint,
+    rocksdb_column_family_handle_t* column_family, const char* export_dir,
+    char** errptr) {
+  ExportImportFilesMetaData* metadata = nullptr;
+  if (SaveError(errptr,
+                checkpoint->rep->ExportColumnFamily(
+                    column_family->rep, std::string(export_dir), &metadata))) {
+    return nullptr;
+  }
+  rocksdb_export_import_files_metadata_t* result =
+      new rocksdb_export_import_files_metadata_t;
+  result->rep = metadata;
+  return result;
+}
+
 void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) {
   delete checkpoint->rep;
   delete checkpoint;
@@ -932,11 +1579,11 @@ rocksdb_t* rocksdb_open_and_trim_history(
 
   std::string trim_ts_(trim_ts, trim_tslen);
 
-  DB* db;
+  std::unique_ptr<DB> dbptr;
   std::vector<ColumnFamilyHandle*> handles;
   if (SaveError(errptr, DB::OpenAndTrimHistory(
                             DBOptions(db_options->rep), std::string(name),
-                            column_families, &handles, &db, trim_ts_))) {
+                            column_families, &handles, &dbptr, trim_ts_))) {
     return nullptr;
   }
 
@@ -948,7 +1595,7 @@ rocksdb_t* rocksdb_open_and_trim_history(
     column_family_handles[i] = c_handle;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -964,10 +1611,10 @@ rocksdb_t* rocksdb_open_column_families(
         ColumnFamilyOptions(column_family_options[i]->rep));
   }
 
-  DB* db;
+  std::unique_ptr<DB> dbptr;
   std::vector<ColumnFamilyHandle*> handles;
   if (SaveError(errptr, DB::Open(DBOptions(db_options->rep), std::string(name),
-                                 column_families, &handles, &db))) {
+                                 column_families, &handles, &dbptr))) {
     return nullptr;
   }
 
@@ -979,7 +1626,7 @@ rocksdb_t* rocksdb_open_column_families(
     column_family_handles[i] = c_handle;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -1032,12 +1679,12 @@ rocksdb_t* rocksdb_open_for_read_only_column_families(
         ColumnFamilyOptions(column_family_options[i]->rep));
   }
 
-  DB* db;
+  std::unique_ptr<DB> dbptr;
   std::vector<ColumnFamilyHandle*> handles;
-  if (SaveError(errptr,
-                DB::OpenForReadOnly(DBOptions(db_options->rep),
-                                    std::string(name), column_families,
-                                    &handles, &db, error_if_wal_file_exists))) {
+  if (SaveError(errptr, DB::OpenForReadOnly(DBOptions(db_options->rep),
+                                            std::string(name), column_families,
+                                            &handles, &dbptr,
+                                            error_if_wal_file_exists))) {
     return nullptr;
   }
 
@@ -1049,7 +1696,7 @@ rocksdb_t* rocksdb_open_for_read_only_column_families(
     column_family_handles[i] = c_handle;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -1065,12 +1712,12 @@ rocksdb_t* rocksdb_open_as_secondary_column_families(
         std::string(column_family_names[i]),
         ColumnFamilyOptions(column_family_options[i]->rep));
   }
-  DB* db;
+  std::unique_ptr<DB> dbptr;
   std::vector<ColumnFamilyHandle*> handles;
-  if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep),
-                                            std::string(name),
-                                            std::string(secondary_path),
-                                            column_families, &handles, &db))) {
+  if (SaveError(errptr, DB::OpenAsSecondary(
+                            DBOptions(db_options->rep), std::string(name),
+                            std::string(secondary_path), column_families,
+                            &handles, &dbptr))) {
     return nullptr;
   }
   for (size_t i = 0; i != handles.size(); ++i) {
@@ -1081,7 +1728,7 @@ rocksdb_t* rocksdb_open_as_secondary_column_families(
     column_family_handles[i] = c_handle;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -1145,6 +1792,26 @@ rocksdb_column_family_handle_t** rocksdb_create_column_families(
   return c_handles;
 }
 
+rocksdb_column_family_handle_t* rocksdb_create_column_family_with_import(
+    rocksdb_t* db, rocksdb_options_t* column_family_options,
+    const char* column_family_name,
+    rocksdb_import_column_family_options_t* import_options,
+    rocksdb_export_import_files_metadata_t* export_import_files_metadata,
+    char** errptr) {
+  rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+  handle->rep = nullptr;
+  if (SaveError(errptr,
+                db->rep->CreateColumnFamilyWithImport(
+                    ColumnFamilyOptions(column_family_options->rep),
+                    std::string(column_family_name), import_options->rep,
+                    *(export_import_files_metadata->rep), &(handle->rep)))) {
+    delete handle;
+    return nullptr;
+  }
+  handle->immortal = false;
+  return handle;
+}
+
 void rocksdb_create_column_families_destroy(
     rocksdb_column_family_handle_t** list) {
   free(list);
@@ -1348,11 +2015,14 @@ char* rocksdb_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
                   const char* key, size_t keylen, size_t* vallen,
                   char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  // Use PinnableSlice to avoid unnecessary copy
+  PinnableSlice pinnable_val;
+  Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+                          Slice(key, keylen), &pinnable_val);
   if (s.ok()) {
-    *vallen = tmp.size();
-    result = CopyString(tmp);
+    *vallen = pinnable_val.size();
+    // Only one copy: from PinnableSlice to malloc'd buffer
+    result = CopyString(pinnable_val);
   } else {
     *vallen = 0;
     if (!s.IsNotFound()) {
@@ -1367,12 +2037,14 @@ char* rocksdb_get_cf(rocksdb_t* db, const rocksdb_readoptions_t* options,
                      const char* key, size_t keylen, size_t* vallen,
                      char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s =
-      db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), &tmp);
+  // Use PinnableSlice to avoid unnecessary copy
+  PinnableSlice pinnable_val;
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &pinnable_val);
   if (s.ok()) {
-    *vallen = tmp.size();
-    result = CopyString(tmp);
+    *vallen = pinnable_val.size();
+    // Only one copy: from PinnableSlice to malloc'd buffer
+    result = CopyString(pinnable_val);
   } else {
     *vallen = 0;
     if (!s.IsNotFound()) {
@@ -1445,12 +2117,17 @@ void rocksdb_multi_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
                        size_t num_keys, const char* const* keys_list,
                        const size_t* keys_list_sizes, char** values_list,
                        size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency (avoids vector overhead for fixed-size array)
+  std::unique_ptr<Slice[]> keys(new Slice[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
     keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<std::string> values(num_keys);
-  std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
+  // Use PinnableSlice to avoid unnecessary allocations
+  auto cfh = db->rep->DefaultColumnFamily();
+  std::vector<PinnableSlice> values(num_keys);
+  std::vector<Status> statuses(num_keys);
+  db->rep->MultiGet(options->rep, cfh, num_keys, keys.get(), values.data(),
+                    statuses.data());
   for (size_t i = 0; i < num_keys; i++) {
     if (statuses[i].ok()) {
       values_list[i] = CopyString(values[i]);
@@ -1475,10 +2152,13 @@ void rocksdb_multi_get_with_ts(rocksdb_t* db,
                                char** values_list, size_t* values_list_sizes,
                                char** timestamp_list,
                                size_t* timestamp_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys_arr(new Slice[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
-    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
+  // Note: MultiGet with timestamps only has vector-based API
+  std::vector<Slice> keys(keys_arr.get(), keys_arr.get() + num_keys);
   std::vector<std::string> values(num_keys);
   std::vector<std::string> timestamps(num_keys);
   std::vector<Status> statuses =
@@ -1510,15 +2190,19 @@ void rocksdb_multi_get_cf(
     size_t num_keys, const char* const* keys_list,
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
-  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  // Use unique_ptr for efficiency (avoids vector overhead for fixed-size
+  // arrays)
+  std::unique_ptr<Slice[]> keys(new Slice[num_keys]);
+  std::unique_ptr<ColumnFamilyHandle*[]> cfs(new ColumnFamilyHandle*[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
     keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
     cfs[i] = column_families[i]->rep;
   }
-  std::vector<std::string> values(num_keys);
-  std::vector<Status> statuses =
-      db->rep->MultiGet(options->rep, cfs, keys, &values);
+  // Use PinnableSlice to avoid unnecessary allocations
+  std::vector<PinnableSlice> values(num_keys);
+  std::vector<Status> statuses(num_keys);
+  db->rep->MultiGet(options->rep, num_keys, cfs.get(), keys.get(),
+                    values.data(), statuses.data());
   for (size_t i = 0; i < num_keys; i++) {
     if (statuses[i].ok()) {
       values_list[i] = CopyString(values[i]);
@@ -1543,16 +2227,20 @@ void rocksdb_multi_get_cf_with_ts(
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** timestamps_list,
     size_t* timestamps_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
-  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  // Use unique_ptr for efficiency (avoids vector overhead for fixed-size
+  // arrays)
+  std::unique_ptr<Slice[]> keys(new Slice[num_keys]);
+  std::unique_ptr<ColumnFamilyHandle*[]> cfs(new ColumnFamilyHandle*[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
     keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
     cfs[i] = column_families[i]->rep;
   }
-  std::vector<std::string> values(num_keys);
+  // Use PinnableSlice to avoid unnecessary allocations
+  std::vector<PinnableSlice> values(num_keys);
   std::vector<std::string> timestamps(num_keys);
-  std::vector<Status> statuses =
-      db->rep->MultiGet(options->rep, cfs, keys, &values, &timestamps);
+  std::vector<Status> statuses(num_keys);
+  db->rep->MultiGet(options->rep, num_keys, cfs.get(), keys.get(),
+                    values.data(), timestamps.data(), statuses.data());
   for (size_t i = 0; i < num_keys; i++) {
     if (statuses[i].ok()) {
       values_list[i] = CopyString(values[i]);
@@ -1611,6 +2299,41 @@ void rocksdb_batched_multi_get_cf(rocksdb_t* db,
   delete[] statuses;
 }
 
+// Batched MultiGet that takes pre-built Slice array, avoiding key conversion
+// overhead
+void rocksdb_batched_multi_get_cf_slice(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, size_t num_keys,
+    const rocksdb_slice_t* keys_list, rocksdb_pinnableslice_t** values,
+    char** errs, const bool sorted_input) {
+  PinnableSlice* value_slices = new PinnableSlice[num_keys];
+  Status* statuses = new Status[num_keys];
+
+  // Cast rocksdb_slice_t* to Slice* - they have identical memory layout
+  const Slice* key_slices = reinterpret_cast<const Slice*>(keys_list);
+
+  db->rep->MultiGet(options->rep, column_family->rep, num_keys, key_slices,
+                    value_slices, statuses, sorted_input);
+
+  for (size_t i = 0; i < num_keys; ++i) {
+    if (statuses[i].ok()) {
+      values[i] = new (rocksdb_pinnableslice_t);
+      values[i]->rep = std::move(value_slices[i]);
+      errs[i] = nullptr;
+    } else {
+      values[i] = nullptr;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+
+  delete[] value_slices;
+  delete[] statuses;
+}
+
 unsigned char rocksdb_key_may_exist(rocksdb_t* db,
                                     const rocksdb_readoptions_t* options,
                                     const char* key, size_t key_len,
@@ -2031,6 +2754,32 @@ void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
   SaveError(errptr, iter->rep->status());
 }
 
+// Iterator functions that return rocksdb_slice_t directly for better
+// performance
+rocksdb_slice_t rocksdb_iter_key_slice(const rocksdb_iterator_t* iter) {
+  Slice s = iter->rep->key();
+  rocksdb_slice_t result;
+  result.data = s.data();
+  result.size = s.size();
+  return result;
+}
+
+rocksdb_slice_t rocksdb_iter_value_slice(const rocksdb_iterator_t* iter) {
+  Slice s = iter->rep->value();
+  rocksdb_slice_t result;
+  result.data = s.data();
+  result.size = s.size();
+  return result;
+}
+
+rocksdb_slice_t rocksdb_iter_timestamp_slice(const rocksdb_iterator_t* iter) {
+  Slice s = iter->rep->timestamp();
+  rocksdb_slice_t result;
+  result.data = s.data();
+  result.size = s.size();
+  return result;
+}
+
 void rocksdb_iter_refresh(const rocksdb_iterator_t* iter, char** errptr) {
   SaveError(errptr, iter->rep->Refresh());
 }
@@ -2086,16 +2835,18 @@ void rocksdb_writebatch_putv(rocksdb_writebatch_t* b, int num_keys,
                              const size_t* keys_list_sizes, int num_values,
                              const char* const* values_list,
                              const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Put immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep.Put(SliceParts(key_slices.data(), num_keys),
-             SliceParts(value_slices.data(), num_values));
+  b->rep.Put(SliceParts(key_slices.get(), num_keys),
+             SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b,
@@ -2104,16 +2855,18 @@ void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b,
                                 const size_t* keys_list_sizes, int num_values,
                                 const char* const* values_list,
                                 const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Put immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
-             SliceParts(value_slices.data(), num_values));
+  b->rep.Put(column_family->rep, SliceParts(key_slices.get(), num_keys),
+             SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_merge(rocksdb_writebatch_t* b, const char* key,
@@ -2133,16 +2886,18 @@ void rocksdb_writebatch_mergev(rocksdb_writebatch_t* b, int num_keys,
                                const size_t* keys_list_sizes, int num_values,
                                const char* const* values_list,
                                const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Merge immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep.Merge(SliceParts(key_slices.data(), num_keys),
-               SliceParts(value_slices.data(), num_values));
+  b->rep.Merge(SliceParts(key_slices.get(), num_keys),
+               SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b,
@@ -2151,16 +2906,18 @@ void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b,
                                   const size_t* keys_list_sizes, int num_values,
                                   const char* const* values_list,
                                   const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Merge immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
-               SliceParts(value_slices.data(), num_values));
+  b->rep.Merge(column_family->rep, SliceParts(key_slices.get(), num_keys),
+               SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_delete(rocksdb_writebatch_t* b, const char* key,
@@ -2200,21 +2957,25 @@ void rocksdb_writebatch_singledelete_cf_with_ts(
 void rocksdb_writebatch_deletev(rocksdb_writebatch_t* b, int num_keys,
                                 const char* const* keys_list,
                                 const size_t* keys_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Delete immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  b->rep.Delete(SliceParts(key_slices.data(), num_keys));
+  b->rep.Delete(SliceParts(key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_deletev_cf(
     rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
     int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Delete immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+  b->rep.Delete(column_family->rep, SliceParts(key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b,
@@ -2238,14 +2999,16 @@ void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys,
                                       const size_t* start_keys_list_sizes,
                                       const char* const* end_keys_list,
                                       const size_t* end_keys_list_sizes) {
-  std::vector<Slice> start_key_slices(num_keys);
-  std::vector<Slice> end_key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::DeleteRange immediately copies the data
+  std::unique_ptr<Slice[]> start_key_slices(new Slice[num_keys]);
+  std::unique_ptr<Slice[]> end_key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
     end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
   }
-  b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys),
-                     SliceParts(end_key_slices.data(), num_keys));
+  b->rep.DeleteRange(SliceParts(start_key_slices.get(), num_keys),
+                     SliceParts(end_key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_delete_rangev_cf(
@@ -2253,15 +3016,17 @@ void rocksdb_writebatch_delete_rangev_cf(
     int num_keys, const char* const* start_keys_list,
     const size_t* start_keys_list_sizes, const char* const* end_keys_list,
     const size_t* end_keys_list_sizes) {
-  std::vector<Slice> start_key_slices(num_keys);
-  std::vector<Slice> end_key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::DeleteRange immediately copies the data
+  std::unique_ptr<Slice[]> start_key_slices(new Slice[num_keys]);
+  std::unique_ptr<Slice[]> end_key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
     end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
   }
   b->rep.DeleteRange(column_family->rep,
-                     SliceParts(start_key_slices.data(), num_keys),
-                     SliceParts(end_key_slices.data(), num_keys));
+                     SliceParts(start_key_slices.get(), num_keys),
+                     SliceParts(end_key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_put_log_data(rocksdb_writebatch_t* b, const char* blob,
@@ -2274,12 +3039,19 @@ class H : public WriteBatch::Handler {
   void* state_;
   void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
   void (*deleted_)(void*, const char* k, size_t klen);
+  void (*log_data_)(void*, const char* blob, size_t blob_len);
+
   void Put(const Slice& key, const Slice& value) override {
     (*put_)(state_, key.data(), key.size(), value.data(), value.size());
   }
   void Delete(const Slice& key) override {
     (*deleted_)(state_, key.data(), key.size());
   }
+  void LogData(const Slice& blob) override {
+    if (log_data_) {
+      (*log_data_)(state_, blob.data(), blob.size());
+    }
+  }
 };
 
 class HCF : public WriteBatch::Handler {
@@ -2290,6 +3062,8 @@ class HCF : public WriteBatch::Handler {
   void (*deleted_cf_)(void*, uint32_t cfid, const char* k, size_t klen);
   void (*merge_cf_)(void*, uint32_t cfid, const char* k, size_t klen,
                     const char* v, size_t vlen);
+  void (*log_data_)(void*, const char* blob, size_t blob_len);
+
   Status PutCF(uint32_t column_family_id, const Slice& key,
                const Slice& value) override {
     (*put_cf_)(state_, column_family_id, key.data(), key.size(), value.data(),
@@ -2306,6 +3080,11 @@ class HCF : public WriteBatch::Handler {
                  value.size());
     return Status::OK();
   }
+  void LogData(const Slice& blob) override {
+    if (log_data_) {
+      (*log_data_)(state_, blob.data(), blob.size());
+    }
+  }
 };
 
 void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state,
@@ -2317,6 +3096,20 @@ void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state,
   handler.state_ = state;
   handler.put_ = put;
   handler.deleted_ = deleted;
+  handler.log_data_ = nullptr;
+  b->rep.Iterate(&handler);
+}
+
+void rocksdb_writebatch_iterate_ld(
+    rocksdb_writebatch_t* b, void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen),
+    void (*log_data)(void*, const char* blob, size_t blob_len)) {
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  handler.log_data_ = log_data;
   b->rep.Iterate(&handler);
 }
 
@@ -2332,6 +3125,24 @@ void rocksdb_writebatch_iterate_cf(
   handler.put_cf_ = put_cf;
   handler.deleted_cf_ = deleted_cf;
   handler.merge_cf_ = merge_cf;
+  handler.log_data_ = nullptr;
+  b->rep.Iterate(&handler);
+}
+
+void rocksdb_writebatch_iterate_cf_ld(
+    rocksdb_writebatch_t* b, void* state,
+    void (*put_cf)(void*, uint32_t cfid, const char* k, size_t klen,
+                   const char* v, size_t vlen),
+    void (*deleted_cf)(void*, uint32_t cfid, const char* k, size_t klen),
+    void (*merge_cf)(void*, uint32_t cfid, const char* k, size_t klen,
+                     const char* v, size_t vlen),
+    void (*log_data)(void*, const char* blob, size_t blob_len)) {
+  HCF handler;
+  handler.state_ = state;
+  handler.put_cf_ = put_cf;
+  handler.deleted_cf_ = deleted_cf;
+  handler.merge_cf_ = merge_cf;
+  handler.log_data_ = log_data;
   b->rep.Iterate(&handler);
 }
 
@@ -2422,16 +3233,17 @@ void rocksdb_writebatch_wi_putv(rocksdb_writebatch_wi_t* b, int num_keys,
                                 const size_t* keys_list_sizes, int num_values,
                                 const char* const* values_list,
                                 const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep->Put(SliceParts(key_slices.data(), num_keys),
-              SliceParts(value_slices.data(), num_values));
+  b->rep->Put(SliceParts(key_slices.get(), num_keys),
+              SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_wi_putv_cf(
@@ -2467,16 +3279,17 @@ void rocksdb_writebatch_wi_mergev(rocksdb_writebatch_wi_t* b, int num_keys,
                                   const size_t* keys_list_sizes, int num_values,
                                   const char* const* values_list,
                                   const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep->Merge(SliceParts(key_slices.data(), num_keys),
-                SliceParts(value_slices.data(), num_values));
+  b->rep->Merge(SliceParts(key_slices.get(), num_keys),
+                SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_wi_mergev_cf(
@@ -2484,16 +3297,17 @@ void rocksdb_writebatch_wi_mergev_cf(
     int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
     int num_values, const char* const* values_list,
     const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
-                SliceParts(value_slices.data(), num_values));
+  b->rep->Merge(column_family->rep, SliceParts(key_slices.get(), num_keys),
+                SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t* b, const char* key,
@@ -2531,11 +3345,12 @@ void rocksdb_writebatch_wi_deletev(rocksdb_writebatch_wi_t* b, int num_keys,
 void rocksdb_writebatch_wi_deletev_cf(
     rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
     int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+  b->rep->Delete(column_family->rep, SliceParts(key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b,
@@ -2561,14 +3376,15 @@ void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b,
                                          const size_t* start_keys_list_sizes,
                                          const char* const* end_keys_list,
                                          const size_t* end_keys_list_sizes) {
-  std::vector<Slice> start_key_slices(num_keys);
-  std::vector<Slice> end_key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> start_key_slices(new Slice[num_keys]);
+  std::unique_ptr<Slice[]> end_key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
     end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
   }
-  b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys),
-                      SliceParts(end_key_slices.data(), num_keys));
+  b->rep->DeleteRange(SliceParts(start_key_slices.get(), num_keys),
+                      SliceParts(end_key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_wi_delete_rangev_cf(
@@ -2576,15 +3392,16 @@ void rocksdb_writebatch_wi_delete_rangev_cf(
     int num_keys, const char* const* start_keys_list,
     const size_t* start_keys_list_sizes, const char* const* end_keys_list,
     const size_t* end_keys_list_sizes) {
-  std::vector<Slice> start_key_slices(num_keys);
-  std::vector<Slice> end_key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> start_key_slices(new Slice[num_keys]);
+  std::unique_ptr<Slice[]> end_key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
     end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
   }
   b->rep->DeleteRange(column_family->rep,
-                      SliceParts(start_key_slices.data(), num_keys),
-                      SliceParts(end_key_slices.data(), num_keys));
+                      SliceParts(start_key_slices.get(), num_keys),
+                      SliceParts(end_key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_wi_put_log_data(rocksdb_writebatch_wi_t* b,
@@ -2627,6 +3444,16 @@ rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base(
   return result;
 }
 
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_readopts(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep =
+      wbwi->rep->NewIteratorWithBase(base_iterator->rep, &options->rep);
+  delete base_iterator;
+  return result;
+}
+
 rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
     rocksdb_column_family_handle_t* column_family) {
@@ -2637,6 +3464,17 @@ rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
   return result;
 }
 
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf_readopts(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    rocksdb_column_family_handle_t* column_family,
+    const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = wbwi->rep->NewIteratorWithBase(
+      column_family->rep, base_iterator->rep, &options->rep);
+  delete base_iterator;
+  return result;
+}
+
 char* rocksdb_writebatch_wi_get_from_batch(rocksdb_writebatch_wi_t* wbwi,
                                            const rocksdb_options_t* options,
                                            const char* key, size_t keylen,
@@ -2681,12 +3519,13 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db(
     const rocksdb_readoptions_t* options, const char* key, size_t keylen,
     size_t* vallen, char** errptr) {
   char* result = nullptr;
-  std::string tmp;
+  // Use PinnableSlice to avoid unnecessary allocations
+  PinnableSlice pinnable_val;
   Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep,
-                                          Slice(key, keylen), &tmp);
+                                          Slice(key, keylen), &pinnable_val);
   if (s.ok()) {
-    *vallen = tmp.size();
-    result = CopyString(tmp);
+    *vallen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vallen = 0;
     if (!s.IsNotFound()) {
@@ -2696,18 +3535,37 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db(
   return result;
 }
 
+rocksdb_pinnableslice_t* rocksdb_writebatch_wi_get_pinned_from_batch_and_db(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+    char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep,
+                                          Slice(key, keylen), &v->rep);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
 char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
     const rocksdb_readoptions_t* options,
     rocksdb_column_family_handle_t* column_family, const char* key,
     size_t keylen, size_t* vallen, char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s = wbwi->rep->GetFromBatchAndDB(
-      db->rep, options->rep, column_family->rep, Slice(key, keylen), &tmp);
+  // Use PinnableSlice to avoid unnecessary allocations
+  PinnableSlice pinnable_val;
+  Status s =
+      wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, column_family->rep,
+                                   Slice(key, keylen), &pinnable_val);
   if (s.ok()) {
-    *vallen = tmp.size();
-    result = CopyString(tmp);
+    *vallen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vallen = 0;
     if (!s.IsNotFound()) {
@@ -2717,6 +3575,24 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
   return result;
 }
 
+rocksdb_pinnableslice_t* rocksdb_writebatch_wi_get_pinned_from_batch_and_db_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = wbwi->rep->GetFromBatchAndDB(
+      db->rep, options->rep, column_family->rep, Slice(key, keylen), &v->rep);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
 void rocksdb_write_writebatch_wi(rocksdb_t* db,
                                  const rocksdb_writeoptions_t* options,
                                  rocksdb_writebatch_wi_t* wbwi, char** errptr) {
@@ -2878,6 +3754,12 @@ void rocksdb_block_based_options_set_data_block_index_type(
       static_cast<BlockBasedTableOptions::DataBlockIndexType>(v);
 }
 
+void rocksdb_block_based_options_set_index_block_search_type(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.index_block_search_type =
+      static_cast<BlockBasedTableOptions::BlockSearchType>(v);
+}
+
 void rocksdb_block_based_options_set_data_block_hash_ratio(
     rocksdb_block_based_table_options_t* options, double v) {
   options->rep.data_block_hash_table_util_ratio = v;
@@ -2924,10 +3806,379 @@ void rocksdb_block_based_options_set_partition_pinning_tier(
       static_cast<ROCKSDB_NAMESPACE::PinningTier>(v);
 }
 
-void rocksdb_block_based_options_set_unpartitioned_pinning_tier(
-    rocksdb_block_based_table_options_t* options, int v) {
-  options->rep.metadata_cache_options.unpartitioned_pinning =
-      static_cast<ROCKSDB_NAMESPACE::PinningTier>(v);
+void rocksdb_block_based_options_set_unpartitioned_pinning_tier(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.metadata_cache_options.unpartitioned_pinning =
+      static_cast<ROCKSDB_NAMESPACE::PinningTier>(v);
+}
+
+void rocksdb_block_based_options_set_block_align(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.block_align = v;
+}
+
+/* FlushJobInfo */
+
+const char* rocksdb_flushjobinfo_cf_name(const rocksdb_flushjobinfo_t* info,
+                                         size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+const char* rocksdb_flushjobinfo_file_path(const rocksdb_flushjobinfo_t* info,
+                                           size_t* size) {
+  *size = info->rep.file_path.size();
+  return info->rep.file_path.data();
+}
+
+unsigned char rocksdb_flushjobinfo_triggered_writes_slowdown(
+    const rocksdb_flushjobinfo_t* info) {
+  return info->rep.triggered_writes_slowdown;
+}
+
+unsigned char rocksdb_flushjobinfo_triggered_writes_stop(
+    const rocksdb_flushjobinfo_t* info) {
+  return info->rep.triggered_writes_stop;
+}
+
+uint64_t rocksdb_flushjobinfo_largest_seqno(
+    const rocksdb_flushjobinfo_t* info) {
+  return info->rep.largest_seqno;
+}
+
+uint64_t rocksdb_flushjobinfo_smallest_seqno(
+    const rocksdb_flushjobinfo_t* info) {
+  return info->rep.smallest_seqno;
+}
+
+uint32_t rocksdb_flushjobinfo_flush_reason(const rocksdb_flushjobinfo_t* info) {
+  return static_cast<uint32_t>(info->rep.flush_reason);
+}
+
+void rocksdb_reset_status(rocksdb_status_ptr_t* status_ptr) {
+  auto ptr = status_ptr->rep;
+  *ptr = Status::OK();
+}
+
+/* CompactionJobInfo */
+
+void rocksdb_compactionjobinfo_status(const rocksdb_compactionjobinfo_t* info,
+                                      char** errptr) {
+  SaveError(errptr, info->rep.status);
+}
+
+const char* rocksdb_compactionjobinfo_cf_name(
+    const rocksdb_compactionjobinfo_t* info, size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+size_t rocksdb_compactionjobinfo_input_files_count(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.input_files.size();
+}
+
+const char* rocksdb_compactionjobinfo_input_file_at(
+    const rocksdb_compactionjobinfo_t* info, size_t pos, size_t* size) {
+  assert(info != nullptr);
+  assert(pos < info->rep.input_files.size());
+
+  const std::string& path = info->rep.input_files[pos];
+  *size = path.size();
+  return path.data();
+}
+
+size_t rocksdb_compactionjobinfo_output_files_count(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.output_files.size();
+}
+
+const char* rocksdb_compactionjobinfo_output_file_at(
+    const rocksdb_compactionjobinfo_t* info, size_t pos, size_t* size) {
+  assert(info != nullptr);
+  assert(pos < info->rep.output_files.size());
+
+  const std::string& path = info->rep.output_files[pos];
+  *size = path.size();
+  return path.data();
+}
+
+uint64_t rocksdb_compactionjobinfo_elapsed_micros(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.elapsed_micros;
+}
+
+uint64_t rocksdb_compactionjobinfo_num_corrupt_keys(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.num_corrupt_keys;
+}
+
+int rocksdb_compactionjobinfo_base_input_level(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.base_input_level;
+}
+
+int rocksdb_compactionjobinfo_output_level(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.output_level;
+}
+
+size_t rocksdb_compactionjobinfo_num_input_files(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.num_input_files;
+}
+
+size_t rocksdb_compactionjobinfo_num_input_files_at_output_level(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.num_input_files_at_output_level;
+}
+
+uint64_t rocksdb_compactionjobinfo_input_records(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.num_input_records;
+}
+
+uint64_t rocksdb_compactionjobinfo_output_records(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.num_output_records;
+}
+
+uint64_t rocksdb_compactionjobinfo_total_input_bytes(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.total_input_bytes;
+}
+
+uint64_t rocksdb_compactionjobinfo_total_output_bytes(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.total_output_bytes;
+}
+
+uint32_t rocksdb_compactionjobinfo_compaction_reason(
+    const rocksdb_compactionjobinfo_t* info) {
+  return static_cast<uint32_t>(info->rep.compaction_reason);
+}
+
+/* SubcompactionJobInfo */
+
+void rocksdb_subcompactionjobinfo_status(
+    const rocksdb_subcompactionjobinfo_t* info, char** errptr) {
+  SaveError(errptr, info->rep.status);
+}
+
+const char* rocksdb_subcompactionjobinfo_cf_name(
+    const rocksdb_subcompactionjobinfo_t* info, size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+uint64_t rocksdb_subcompactionjobinfo_thread_id(
+    const rocksdb_subcompactionjobinfo_t* info) {
+  return info->rep.thread_id;
+}
+
+int rocksdb_subcompactionjobinfo_base_input_level(
+    const rocksdb_subcompactionjobinfo_t* info) {
+  return info->rep.base_input_level;
+}
+
+int rocksdb_subcompactionjobinfo_output_level(
+    const rocksdb_subcompactionjobinfo_t* info) {
+  return info->rep.output_level;
+}
+
+uint32_t rocksdb_subcompactionjobinfo_compaction_reason(
+    const rocksdb_subcompactionjobinfo_t* info) {
+  return static_cast<uint32_t>(info->rep.compaction_reason);
+}
+
+/* ExternalFileIngestionInfo */
+
+const char* rocksdb_externalfileingestioninfo_cf_name(
+    const rocksdb_externalfileingestioninfo_t* info, size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+const char* rocksdb_externalfileingestioninfo_internal_file_path(
+    const rocksdb_externalfileingestioninfo_t* info, size_t* size) {
+  *size = info->rep.internal_file_path.size();
+  return info->rep.internal_file_path.data();
+}
+
+/* External write stall info */
+extern ROCKSDB_LIBRARY_API const char* rocksdb_writestallinfo_cf_name(
+    const rocksdb_writestallinfo_t* info, size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+const rocksdb_writestallcondition_t* rocksdb_writestallinfo_cur(
+    const rocksdb_writestallinfo_t* info) {
+  return reinterpret_cast<const rocksdb_writestallcondition_t*>(
+      &info->rep.condition.cur);
+}
+
+const rocksdb_writestallcondition_t* rocksdb_writestallinfo_prev(
+    const rocksdb_writestallinfo_t* info) {
+  return reinterpret_cast<const rocksdb_writestallcondition_t*>(
+      &info->rep.condition.prev);
+}
+
+const char* rocksdb_memtableinfo_cf_name(const rocksdb_memtableinfo_t* info,
+                                         size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+uint64_t rocksdb_memtableinfo_first_seqno(const rocksdb_memtableinfo_t* info) {
+  return info->rep.first_seqno;
+}
+uint64_t rocksdb_memtableinfo_earliest_seqno(
+    const rocksdb_memtableinfo_t* info) {
+  return info->rep.earliest_seqno;
+}
+uint64_t rocksdb_memtableinfo_num_entries(const rocksdb_memtableinfo_t* info) {
+  return info->rep.num_entries;
+}
+uint64_t rocksdb_memtableinfo_num_deletes(const rocksdb_memtableinfo_t* info) {
+  return info->rep.num_deletes;
+}
+
+/* event listener */
+
+struct rocksdb_eventlistener_t : public EventListener {
+  void* state_{};
+  void (*destructor_)(void*){};
+  void (*on_flush_begin)(void*, rocksdb_t*, const rocksdb_flushjobinfo_t*){};
+  void (*on_flush_completed)(void*, rocksdb_t*,
+                             const rocksdb_flushjobinfo_t*){};
+  void (*on_compaction_begin)(void*, rocksdb_t*,
+                              const rocksdb_compactionjobinfo_t*){};
+  void (*on_compaction_completed)(void*, rocksdb_t*,
+                                  const rocksdb_compactionjobinfo_t*){};
+  void (*on_subcompaction_begin)(void*,
+                                 const rocksdb_subcompactionjobinfo_t*){};
+  void (*on_subcompaction_completed)(void*,
+                                     const rocksdb_subcompactionjobinfo_t*){};
+  void (*on_external_file_ingested)(
+      void*, rocksdb_t*, const rocksdb_externalfileingestioninfo_t*){};
+  void (*on_background_error)(void*, uint32_t, rocksdb_status_ptr_t*){};
+  void (*on_stall_conditions_changed)(void*, const rocksdb_writestallinfo_t*){};
+  void (*on_memtable_sealed)(void*, const rocksdb_memtableinfo_t*){};
+
+  rocksdb_eventlistener_t() = default;
+
+  rocksdb_eventlistener_t(const rocksdb_eventlistener_t&) = delete;
+  rocksdb_eventlistener_t& operator=(const rocksdb_eventlistener_t&) = delete;
+  rocksdb_eventlistener_t(rocksdb_eventlistener_t&&) = delete;
+  rocksdb_eventlistener_t& operator=(rocksdb_eventlistener_t&&) = delete;
+
+  void OnFlushBegin(DB* db, const FlushJobInfo& info) override {
+    rocksdb_t c_db = {db};
+    on_flush_begin(state_, &c_db,
+                   reinterpret_cast<const rocksdb_flushjobinfo_t*>(&info));
+  }
+
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+    rocksdb_t c_db = {db};
+    on_flush_completed(state_, &c_db,
+                       reinterpret_cast<const rocksdb_flushjobinfo_t*>(&info));
+  }
+
+  void OnCompactionBegin(DB* db, const CompactionJobInfo& info) override {
+    rocksdb_t c_db = {db};
+    on_compaction_begin(
+        state_, &c_db,
+        reinterpret_cast<const rocksdb_compactionjobinfo_t*>(&info));
+  }
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override {
+    rocksdb_t c_db = {db};
+    on_compaction_completed(
+        state_, &c_db,
+        reinterpret_cast<const rocksdb_compactionjobinfo_t*>(&info));
+  }
+
+  void OnSubcompactionBegin(const SubcompactionJobInfo& info) override {
+    on_subcompaction_begin(
+        state_, reinterpret_cast<const rocksdb_subcompactionjobinfo_t*>(&info));
+  }
+
+  void OnSubcompactionCompleted(const SubcompactionJobInfo& info) override {
+    on_subcompaction_completed(
+        state_, reinterpret_cast<const rocksdb_subcompactionjobinfo_t*>(&info));
+  }
+
+  void OnExternalFileIngested(DB* db,
+                              const ExternalFileIngestionInfo& info) override {
+    rocksdb_t c_db = {db};
+    on_external_file_ingested(
+        state_, &c_db,
+        reinterpret_cast<const rocksdb_externalfileingestioninfo_t*>(&info));
+  }
+
+  void OnBackgroundError(BackgroundErrorReason reason,
+                         Status* status) override {
+    rocksdb_status_ptr_t* s = new rocksdb_status_ptr_t;
+    s->rep = status;
+    on_background_error(state_, static_cast<uint32_t>(reason), s);
+    delete s;
+  }
+
+  void OnStallConditionsChanged(const WriteStallInfo& info) override {
+    on_stall_conditions_changed(
+        state_, reinterpret_cast<const rocksdb_writestallinfo_t*>(&info));
+  }
+
+  void OnMemTableSealed(const MemTableInfo& info) override {
+    on_memtable_sealed(state_,
+                       reinterpret_cast<const rocksdb_memtableinfo_t*>(&info));
+  }
+
+  ~rocksdb_eventlistener_t() override { destructor_(state_); }
+};
+
+rocksdb_eventlistener_t* rocksdb_eventlistener_create(
+    void* state_, void (*destructor_)(void*), on_flush_begin_cb on_flush_begin,
+    on_flush_completed_cb on_flush_completed,
+    on_compaction_begin_cb on_compaction_begin,
+    on_compaction_completed_cb on_compaction_completed,
+    on_subcompaction_begin_cb on_subcompaction_begin,
+    on_subcompaction_completed_cb on_subcompaction_completed,
+    on_external_file_ingested_cb on_external_file_ingested,
+    on_background_error_cb on_background_error,
+    on_stall_conditions_changed_cb on_stall_conditions_changed,
+    on_memtable_sealed_cb on_memtable_sealed) {
+  rocksdb_eventlistener_t* et = new rocksdb_eventlistener_t;
+  et->state_ = state_;
+  et->destructor_ = destructor_;
+  et->on_flush_begin = on_flush_begin;
+  et->on_flush_completed = on_flush_completed;
+  et->on_compaction_begin = on_compaction_begin;
+  et->on_compaction_completed = on_compaction_completed;
+  et->on_subcompaction_begin = on_subcompaction_begin;
+  et->on_subcompaction_completed = on_subcompaction_completed;
+  et->on_external_file_ingested = on_external_file_ingested;
+  et->on_background_error = on_background_error;
+  et->on_stall_conditions_changed = on_stall_conditions_changed;
+  et->on_memtable_sealed = on_memtable_sealed;
+  return et;
+}
+
+void rocksdb_eventlistener_destroy(rocksdb_eventlistener_t* t) { delete t; }
+
+void rocksdb_options_add_eventlistener(rocksdb_options_t* opt,
+                                       rocksdb_eventlistener_t* t) {
+  opt->rep.listeners.emplace_back(std::shared_ptr<EventListener>(t));
+}
+
+void rocksdb_compaction_service_options_override_add_event_listener(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_eventlistener_t* event_listener) {
+  if (override_options && event_listener) {
+    override_options->rep.listeners.emplace_back(
+        std::shared_ptr<EventListener>(event_listener));
+  }
 }
 
 rocksdb_cuckoo_table_options_t* rocksdb_cuckoo_options_create() {
@@ -3141,6 +4392,65 @@ rocksdb_logger_t* rocksdb_logger_create_callback_logger(
 
 void rocksdb_logger_destroy(rocksdb_logger_t* logger) { delete logger; }
 
+/* File Checksum Gen Factory */
+
+rocksdb_file_checksum_gen_factory_t*
+rocksdb_file_checksum_gen_crc32c_factory_create() {
+  rocksdb_file_checksum_gen_factory_t* factory =
+      new rocksdb_file_checksum_gen_factory_t;
+  factory->rep = GetFileChecksumGenCrc32cFactory();
+  return factory;
+}
+
+void rocksdb_file_checksum_gen_factory_destroy(
+    rocksdb_file_checksum_gen_factory_t* factory) {
+  delete factory;
+}
+
+void rocksdb_options_set_file_checksum_gen_factory(
+    rocksdb_options_t* opt, rocksdb_file_checksum_gen_factory_t* factory) {
+  if (opt && factory) {
+    opt->rep.file_checksum_gen_factory = factory->rep;
+  }
+}
+
+/* SST Partitioner Factory */
+
+rocksdb_sst_partitioner_factory_t*
+rocksdb_sst_partitioner_fixed_prefix_factory_create(size_t prefix_len) {
+  rocksdb_sst_partitioner_factory_t* factory =
+      new rocksdb_sst_partitioner_factory_t;
+  factory->rep = NewSstPartitionerFixedPrefixFactory(prefix_len);
+  return factory;
+}
+
+void rocksdb_sst_partitioner_factory_destroy(
+    rocksdb_sst_partitioner_factory_t* factory) {
+  delete factory;
+}
+
+void rocksdb_options_set_sst_partitioner_factory(
+    rocksdb_options_t* opt, rocksdb_sst_partitioner_factory_t* factory) {
+  if (opt && factory) {
+    opt->rep.sst_partitioner_factory = factory->rep;
+  }
+}
+
+/* Table Properties Collector Factory */
+
+void rocksdb_table_properties_collector_factory_destroy(
+    rocksdb_table_properties_collector_factory_t* factory) {
+  delete factory;
+}
+
+void rocksdb_options_add_table_properties_collector_factory(
+    rocksdb_options_t* opt,
+    rocksdb_table_properties_collector_factory_t* factory) {
+  if (opt && factory) {
+    opt->rep.table_properties_collector_factories.push_back(factory->rep);
+  }
+}
+
 void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
   opt->rep.env = (env ? env->rep : nullptr);
 }
@@ -3183,6 +4493,11 @@ void rocksdb_options_set_write_buffer_manager(
   opt->rep.write_buffer_manager = wbm->rep;
 }
 
+void rocksdb_options_set_sst_file_manager(rocksdb_options_t* opt,
+                                          rocksdb_sst_file_manager_t* sfm) {
+  opt->rep.sst_file_manager = sfm->rep;
+}
+
 size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) {
   return opt->rep.write_buffer_size;
 }
@@ -3295,6 +4610,26 @@ uint64_t rocksdb_options_get_periodic_compaction_seconds(
   return opt->rep.periodic_compaction_seconds;
 }
 
+void rocksdb_options_set_memtable_op_scan_flush_trigger(rocksdb_options_t* opt,
+                                                        uint32_t n) {
+  opt->rep.memtable_op_scan_flush_trigger = n;
+}
+
+uint32_t rocksdb_options_get_memtable_op_scan_flush_trigger(
+    rocksdb_options_t* opt) {
+  return opt->rep.memtable_op_scan_flush_trigger;
+}
+
+void rocksdb_options_set_memtable_avg_op_scan_flush_trigger(
+    rocksdb_options_t* opt, uint32_t n) {
+  opt->rep.memtable_avg_op_scan_flush_trigger = n;
+}
+
+uint32_t rocksdb_options_get_memtable_avg_op_scan_flush_trigger(
+    rocksdb_options_t* opt) {
+  return opt->rep.memtable_avg_op_scan_flush_trigger;
+}
+
 void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
   opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
 }
@@ -3332,16 +4667,6 @@ unsigned char rocksdb_options_get_skip_stats_update_on_db_open(
   return opt->rep.skip_stats_update_on_db_open;
 }
 
-void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
-    rocksdb_options_t* opt, unsigned char val) {
-  opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
-}
-
-unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
-    rocksdb_options_t* opt) {
-  return opt->rep.skip_checking_sst_file_sizes_on_db_open;
-}
-
 /* Blob Options Settings */
 void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt,
                                            unsigned char val) {
@@ -3804,16 +5129,6 @@ int rocksdb_options_get_min_write_buffer_number_to_merge(
   return opt->rep.min_write_buffer_number_to_merge;
 }
 
-void rocksdb_options_set_max_write_buffer_number_to_maintain(
-    rocksdb_options_t* opt, int n) {
-  opt->rep.max_write_buffer_number_to_maintain = n;
-}
-
-int rocksdb_options_get_max_write_buffer_number_to_maintain(
-    rocksdb_options_t* opt) {
-  return opt->rep.max_write_buffer_number_to_maintain;
-}
-
 void rocksdb_options_set_max_write_buffer_size_to_maintain(
     rocksdb_options_t* opt, int64_t n) {
   opt->rep.max_write_buffer_size_to_maintain = n;
@@ -4280,6 +5595,15 @@ void rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio(
   opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
 }
 
+void rocksdb_options_add_compact_on_deletion_collector_factory_min_file_size(
+    rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger,
+    double deletion_ratio, uint64_t min_file_size) {
+  std::shared_ptr<ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory>
+      compact_on_del = NewCompactOnDeletionCollectorFactory(
+          window_size, num_dels_trigger, deletion_ratio, min_file_size);
+  opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
+}
+
 void rocksdb_set_perf_level(int v) {
   PerfLevel level = static_cast<PerfLevel>(v);
   SetPerfLevel(level);
@@ -4332,6 +5656,8 @@ uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
       return rep->internal_recent_skipped_count;
     case rocksdb_internal_merge_count:
       return rep->internal_merge_count;
+    case rocksdb_internal_merge_point_lookup_count:
+      return rep->internal_merge_point_lookup_count;
     case rocksdb_get_snapshot_time:
       return rep->get_snapshot_time;
     case rocksdb_get_from_memtable_time:
@@ -4756,11 +6082,6 @@ unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) {
   return opt->rep.tailing;
 }
 
-void rocksdb_readoptions_set_managed(rocksdb_readoptions_t* opt,
-                                     unsigned char v) {
-  opt->rep.managed = v;
-}
-
 void rocksdb_readoptions_set_readahead_size(rocksdb_readoptions_t* opt,
                                             size_t v) {
   opt->rep.readahead_size = v;
@@ -5239,6 +6560,67 @@ ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall(
   wbm->rep->SetAllowStall(new_allow_stall);
 }
 
+rocksdb_sst_file_manager_t* rocksdb_sst_file_manager_create(
+    rocksdb_env_t* env) {
+  rocksdb_sst_file_manager_t* sfm = new rocksdb_sst_file_manager_t;
+  sfm->rep.reset(ROCKSDB_NAMESPACE::NewSstFileManager(env->rep));
+  return sfm;
+}
+
+void rocksdb_sst_file_manager_destroy(rocksdb_sst_file_manager_t* sfm) {
+  delete sfm;
+}
+
+void rocksdb_sst_file_manager_set_max_allowed_space_usage(
+    rocksdb_sst_file_manager_t* sfm, uint64_t max_allowed_space) {
+  sfm->rep->SetMaxAllowedSpaceUsage(max_allowed_space);
+}
+
+void rocksdb_sst_file_manager_set_compaction_buffer_size(
+    rocksdb_sst_file_manager_t* sfm, uint64_t compaction_buffer_size) {
+  sfm->rep->SetCompactionBufferSize(compaction_buffer_size);
+}
+
+bool rocksdb_sst_file_manager_is_max_allowed_space_reached(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->IsMaxAllowedSpaceReached();
+}
+
+bool rocksdb_sst_file_manager_is_max_allowed_space_reached_including_compactions(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->IsMaxAllowedSpaceReachedIncludingCompactions();
+}
+
+uint64_t rocksdb_sst_file_manager_get_total_size(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->GetTotalSize();
+}
+
+int64_t rocksdb_sst_file_manager_get_delete_rate_bytes_per_second(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->GetDeleteRateBytesPerSecond();
+}
+
+void rocksdb_sst_file_manager_set_delete_rate_bytes_per_second(
+    rocksdb_sst_file_manager_t* sfm, int64_t delete_rate) {
+  return sfm->rep->SetDeleteRateBytesPerSecond(delete_rate);
+}
+
+double rocksdb_sst_file_manager_get_max_trash_db_ratio(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->GetMaxTrashDBRatio();
+}
+
+void rocksdb_sst_file_manager_set_max_trash_db_ratio(
+    rocksdb_sst_file_manager_t* sfm, double ratio) {
+  return sfm->rep->SetMaxTrashDBRatio(ratio);
+}
+
+uint64_t rocksdb_sst_file_manager_get_total_trash_size(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->GetTotalTrashSize();
+}
+
 rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path,
                                         uint64_t target_size) {
   rocksdb_dbpath_t* result = new rocksdb_dbpath_t;
@@ -5500,14 +6882,12 @@ rocksdb_slicetransform_t* rocksdb_slicetransform_create(
     char* (*transform)(void*, const char* key, size_t length,
                        size_t* dst_length),
     unsigned char (*in_domain)(void*, const char* key, size_t length),
-    unsigned char (*in_range)(void*, const char* key, size_t length),
     const char* (*name)(void*)) {
   rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t;
   result->state_ = state;
   result->destructor_ = destructor;
   result->transform_ = transform;
   result->in_domain_ = in_domain;
-  result->in_range_ = in_range;
   result->name_ = name;
   return result;
 }
@@ -5523,7 +6903,6 @@ struct SliceTransformWrapper : public rocksdb_slicetransform_t {
     return rep_->Transform(src);
   }
   bool InDomain(const Slice& src) const override { return rep_->InDomain(src); }
-  bool InRange(const Slice& src) const override { return rep_->InRange(src); }
   static void DoNothing(void*) {}
 };
 
@@ -5647,6 +7026,27 @@ uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size(
   return fifo_opts->rep.max_table_files_size;
 }
 
+void rocksdb_fifo_compaction_options_set_max_data_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
+  fifo_opts->rep.max_data_files_size = size;
+}
+
+uint64_t rocksdb_fifo_compaction_options_get_max_data_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts) {
+  return fifo_opts->rep.max_data_files_size;
+}
+
+void rocksdb_fifo_compaction_options_set_use_kv_ratio_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts,
+    unsigned char use_kv_ratio_compaction) {
+  fifo_opts->rep.use_kv_ratio_compaction = use_kv_ratio_compaction;
+}
+
+unsigned char rocksdb_fifo_compaction_options_get_use_kv_ratio_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts) {
+  return fifo_opts->rep.use_kv_ratio_compaction;
+}
+
 void rocksdb_fifo_compaction_options_destroy(
     rocksdb_fifo_compaction_options_t* fifo_opts) {
   delete fifo_opts;
@@ -5666,6 +7066,10 @@ void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt,
   }
 }
 
+rocksdb_livefiles_t* rocksdb_livefiles_create() {
+  return new rocksdb_livefiles_t;
+}
+
 int rocksdb_livefiles_count(const rocksdb_livefiles_t* lf) {
   return static_cast<int>(lf->rep.size());
 }
@@ -5679,6 +7083,16 @@ const char* rocksdb_livefiles_name(const rocksdb_livefiles_t* lf, int index) {
   return lf->rep[index].name.c_str();
 }
 
+const char* rocksdb_livefiles_directory(const rocksdb_livefiles_t* lf,
+                                        int index) {
+  if (lf->rep[index].directory.empty()) {
+    // db_path is deprecated but still returned by some code paths
+    return lf->rep[index].db_path.c_str();
+  } else {
+    return lf->rep[index].directory.c_str();
+  }
+}
+
 int rocksdb_livefiles_level(const rocksdb_livefiles_t* lf, int index) {
   return lf->rep[index].level;
 }
@@ -5699,6 +7113,16 @@ const char* rocksdb_livefiles_largestkey(const rocksdb_livefiles_t* lf,
   return lf->rep[index].largestkey.data();
 }
 
+uint64_t rocksdb_livefiles_smallest_seqno(const rocksdb_livefiles_t* lf,
+                                          int index) {
+  return lf->rep[index].smallest_seqno;
+}
+
+uint64_t rocksdb_livefiles_largest_seqno(const rocksdb_livefiles_t* lf,
+                                         int index) {
+  return lf->rep[index].largest_seqno;
+}
+
 uint64_t rocksdb_livefiles_entries(const rocksdb_livefiles_t* lf, int index) {
   return lf->rep[index].num_entries;
 }
@@ -5709,6 +7133,71 @@ uint64_t rocksdb_livefiles_deletions(const rocksdb_livefiles_t* lf, int index) {
 
 void rocksdb_livefiles_destroy(const rocksdb_livefiles_t* lf) { delete lf; }
 
+rocksdb_livefile_t* rocksdb_livefile_create() { return new rocksdb_livefile_t; }
+
+void rocksdb_livefile_set_column_family_name(rocksdb_livefile_t* lf,
+                                             const char* column_family_name) {
+  lf->rep.column_family_name = std::string(column_family_name);
+}
+
+void rocksdb_livefile_set_level(rocksdb_livefile_t* lf, int level) {
+  lf->rep.level = level;
+}
+
+void rocksdb_livefile_set_name(rocksdb_livefile_t* lf, const char* name) {
+  lf->rep.name = std::string(name);
+}
+
+void rocksdb_livefile_set_directory(rocksdb_livefile_t* lf,
+                                    const char* directory) {
+  lf->rep.directory = std::string(directory);
+  lf->rep.db_path = std::string(directory);  // deprecated but still needed
+}
+
+void rocksdb_livefile_set_size(rocksdb_livefile_t* lf, size_t size) {
+  lf->rep.size = size;
+}
+
+void rocksdb_livefile_set_smallest_key(rocksdb_livefile_t* lf,
+                                       const char* smallest_key,
+                                       size_t smallest_key_len) {
+  lf->rep.smallestkey = std::string(smallest_key, smallest_key_len);
+}
+
+void rocksdb_livefile_set_largest_key(rocksdb_livefile_t* lf,
+                                      const char* largest_key,
+                                      size_t largest_key_len) {
+  lf->rep.largestkey = std::string(largest_key, largest_key_len);
+}
+
+void rocksdb_livefile_set_smallest_seqno(rocksdb_livefile_t* lf,
+                                         uint64_t smallest_seqno) {
+  lf->rep.smallest_seqno = smallest_seqno;
+}
+
+void rocksdb_livefile_set_largest_seqno(rocksdb_livefile_t* lf,
+                                        uint64_t largest_seqno) {
+  lf->rep.largest_seqno = largest_seqno;
+}
+
+void rocksdb_livefile_set_num_entries(rocksdb_livefile_t* lf,
+                                      uint64_t num_entries) {
+  lf->rep.num_entries = num_entries;
+}
+
+void rocksdb_livefile_set_num_deletions(rocksdb_livefile_t* lf,
+                                        uint64_t num_deletions) {
+  lf->rep.num_deletions = num_deletions;
+}
+
+void rocksdb_livefile_destroy(rocksdb_livefile_t* lf) { delete lf; }
+
+void rocksdb_livefiles_add(rocksdb_livefiles_t* lf,
+                           rocksdb_livefile_t* livefile) {
+  lf->rep.push_back(std::move(livefile->rep));
+  delete livefile;
+}
+
 void rocksdb_get_options_from_string(const rocksdb_options_t* base_options,
                                      const char* opts_str,
                                      rocksdb_options_t* new_options,
@@ -5859,6 +7348,58 @@ char* rocksdb_sst_file_metadata_get_largestkey(
   return CopyString(file_meta->rep->largestkey);
 }
 
+rocksdb_import_column_family_options_t*
+rocksdb_import_column_family_options_create() {
+  return new rocksdb_import_column_family_options_t;
+}
+
+void rocksdb_import_column_family_options_set_move_files(
+    rocksdb_import_column_family_options_t* opt, unsigned char v) {
+  opt->rep.move_files = v;
+}
+
+void rocksdb_import_column_family_options_destroy(
+    rocksdb_import_column_family_options_t* metadata) {
+  delete metadata;
+}
+
+rocksdb_export_import_files_metadata_t*
+rocksdb_export_import_files_metadata_create() {
+  auto metadata = new rocksdb_export_import_files_metadata_t;
+  metadata->rep = new ExportImportFilesMetaData;
+  return metadata;
+}
+
+char* rocksdb_export_import_files_metadata_get_db_comparator_name(
+    rocksdb_export_import_files_metadata_t* metadata) {
+  return strdup(metadata->rep->db_comparator_name.c_str());
+}
+
+void rocksdb_export_import_files_metadata_set_db_comparator_name(
+    rocksdb_export_import_files_metadata_t* metadata, const char* name) {
+  metadata->rep->db_comparator_name = std::string(name);
+}
+
+rocksdb_livefiles_t* rocksdb_export_import_files_metadata_get_files(
+    rocksdb_export_import_files_metadata_t* export_import_metadata) {
+  auto files = new rocksdb_livefiles_t;
+  files->rep = std::vector(export_import_metadata->rep->files);
+  return files;
+}
+
+void rocksdb_export_import_files_metadata_set_files(
+    rocksdb_export_import_files_metadata_t* metadata,
+    rocksdb_livefiles_t* files) {
+  metadata->rep->files = std::move(files->rep);
+  delete files;
+}
+
+void rocksdb_export_import_files_metadata_destroy(
+    rocksdb_export_import_files_metadata_t* metadata) {
+  delete metadata->rep;
+  delete metadata;
+}
+
 /* Transactions */
 
 rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() {
@@ -5890,6 +7431,11 @@ void rocksdb_transactiondb_options_set_default_lock_timeout(
   opt->rep.default_lock_timeout = default_lock_timeout;
 }
 
+void rocksdb_transactiondb_options_set_use_per_key_point_lock_mgr(
+    rocksdb_transactiondb_options_t* opt, int use_per_key_point_lock_mgr) {
+  opt->rep.use_per_key_point_lock_mgr = use_per_key_point_lock_mgr;
+}
+
 rocksdb_transaction_options_t* rocksdb_transaction_options_create() {
   return new rocksdb_transaction_options_t;
 }
@@ -6186,11 +7732,11 @@ char* rocksdb_transaction_get(rocksdb_transaction_t* txn,
                               const char* key, size_t klen, size_t* vlen,
                               char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp);
+  PinnableSlice pinnable_val;
+  Status s = txn->rep->Get(options->rep, Slice(key, klen), &pinnable_val);
   if (s.ok()) {
-    *vlen = tmp.size();
-    result = CopyString(tmp);
+    *vlen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vlen = 0;
     if (!s.IsNotFound()) {
@@ -6221,12 +7767,12 @@ char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn,
                                  const char* key, size_t klen, size_t* vlen,
                                  char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s =
-      txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp);
+  PinnableSlice pinnable_val;
+  Status s = txn->rep->Get(options->rep, column_family->rep, Slice(key, klen),
+                           &pinnable_val);
   if (s.ok()) {
-    *vlen = tmp.size();
-    result = CopyString(tmp);
+    *vlen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vlen = 0;
     if (!s.IsNotFound()) {
@@ -6260,12 +7806,12 @@ char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn,
                                          size_t* vlen, unsigned char exclusive,
                                          char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s =
-      txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive);
+  PinnableSlice pinnable_val;
+  Status s = txn->rep->GetForUpdate(options->rep, Slice(key, klen),
+                                    &pinnable_val, exclusive);
   if (s.ok()) {
-    *vlen = tmp.size();
-    result = CopyString(tmp);
+    *vlen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vlen = 0;
     if (!s.IsNotFound()) {
@@ -6297,12 +7843,12 @@ char* rocksdb_transaction_get_for_update_cf(
     rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
     size_t* vlen, unsigned char exclusive, char** errptr) {
   char* result = nullptr;
-  std::string tmp;
+  PinnableSlice pinnable_val;
   Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
-                                    Slice(key, klen), &tmp, exclusive);
+                                    Slice(key, klen), &pinnable_val, exclusive);
   if (s.ok()) {
-    *vlen = tmp.size();
-    result = CopyString(tmp);
+    *vlen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vlen = 0;
     if (!s.IsNotFound()) {
@@ -6336,10 +7882,13 @@ void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn,
                                    const size_t* keys_list_sizes,
                                    char** values_list,
                                    size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys_arr(new Slice[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
-    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
+  // Note: Transaction only has vector-based MultiGet API
+  std::vector<Slice> keys(keys_arr.get(), keys_arr.get() + num_keys);
   std::vector<std::string> values(num_keys);
   std::vector<Status> statuses =
       txn->rep->MultiGet(options->rep, keys, &values);
@@ -6365,10 +7914,14 @@ void rocksdb_transaction_multi_get_for_update(
     size_t num_keys, const char* const* keys_list,
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys_arr(new Slice[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
-    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
+  // Note: GetForUpdate only has vector-based API, no array-based PinnableSlice
+  // variant
+  std::vector<Slice> keys(keys_arr.get(), keys_arr.get() + num_keys);
   std::vector<std::string> values(num_keys);
   std::vector<Status> statuses =
       txn->rep->MultiGetForUpdate(options->rep, keys, &values);
@@ -6395,12 +7948,15 @@ void rocksdb_transaction_multi_get_cf(
     size_t num_keys, const char* const* keys_list,
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys_arr(new Slice[num_keys]);
   std::vector<ColumnFamilyHandle*> cfs(num_keys);
   for (size_t i = 0; i < num_keys; i++) {
-    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]);
     cfs[i] = column_families[i]->rep;
   }
+  // Note: Transaction only has vector-based MultiGet API
+  std::vector<Slice> keys(keys_arr.get(), keys_arr.get() + num_keys);
   std::vector<std::string> values(num_keys);
   std::vector<Status> statuses =
       txn->rep->MultiGet(options->rep, cfs, keys, &values);
@@ -6427,12 +7983,16 @@ void rocksdb_transaction_multi_get_for_update_cf(
     size_t num_keys, const char* const* keys_list,
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys_arr(new Slice[num_keys]);
   std::vector<ColumnFamilyHandle*> cfs(num_keys);
   for (size_t i = 0; i < num_keys; i++) {
-    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]);
     cfs[i] = column_families[i]->rep;
   }
+  // Note: GetForUpdate only has vector-based API, no array-based PinnableSlice
+  // variant
+  std::vector<Slice> keys(keys_arr.get(), keys_arr.get() + num_keys);
   std::vector<std::string> values(num_keys);
   std::vector<Status> statuses =
       txn->rep->MultiGetForUpdate(options->rep, cfs, keys, &values);
@@ -6459,11 +8019,12 @@ char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db,
                                 const char* key, size_t klen, size_t* vlen,
                                 char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp);
+  PinnableSlice pinnable_val;
+  Status s = txn_db->rep->Get(options->rep, txn_db->rep->DefaultColumnFamily(),
+                              Slice(key, klen), &pinnable_val);
   if (s.ok()) {
-    *vlen = tmp.size();
-    result = CopyString(tmp);
+    *vlen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vlen = 0;
     if (!s.IsNotFound()) {
@@ -6494,12 +8055,12 @@ char* rocksdb_transactiondb_get_cf(
     rocksdb_column_family_handle_t* column_family, const char* key,
     size_t keylen, size_t* vallen, char** errptr) {
   char* result = nullptr;
-  std::string tmp;
+  PinnableSlice pinnable_val;
   Status s = txn_db->rep->Get(options->rep, column_family->rep,
-                              Slice(key, keylen), &tmp);
+                              Slice(key, keylen), &pinnable_val);
   if (s.ok()) {
-    *vallen = tmp.size();
-    result = CopyString(tmp);
+    *vallen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vallen = 0;
     if (!s.IsNotFound()) {
@@ -6533,13 +8094,17 @@ void rocksdb_transactiondb_multi_get(rocksdb_transactiondb_t* txn_db,
                                      const size_t* keys_list_sizes,
                                      char** values_list,
                                      size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys(new Slice[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
     keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<std::string> values(num_keys);
-  std::vector<Status> statuses =
-      txn_db->rep->MultiGet(options->rep, keys, &values);
+  // Use PinnableSlice to avoid unnecessary allocations
+  auto cfh = txn_db->rep->DefaultColumnFamily();
+  std::vector<PinnableSlice> values(num_keys);
+  std::vector<Status> statuses(num_keys);
+  txn_db->rep->MultiGet(options->rep, cfh, num_keys, keys.get(), values.data(),
+                        statuses.data());
   for (size_t i = 0; i < num_keys; i++) {
     if (statuses[i].ok()) {
       values_list[i] = CopyString(values[i]);
@@ -6563,15 +8128,18 @@ void rocksdb_transactiondb_multi_get_cf(
     size_t num_keys, const char* const* keys_list,
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
-  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys(new Slice[num_keys]);
+  std::unique_ptr<ColumnFamilyHandle*[]> cfs(new ColumnFamilyHandle*[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
     keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
     cfs[i] = column_families[i]->rep;
   }
-  std::vector<std::string> values(num_keys);
-  std::vector<Status> statuses =
-      txn_db->rep->MultiGet(options->rep, cfs, keys, &values);
+  // Use PinnableSlice to avoid unnecessary allocations
+  std::vector<PinnableSlice> values(num_keys);
+  std::vector<Status> statuses(num_keys);
+  txn_db->rep->MultiGet(options->rep, num_keys, cfs.get(), keys.get(),
+                        values.data(), statuses.data());
   for (size_t i = 0; i < num_keys; i++) {
     if (statuses[i].ok()) {
       values_list[i] = CopyString(values[i]);
@@ -6975,7 +8543,7 @@ rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create(
     dbs.push_back(db->rep);
   }
 
-  unordered_set<const Cache*> cache_set;
+  std::unordered_set<const Cache*> cache_set;
   for (auto cache : consumers->caches) {
     cache_set.insert(const_cast<const Cache*>(cache->rep.get()));
   }
@@ -7054,6 +8622,14 @@ void rocksdb_enable_manual_compaction(rocksdb_t* db) {
   db->rep->EnableManualCompaction();
 }
 
+void rocksdb_abort_all_compactions(rocksdb_t* db) {
+  db->rep->AbortAllCompactions();
+}
+
+void rocksdb_resume_all_compactions(rocksdb_t* db) {
+  db->rep->ResumeAllCompactions();
+}
+
 rocksdb_statistics_histogram_data_t*
 rocksdb_statistics_histogram_data_create() {
   return new rocksdb_statistics_histogram_data_t{};
@@ -7164,4 +8740,110 @@ uint64_t rocksdb_wait_for_compact_options_get_timeout(
   return opt->rep.timeout.count();
 }
 
+/* High-performance zero-copy Get implementations */
+
+struct rocksdb_pinnable_handle_t {
+  PinnableSlice rep;
+};
+
+rocksdb_pinnable_handle_t* rocksdb_get_pinned_v2(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnable_handle_t* handle = new rocksdb_pinnable_handle_t;
+  Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+                          Slice(key, keylen), &handle->rep);
+  if (!s.ok()) {
+    delete handle;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return handle;
+}
+
+rocksdb_pinnable_handle_t* rocksdb_get_pinned_cf_v2(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnable_handle_t* handle = new rocksdb_pinnable_handle_t;
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &handle->rep);
+  if (!s.ok()) {
+    delete handle;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return handle;
+}
+
+const char* rocksdb_pinnable_handle_get_value(
+    const rocksdb_pinnable_handle_t* handle, size_t* vallen) {
+  if (!handle) {
+    *vallen = 0;
+    return nullptr;
+  }
+  *vallen = handle->rep.size();
+  return handle->rep.data();
+}
+
+void rocksdb_pinnable_handle_destroy(rocksdb_pinnable_handle_t* handle) {
+  delete handle;
+}
+
+unsigned char rocksdb_get_into_buffer(rocksdb_t* db,
+                                      const rocksdb_readoptions_t* options,
+                                      const char* key, size_t keylen,
+                                      char* buffer, size_t buffer_size,
+                                      size_t* vallen, unsigned char* found,
+                                      char** errptr) {
+  PinnableSlice pinnable_val;
+  Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+                          Slice(key, keylen), &pinnable_val);
+  if (s.ok()) {
+    *found = 1;
+    *vallen = pinnable_val.size();
+    if (buffer_size >= pinnable_val.size()) {
+      memcpy(buffer, pinnable_val.data(), pinnable_val.size());
+      return 1;  // Success - data copied
+    }
+    return 0;  // Buffer too small
+  } else {
+    *found = 0;
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return 0;
+  }
+}
+
+unsigned char rocksdb_get_into_buffer_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char* buffer, size_t buffer_size, size_t* vallen,
+    unsigned char* found, char** errptr) {
+  PinnableSlice pinnable_val;
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &pinnable_val);
+  if (s.ok()) {
+    *found = 1;
+    *vallen = pinnable_val.size();
+    if (buffer_size >= pinnable_val.size()) {
+      memcpy(buffer, pinnable_val.data(), pinnable_val.size());
+      return 1;  // Success - data copied
+    }
+    return 0;  // Buffer too small
+  } else {
+    *found = 0;
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return 0;
+  }
+}
+
 }  // end extern "C"
diff --git a/db/c_test.c b/db/c_test.c
index 18bf2961ded3..8c57d0fcf6ec 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -103,6 +103,12 @@ static void CheckValue(char* err, const char* expected, char** actual,
   Free(actual);
 }
 
+static void CheckPinnedValue(char* err, const char* expected,
+                             const char** actual, size_t actual_length) {
+  CheckNoError(err);
+  CheckEqual(expected, *actual, actual_length);
+}
+
 static void CheckGet(rocksdb_t* db, const rocksdb_readoptions_t* options,
                      const char* key, const char* expected) {
   char* err = NULL;
@@ -716,6 +722,88 @@ static void LoadAndCheckLatestOptions(const char* db_name, rocksdb_env_t* env,
                                       num_column_families);
 }
 
+// Global state for tracking remote compaction calls
+typedef struct {
+  int schedule_called;
+  int wait_called;
+  int cancel_called;
+  char last_scheduled_job_id[256];
+  char last_db_name[256];
+} RemoteCompactionState;
+
+// Schedule callback - gets called when compaction is scheduled
+static rocksdb_compactionservice_scheduleresponse_t* RemoteCompactionSchedule(
+    void* state, const rocksdb_compactionservice_jobinfo_t* info,
+    const char* input, size_t input_len) {
+  (void)input;
+  (void)input_len;
+  RemoteCompactionState* rcs = (RemoteCompactionState*)state;
+  rcs->schedule_called++;
+
+  // Extract job info
+  size_t db_name_len;
+  const char* db_name =
+      rocksdb_compactionservice_jobinfo_t_get_db_name(info, &db_name_len);
+  memcpy(rcs->last_db_name, db_name, db_name_len);
+  rcs->last_db_name[db_name_len] = '\0';
+
+  // Generate a job ID
+  snprintf(rcs->last_scheduled_job_id, sizeof(rcs->last_scheduled_job_id),
+           "job-%d", rcs->schedule_called);
+
+  // Create response with success status
+  char* err = NULL;
+  rocksdb_compactionservice_scheduleresponse_t* response =
+      rocksdb_compactionservice_scheduleresponse_create(
+          rcs->last_scheduled_job_id,
+          rocksdb_compactionservice_jobstatus_success, &err);
+  if (err) {
+    free(err);
+  }
+  return response;
+}
+
+// Wait callback - simulates waiting for remote compaction to complete
+static int RemoteCompactionWait(void* state, const char* scheduled_job_id,
+                                char** result, size_t* result_len) {
+  RemoteCompactionState* rcs = (RemoteCompactionState*)state;
+  rcs->wait_called++;
+
+  if (strcmp(scheduled_job_id, rcs->last_scheduled_job_id) != 0) {
+    return rocksdb_compactionservice_jobstatus_failure;
+  }
+
+  // For testing purposes, return kUseLocal to cause RocksDB to fall back to
+  // local compaction. This tests the callback mechanism without needing a fully
+  // serialized result. In a real scenario, this would communicate with a remote
+  // worker that calls rocksdb_open_and_compact() and returns a properly
+  // serialized CompactionServiceResult
+  *result = NULL;
+  *result_len = 0;
+
+  return rocksdb_compactionservice_jobstatus_use_local;
+}
+
+// Cancel callback - cancels pending jobs
+static void RemoteCompactionCancel(void* state) {
+  RemoteCompactionState* rcs = (RemoteCompactionState*)state;
+  rcs->cancel_called++;
+}
+
+// Destructor callback
+static void RemoteCompactionDestroy(void* state) { (void)state; }
+
+// NULL schedule callback for testing failure handling
+static rocksdb_compactionservice_scheduleresponse_t* NullSchedule(
+    void* state, const rocksdb_compactionservice_jobinfo_t* info,
+    const char* input, size_t input_len) {
+  (void)state;
+  (void)info;
+  (void)input;
+  (void)input_len;
+  return NULL;  // Return NULL to simulate failure
+}
+
 int main(int argc, char** argv) {
   (void)argc;
   (void)argv;
@@ -1030,6 +1118,78 @@ int main(int argc, char** argv) {
     rocksdb_options_set_error_if_exists(options, 1);
   }
 
+  StartPhase("checkpoint_export_column_family");
+  {
+    static char cf_export_path[200];
+    static char db_import_path[200];
+    snprintf(cf_export_path, sizeof(cf_export_path),
+             "%s/rocksdb_c_test-%d-cf_export", GetTempDir(), ((int)geteuid()));
+    snprintf(db_import_path, sizeof(db_import_path),
+             "%s/rocksdb_c_test-%d-db_import", GetTempDir(), ((int)geteuid()));
+
+    rocksdb_options_t* db_options = rocksdb_options_create();
+    rocksdb_column_family_handle_t* cf_export =
+        rocksdb_create_column_family(db, db_options, "cf_export", &err);
+    CheckNoError(err);
+
+    rocksdb_put_cf(db, woptions, cf_export, "k1", 2, "v1", 2, &err);
+    CheckNoError(err);
+    rocksdb_put_cf(db, woptions, cf_export, "k2", 2, "v2", 2, &err);
+    CheckNoError(err);
+
+    rocksdb_checkpoint_t* checkpoint =
+        rocksdb_checkpoint_object_create(db, &err);
+    CheckNoError(err);
+
+    rocksdb_export_import_files_metadata_t* export_metadata =
+        rocksdb_checkpoint_export_column_family(checkpoint, cf_export,
+                                                cf_export_path, &err);
+    CheckNoError(err);
+    const char* comparator_name =
+        rocksdb_export_import_files_metadata_get_db_comparator_name(
+            export_metadata);
+    CheckEqual("leveldb.BytewiseComparator", comparator_name, 26);
+    rocksdb_free((void*)comparator_name);
+    rocksdb_checkpoint_object_destroy(checkpoint);
+    checkpoint = NULL;
+    rocksdb_drop_column_family(db, cf_export, &err);
+    CheckNoError(err);
+    rocksdb_column_family_handle_destroy(cf_export);
+    rocksdb_options_set_create_if_missing(db_options, 1);
+    rocksdb_options_set_error_if_exists(db_options, 1);
+    rocksdb_t* db_import = rocksdb_open(db_options, db_import_path, &err);
+    CheckNoError(err);
+    rocksdb_import_column_family_options_t* import_options =
+        rocksdb_import_column_family_options_create();
+    rocksdb_column_family_handle_t* cf_import =
+        rocksdb_create_column_family_with_import(db_import, db_options,
+                                                 "cf_import", import_options,
+                                                 export_metadata, &err);
+    CheckNoError(err);
+    rocksdb_import_column_family_options_destroy(import_options);
+    rocksdb_export_import_files_metadata_destroy(export_metadata);
+    size_t val_len;
+    char* val =
+        rocksdb_get_cf(db_import, roptions, cf_import, "k1", 2, &val_len, &err);
+    CheckNoError(err);
+    CheckEqual("v1", val, val_len);
+    free(val);
+
+    val =
+        rocksdb_get_cf(db_import, roptions, cf_import, "k2", 2, &val_len, &err);
+    CheckNoError(err);
+    CheckEqual("v2", val, val_len);
+    free(val);
+
+    rocksdb_column_family_handle_destroy(cf_import);
+    cf_import = NULL;
+    rocksdb_close(db_import);
+    rocksdb_destroy_db(db_options, db_import_path, &err);
+    CheckNoError(err);
+    rocksdb_options_destroy(db_options);
+    db_options = NULL;
+  }
+
   StartPhase("compactall");
   rocksdb_compact_range(db, NULL, 0, NULL, 0);
   CheckGet(db, roptions, "foo", "hello");
@@ -1177,6 +1337,70 @@ int main(int argc, char** argv) {
     rocksdb_writebatch_destroy(wb);
   }
 
+  StartPhase("writebatch_vectors_cf");
+  {
+    const char* cf_name = "wb_vectors_cf";
+    rocksdb_column_family_handle_t* wb_cf =
+        rocksdb_create_column_family(db, options, cf_name, &err);
+    CheckNoError(err);
+
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+
+    // Test putv_cf: concatenates multiple slices into a single key/value
+    const char* put_keys[2] = {"k", "ey"};
+    const size_t put_key_sizes[2] = {1, 2};
+    const char* put_vals[3] = {"v", "a", "l"};
+    const size_t put_val_sizes[3] = {1, 1, 1};
+    rocksdb_writebatch_putv_cf(wb, wb_cf, 2, put_keys, put_key_sizes, 3,
+                               put_vals, put_val_sizes);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    // putv_cf concatenates: key="k"+"ey"="key", value="v"+"a"+"l"="val"
+    CheckGetCF(db, roptions, wb_cf, "key", "val");
+    CheckGetCF(db, roptions, wb_cf, "k", NULL);
+    CheckGetCF(db, roptions, wb_cf, "ey", NULL);
+
+    // Test deletev_cf: concatenates multiple slices for key
+    rocksdb_writebatch_clear(wb);
+    const char* del_keys[2] = {"k", "ey"};
+    const size_t del_key_sizes[2] = {1, 2};
+    rocksdb_writebatch_deletev_cf(wb, wb_cf, 2, del_keys, del_key_sizes);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGetCF(db, roptions, wb_cf, "key", NULL);
+
+    // Test delete_rangev_cf: concatenates slices for range deletion
+    rocksdb_writebatch_clear(wb);
+    rocksdb_writebatch_put_cf(wb, wb_cf, "a", 1, "1", 1);
+    rocksdb_writebatch_put_cf(wb, wb_cf, "b", 1, "2", 1);
+    rocksdb_writebatch_put_cf(wb, wb_cf, "c", 1, "3", 1);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGetCF(db, roptions, wb_cf, "a", "1");
+    CheckGetCF(db, roptions, wb_cf, "b", "2");
+    CheckGetCF(db, roptions, wb_cf, "c", "3");
+
+    rocksdb_writebatch_clear(wb);
+    const char* range_start[2] = {"a", ""};  // "a" + "" = "a"
+    const size_t range_start_sizes[2] = {1, 0};
+    const char* range_end[2] = {"c", ""};
+    const size_t range_end_sizes[2] = {1, 0};
+    rocksdb_writebatch_delete_rangev_cf(wb, wb_cf, 2, range_start,
+                                        range_start_sizes, range_end,
+                                        range_end_sizes);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    // Range [a, c) should delete "a" and "b", but not "c"
+    CheckGetCF(db, roptions, wb_cf, "a", NULL);
+    CheckGetCF(db, roptions, wb_cf, "b", NULL);
+    CheckGetCF(db, roptions, wb_cf, "c", "3");
+
+    rocksdb_writebatch_destroy(wb);
+    rocksdb_drop_column_family(db, wb_cf, &err);
+    CheckNoError(err);
+    rocksdb_column_family_handle_destroy(wb_cf);
+  }
+
   StartPhase("writebatch_vectors");
   {
     rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
@@ -1245,6 +1469,8 @@ int main(int argc, char** argv) {
     CheckCondition(count == 3);
     size_t size;
     char* value;
+    const char* pinned_value;
+    rocksdb_pinnableslice_t* p;
     value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size,
                                                  &err);
     CheckValue(err, "c", &value, size);
@@ -1254,9 +1480,19 @@ int main(int argc, char** argv) {
     value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions,
                                                         "foo", 3, &size, &err);
     CheckValue(err, "hello", &value, size);
+    p = rocksdb_writebatch_wi_get_pinned_from_batch_and_db(wbi, db, roptions,
+                                                           "foo", 3, &err);
+    pinned_value = rocksdb_pinnableslice_value(p, &size);
+    CheckPinnedValue(err, "hello", &pinned_value, size);
+    rocksdb_pinnableslice_destroy(p);
     value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions,
                                                         "box", 3, &size, &err);
     CheckValue(err, "c", &value, size);
+    p = rocksdb_writebatch_wi_get_pinned_from_batch_and_db(wbi, db, roptions,
+                                                           "box", 3, &err);
+    pinned_value = rocksdb_pinnableslice_value(p, &size);
+    CheckPinnedValue(err, "c", &pinned_value, size);
+    rocksdb_pinnableslice_destroy(p);
     rocksdb_write_writebatch_wi(db, woptions, wbi, &err);
     CheckNoError(err);
     CheckGet(db, roptions, "foo", "hello");
@@ -1330,6 +1566,43 @@ int main(int argc, char** argv) {
     rocksdb_iter_destroy(iter);
   }
 
+  StartPhase("iter_slice");
+  {
+    // Test the new slice-based iterator API for better performance
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+
+    // Test rocksdb_iter_key_slice
+    rocksdb_slice_t key_slice = rocksdb_iter_key_slice(iter);
+    CheckEqual("box", key_slice.data, key_slice.size);
+
+    // Test rocksdb_iter_value_slice
+    rocksdb_slice_t value_slice = rocksdb_iter_value_slice(iter);
+    CheckEqual("c", value_slice.data, value_slice.size);
+
+    // Move to next entry and test again
+    rocksdb_iter_next(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    key_slice = rocksdb_iter_key_slice(iter);
+    value_slice = rocksdb_iter_value_slice(iter);
+    CheckEqual("foo", key_slice.data, key_slice.size);
+    CheckEqual("hello", value_slice.data, value_slice.size);
+
+    // Test seeking with slice API
+    rocksdb_iter_seek(iter, "b", 1);
+    CheckCondition(rocksdb_iter_valid(iter));
+    key_slice = rocksdb_iter_key_slice(iter);
+    value_slice = rocksdb_iter_value_slice(iter);
+    CheckEqual("box", key_slice.data, key_slice.size);
+    CheckEqual("c", value_slice.data, value_slice.size);
+
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+  }
+
   StartPhase("wbwi_iter");
   {
     rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, roptions);
@@ -1362,6 +1635,46 @@ int main(int argc, char** argv) {
     rocksdb_writebatch_wi_destroy(wbi);
   }
 
+  StartPhase("wbwi_iter_readoptions");
+  {
+    rocksdb_readoptions_t* iter_roptions = rocksdb_readoptions_create();
+    rocksdb_readoptions_set_iterate_lower_bound(iter_roptions, "boy", 3);
+    rocksdb_readoptions_set_iterate_upper_bound(iter_roptions, "fool", 4);
+    rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, iter_roptions);
+    rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+    rocksdb_writebatch_wi_put(wbi, "bar", 3, "b",
+                              1);  // should get filtered out
+    rocksdb_writebatch_wi_put(wbi, "cat", 3, "miau", 4);
+    rocksdb_writebatch_wi_put(wbi, "gnu", 3, "muh",
+                              3);  // should get filtered out
+    rocksdb_iterator_t* iter =
+        rocksdb_writebatch_wi_create_iterator_with_base_readopts(wbi, base_iter,
+                                                                 iter_roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "cat", "miau");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_prev(iter);
+    CheckIter(iter, "cat", "miau");
+    rocksdb_iter_prev(iter);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_last(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_seek(iter, "b", 1);
+    CheckIter(iter, "cat", "miau");
+    rocksdb_iter_seek_for_prev(iter, "d", 1);
+    CheckIter(iter, "cat", "miau");
+    rocksdb_iter_seek_for_prev(iter, "fool", 3);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+    rocksdb_writebatch_wi_destroy(wbi);
+    rocksdb_readoptions_destroy(iter_roptions);
+  }
+
   StartPhase("multiget");
   {
     const char* keys[3] = {"box", "foo", "notfound"};
@@ -1375,6 +1688,53 @@ int main(int argc, char** argv) {
     CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
   }
 
+  StartPhase("zero_copy_get_pinned_v2");
+  {
+    // Test new zero-copy get functions
+
+    // Test rocksdb_get_pinned_v2
+    rocksdb_pinnable_handle_t* handle =
+        rocksdb_get_pinned_v2(db, roptions, "foo", 3, &err);
+    CheckNoError(err);
+    CheckCondition(handle != NULL);
+    size_t val_len;
+    const char* val = rocksdb_pinnable_handle_get_value(handle, &val_len);
+    CheckEqual("hello", val, val_len);
+    rocksdb_pinnable_handle_destroy(handle);
+
+    // Test with non-existent key
+    handle = rocksdb_get_pinned_v2(db, roptions, "notfound", 8, &err);
+    CheckNoError(err);
+    CheckCondition(handle == NULL);
+
+    // Test rocksdb_get_into_buffer
+    char buffer[100];
+    unsigned char found;
+    unsigned char success = rocksdb_get_into_buffer(
+        db, roptions, "foo", 3, buffer, sizeof(buffer), &val_len, &found, &err);
+    CheckNoError(err);
+    CheckCondition(success == 1);
+    CheckCondition(found == 1);
+    CheckCondition(val_len == 5);
+    CheckCondition(memcmp(buffer, "hello", 5) == 0);
+
+    // Test with buffer too small
+    success = rocksdb_get_into_buffer(db, roptions, "foo", 3, buffer,
+                                      2,  // Buffer too small
+                                      &val_len, &found, &err);
+    CheckNoError(err);
+    CheckCondition(success == 0);  // Should fail due to small buffer
+    CheckCondition(found == 1);
+    CheckCondition(val_len == 5);  // Should still report actual size
+
+    // Test with non-existent key
+    success = rocksdb_get_into_buffer(db, roptions, "notfound", 8, buffer,
+                                      sizeof(buffer), &val_len, &found, &err);
+    CheckNoError(err);
+    CheckCondition(success == 0);
+    CheckCondition(found == 0);
+  }
+
   StartPhase("pin_get");
   {
     CheckPinGet(db, roptions, "box", "c");
@@ -1792,6 +2152,84 @@ int main(int argc, char** argv) {
     rocksdb_flush_wal(db, 1, &err);
     CheckNoError(err);
 
+    // Test column family handle get name
+    {
+      size_t name_len;
+      char* cf_name =
+          rocksdb_column_family_handle_get_name(handles[1], &name_len);
+      CheckCondition(name_len == 3);
+      CheckCondition(memcmp(cf_name, "cf1", 3) == 0);
+      rocksdb_free(cf_name);
+    }
+
+    // Test zero-copy get with column families
+    {
+      rocksdb_pinnable_handle_t* handle =
+          rocksdb_get_pinned_cf_v2(db, roptions, handles[1], "box", 3, &err);
+      CheckNoError(err);
+      CheckCondition(handle != NULL);
+      size_t val_len;
+      const char* val = rocksdb_pinnable_handle_get_value(handle, &val_len);
+      CheckEqual("c", val, val_len);
+      rocksdb_pinnable_handle_destroy(handle);
+
+      // Test with non-existent key
+      handle = rocksdb_get_pinned_cf_v2(db, roptions, handles[1], "notfound", 8,
+                                        &err);
+      CheckNoError(err);
+      CheckCondition(handle == NULL);
+
+      // Test rocksdb_get_into_buffer_cf
+      char buffer[100];
+      unsigned char found;
+      unsigned char success = rocksdb_get_into_buffer_cf(
+          db, roptions, handles[1], "buff", 4, buffer, sizeof(buffer), &val_len,
+          &found, &err);
+      CheckNoError(err);
+      CheckCondition(success == 1);
+      CheckCondition(found == 1);
+      CheckCondition(val_len == 7);
+      CheckCondition(memcmp(buffer, "rocksdb", 7) == 0);
+
+      // Test with buffer too small
+      success = rocksdb_get_into_buffer_cf(db, roptions, handles[1], "buff", 4,
+                                           buffer, 3,  // Buffer too small
+                                           &val_len, &found, &err);
+      CheckNoError(err);
+      CheckCondition(success == 0);  // Should fail due to small buffer
+      CheckCondition(found == 1);
+      CheckCondition(val_len == 7);  // Should still report actual size
+    }
+
+    // Test WriteBatchWithIndex iteration with Column Family
+    rocksdb_writebatch_wi_t* wbwi = rocksdb_writebatch_wi_create(0, true);
+    rocksdb_writebatch_wi_put_cf(wbwi, handles[1], "boat", 4, "row",
+                                 3);  // should be filtered out
+    rocksdb_writebatch_wi_put_cf(wbwi, handles[1], "buffy", 5, "charmed", 7);
+    rocksdb_writebatch_wi_put_cf(wbwi, handles[1], "bus", 3, "yellow",
+                                 6);  // should be filtered out
+    rocksdb_readoptions_t* iter_roptions = rocksdb_readoptions_create();
+    rocksdb_readoptions_set_iterate_lower_bound(iter_roptions, "bu", 2);
+    rocksdb_readoptions_set_iterate_upper_bound(iter_roptions, "buffz", 5);
+    rocksdb_iterator_t* base_iter =
+        rocksdb_create_iterator_cf(db, iter_roptions, handles[1]);
+    rocksdb_iterator_t* wbwi_iter =
+        rocksdb_writebatch_wi_create_iterator_with_base_cf_readopts(
+            wbwi, base_iter, handles[1], iter_roptions);
+
+    CheckCondition(!rocksdb_iter_valid(wbwi_iter));
+    rocksdb_iter_seek_to_first(wbwi_iter);
+    CheckCondition(rocksdb_iter_valid(wbwi_iter));
+    CheckIter(wbwi_iter, "buff", "rocksdb");
+    rocksdb_iter_next(wbwi_iter);
+    CheckIter(wbwi_iter, "buffy", "charmed");
+    rocksdb_iter_next(wbwi_iter);
+    CheckCondition(!rocksdb_iter_valid(wbwi_iter));
+
+    rocksdb_iter_destroy(wbwi_iter);
+    rocksdb_writebatch_wi_destroy(wbwi);
+    rocksdb_readoptions_destroy(iter_roptions);
+
     const char* keys[3] = {"box", "box", "barfooxx"};
     const rocksdb_column_family_handle_t* get_handles[3] = {
         handles[0], handles[1], handles[1]};
@@ -1839,6 +2277,74 @@ int main(int argc, char** argv) {
       }
     }
 
+    {
+      // Test rocksdb_batched_multi_get_cf_slice for better performance
+      // Build rocksdb_slice_t array directly to avoid conversion overhead
+      rocksdb_slice_t batched_key_slices[4];
+      batched_key_slices[0].data = "box";
+      batched_key_slices[0].size = 3;
+      batched_key_slices[1].data = "buff";
+      batched_key_slices[1].size = 4;
+      batched_key_slices[2].data = "barfooxx";
+      batched_key_slices[2].size = 8;
+      batched_key_slices[3].data = "box";
+      batched_key_slices[3].size = 3;
+
+      const char* expected_value[4] = {"c", "rocksdb", NULL, "c"};
+      char* batched_errs[4];
+      rocksdb_pinnableslice_t* pvals[4];
+
+      rocksdb_batched_multi_get_cf_slice(db, roptions, handles[1], 4,
+                                         batched_key_slices, pvals,
+                                         batched_errs, false);
+
+      const char* val;
+      size_t val_len;
+      for (i = 0; i < 4; ++i) {
+        CheckNoError(batched_errs[i]);
+        if (pvals[i] != NULL) {
+          val = rocksdb_pinnableslice_value(pvals[i], &val_len);
+          CheckEqual(expected_value[i], val, val_len);
+          rocksdb_pinnableslice_destroy(pvals[i]);
+        } else {
+          CheckEqual(expected_value[i], NULL, 0);
+        }
+      }
+    }
+
+    {
+      // Test rocksdb_batched_multi_get_cf_slice with sorted_input=true
+      // Keys must be in sorted order for this optimization
+      rocksdb_slice_t sorted_key_slices[3];
+      sorted_key_slices[0].data = "box";
+      sorted_key_slices[0].size = 3;
+      sorted_key_slices[1].data = "buff";
+      sorted_key_slices[1].size = 4;
+      sorted_key_slices[2].data = "notfound";
+      sorted_key_slices[2].size = 8;
+
+      const char* expected_value[3] = {"c", "rocksdb", NULL};
+      char* batched_errs[3];
+      rocksdb_pinnableslice_t* pvals[3];
+
+      rocksdb_batched_multi_get_cf_slice(db, roptions, handles[1], 3,
+                                         sorted_key_slices, pvals, batched_errs,
+                                         true);
+
+      const char* val;
+      size_t val_len;
+      for (i = 0; i < 3; ++i) {
+        CheckNoError(batched_errs[i]);
+        if (pvals[i] != NULL) {
+          val = rocksdb_pinnableslice_value(pvals[i], &val_len);
+          CheckEqual(expected_value[i], val, val_len);
+          rocksdb_pinnableslice_destroy(pvals[i]);
+        } else {
+          CheckEqual(expected_value[i], NULL, 0);
+        }
+      }
+    }
+
     {
       unsigned char value_found = 0;
 
@@ -2129,16 +2635,20 @@ int main(int argc, char** argv) {
     CheckCondition(100000 ==
                    rocksdb_options_get_periodic_compaction_seconds(o));
 
+    rocksdb_options_set_memtable_op_scan_flush_trigger(o, 100);
+    CheckCondition(100 ==
+                   rocksdb_options_get_memtable_op_scan_flush_trigger(o));
+
+    rocksdb_options_set_memtable_avg_op_scan_flush_trigger(o, 150);
+    CheckCondition(150 ==
+                   rocksdb_options_get_memtable_avg_op_scan_flush_trigger(o));
+
     rocksdb_options_set_ttl(o, 5000);
     CheckCondition(5000 == rocksdb_options_get_ttl(o));
 
     rocksdb_options_set_skip_stats_update_on_db_open(o, 1);
     CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o));
 
-    rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(o, 1);
-    CheckCondition(
-        1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o));
-
     rocksdb_options_set_max_write_buffer_number(o, 97);
     CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o));
 
@@ -2146,10 +2656,6 @@ int main(int argc, char** argv) {
     CheckCondition(23 ==
                    rocksdb_options_get_min_write_buffer_number_to_merge(o));
 
-    rocksdb_options_set_max_write_buffer_number_to_maintain(o, 64);
-    CheckCondition(64 ==
-                   rocksdb_options_get_max_write_buffer_number_to_maintain(o));
-
     rocksdb_options_set_max_write_buffer_size_to_maintain(o, 50000);
     CheckCondition(50000 ==
                    rocksdb_options_get_max_write_buffer_size_to_maintain(o));
@@ -2402,13 +2908,9 @@ int main(int argc, char** argv) {
     CheckCondition(2.0 ==
                    rocksdb_options_get_max_bytes_for_level_multiplier(copy));
     CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(copy));
-    CheckCondition(
-        1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy));
     CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(copy));
     CheckCondition(23 ==
                    rocksdb_options_get_min_write_buffer_number_to_merge(copy));
-    CheckCondition(
-        64 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy));
     CheckCondition(50000 ==
                    rocksdb_options_get_max_write_buffer_size_to_maintain(copy));
     CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(copy));
@@ -2572,6 +3074,18 @@ int main(int argc, char** argv) {
     CheckCondition(100000 ==
                    rocksdb_options_get_periodic_compaction_seconds(o));
 
+    rocksdb_options_set_memtable_op_scan_flush_trigger(copy, 800);
+    CheckCondition(800 ==
+                   rocksdb_options_get_memtable_op_scan_flush_trigger(copy));
+    CheckCondition(100 ==
+                   rocksdb_options_get_memtable_op_scan_flush_trigger(o));
+
+    rocksdb_options_set_memtable_avg_op_scan_flush_trigger(copy, 900);
+    CheckCondition(
+        900 == rocksdb_options_get_memtable_avg_op_scan_flush_trigger(copy));
+    CheckCondition(150 ==
+                   rocksdb_options_get_memtable_avg_op_scan_flush_trigger(o));
+
     rocksdb_options_set_ttl(copy, 8000);
     CheckCondition(8000 == rocksdb_options_get_ttl(copy));
     CheckCondition(5000 == rocksdb_options_get_ttl(o));
@@ -2580,12 +3094,6 @@ int main(int argc, char** argv) {
     CheckCondition(0 == rocksdb_options_get_skip_stats_update_on_db_open(copy));
     CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o));
 
-    rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(copy, 0);
-    CheckCondition(
-        0 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy));
-    CheckCondition(
-        1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o));
-
     rocksdb_options_set_max_write_buffer_number(copy, 2000);
     CheckCondition(2000 == rocksdb_options_get_max_write_buffer_number(copy));
     CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o));
@@ -2596,12 +3104,6 @@ int main(int argc, char** argv) {
     CheckCondition(23 ==
                    rocksdb_options_get_min_write_buffer_number_to_merge(o));
 
-    rocksdb_options_set_max_write_buffer_number_to_maintain(copy, 128);
-    CheckCondition(
-        128 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy));
-    CheckCondition(64 ==
-                   rocksdb_options_get_max_write_buffer_number_to_maintain(o));
-
     rocksdb_options_set_max_write_buffer_size_to_maintain(copy, 9000);
     CheckCondition(9000 ==
                    rocksdb_options_get_max_write_buffer_size_to_maintain(copy));
@@ -3094,6 +3596,14 @@ int main(int argc, char** argv) {
         100000 ==
         rocksdb_fifo_compaction_options_get_max_table_files_size(fco));
 
+    rocksdb_fifo_compaction_options_set_max_data_files_size(fco, 200000);
+    CheckCondition(
+        200000 == rocksdb_fifo_compaction_options_get_max_data_files_size(fco));
+
+    rocksdb_fifo_compaction_options_set_use_kv_ratio_compaction(fco, 1);
+    CheckCondition(
+        1 == rocksdb_fifo_compaction_options_get_use_kv_ratio_compaction(fco));
+
     rocksdb_fifo_compaction_options_destroy(fco);
   }
 
@@ -3314,6 +3824,17 @@ int main(int argc, char** argv) {
     rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
     CheckNoError(err);
 
+    // test transaction get/set name (before commit)
+    {
+      rocksdb_transaction_set_name(txn, "test_txn", 8, &err);
+      CheckNoError(err);
+      size_t name_len;
+      char* txn_name = rocksdb_transaction_get_name(txn, &name_len);
+      CheckCondition(name_len == 8);
+      CheckCondition(memcmp(txn_name, "test_txn", 8) == 0);
+      rocksdb_free(txn_name);
+    }
+
     // read from outside transaction, before commit
     CheckTxnDBGet(txn_db, roptions, "foo", NULL);
     CheckTxnDBPinGet(txn_db, roptions, "foo", NULL);
@@ -3934,7 +4455,7 @@ int main(int argc, char** argv) {
 
   StartPhase("statistics");
   {
-    const uint32_t BYTES_WRITTEN_TICKER = 60;
+    const uint32_t BYTES_WRITTEN_TICKER = 61;
     const uint32_t DB_WRITE_HIST = 1;
 
     rocksdb_statistics_histogram_data_t* hist =
@@ -4052,6 +4573,313 @@ int main(int argc, char** argv) {
     rocksdb_cache_destroy(lru);
   }
 
+  StartPhase("remote_compaction_service");
+  {
+    RemoteCompactionState remote_state = {0, 0, 0, "", ""};
+
+    // Create compaction service
+    rocksdb_compactionservice_t* service = rocksdb_compactionservice_create(
+        &remote_state,             // state
+        RemoteCompactionDestroy,   // destructor
+        RemoteCompactionSchedule,  // schedule callback
+        "TestRemoteCompaction",    // name
+        RemoteCompactionWait,      // wait callback
+        RemoteCompactionCancel,    // cancel_awaiting_jobs
+        NULL);                     // on_installation
+
+    // Create options with remote compaction
+    rocksdb_options_t* remote_options = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(remote_options, 1);
+    rocksdb_options_set_level0_file_num_compaction_trigger(remote_options, 2);
+    rocksdb_options_set_write_buffer_size(remote_options,
+                                          64 * 1024);  // 64KB buffer
+    rocksdb_options_set_max_bytes_for_level_base(remote_options,
+                                                 256 * 1024);  // 256KB
+    rocksdb_options_set_target_file_size_base(
+        remote_options, 64 * 1024);  // 64KB target file size
+    // Disable automatic compactions to test manual compaction only
+    rocksdb_options_set_disable_auto_compactions(remote_options, 1);
+    rocksdb_options_set_compaction_service(remote_options, service);
+
+    // Destroy old DB and create new one
+    rocksdb_close(db);
+    rocksdb_destroy_db(remote_options, dbname, &err);
+    CheckNoError(err);
+
+    db = rocksdb_open(remote_options, dbname, &err);
+    CheckNoError(err);
+
+    // Create multiple SST files to trigger compaction
+    rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create();
+    rocksdb_flushoptions_set_wait(flush_opts, 1);
+
+    // Write and flush multiple times to create multiple L0 files
+    // Write more data with larger values to ensure files are substantial
+    for (int batch = 0; batch < 5; batch++) {
+      for (int i = 0; i < 200; i++) {
+        char key[20], val[1000];
+        snprintf(key, sizeof(key), "key%d_%d", batch, i);
+        // Fill value with repeated data to make it larger
+        memset(val, 'a' + (batch % 26), sizeof(val) - 1);
+        val[sizeof(val) - 1] = '\0';
+        rocksdb_put(db, woptions, key, strlen(key), val, strlen(val), &err);
+        CheckNoError(err);
+      }
+      rocksdb_flush(db, flush_opts, &err);
+      CheckNoError(err);
+    }
+    rocksdb_flushoptions_destroy(flush_opts);
+
+    // Trigger manual compaction to invoke remote compaction service
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+    rocksdb_wait_for_compact_options_t* wco =
+        rocksdb_wait_for_compact_options_create();
+    rocksdb_wait_for_compact(db, wco, &err);
+    CheckNoError(err);
+    rocksdb_wait_for_compact_options_destroy(wco);
+
+    // Verify that callbacks were actually called
+    CheckCondition(remote_state.schedule_called > 0);
+    CheckCondition(remote_state.wait_called > 0);
+    CheckCondition(strlen(remote_state.last_db_name) > 0);
+    CheckCondition(strstr(remote_state.last_db_name, "rocksdb_c_test") != NULL);
+
+    // Verify data is still accessible after remote compaction
+    // Just check a few keys to verify data integrity
+    for (int batch = 0; batch < 5; batch++) {
+      char key[20];
+      snprintf(key, sizeof(key), "key%d_0", batch);
+      size_t vallen;
+      char* val = rocksdb_get(db, roptions, key, strlen(key), &vallen, &err);
+      CheckNoError(err);
+      CheckCondition(val != NULL);
+      CheckCondition(vallen == 999);  // strlen of 1000-byte string
+      free(val);
+    }
+
+    // Test cancellation API directly
+    RemoteCompactionCancel(&remote_state);
+    CheckCondition(remote_state.cancel_called > 0);
+
+    // Cleanup
+    rocksdb_close(db);
+    rocksdb_destroy_db(remote_options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_options_destroy(remote_options);
+
+    // Reopen DB with original options for subsequent tests
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+  }
+
+  StartPhase("remote_compaction_scheduleresponse");
+  {
+    // Test scheduleresponse creation and getters
+    rocksdb_compactionservice_scheduleresponse_t* response;
+
+    // Test success response
+    err = NULL;
+    response = rocksdb_compactionservice_scheduleresponse_create(
+        "test-job-123", rocksdb_compactionservice_jobstatus_success, &err);
+    CheckNoError(err);
+    CheckCondition(response != NULL);
+    CheckCondition(
+        rocksdb_compactionservice_scheduleresponse_getstatus(response) ==
+        rocksdb_compactionservice_jobstatus_success);
+
+    size_t job_id_len;
+    const char* job_id =
+        rocksdb_compactionservice_scheduleresponse_get_scheduled_job_id(
+            response, &job_id_len);
+    CheckCondition(job_id_len == strlen("test-job-123"));
+    CheckCondition(memcmp(job_id, "test-job-123", job_id_len) == 0);
+    rocksdb_compactionservice_scheduleresponse_t_destroy(response);
+
+    // Test failure response
+    response = rocksdb_compactionservice_scheduleresponse_create_with_status(
+        rocksdb_compactionservice_jobstatus_failure, &err);
+    CheckCondition(response != NULL);
+    CheckCondition(
+        rocksdb_compactionservice_scheduleresponse_getstatus(response) ==
+        rocksdb_compactionservice_jobstatus_failure);
+    rocksdb_compactionservice_scheduleresponse_t_destroy(response);
+
+    response = rocksdb_compactionservice_scheduleresponse_create_with_status(
+        999, &err);
+    CheckCondition(response == NULL);  // Invalid status
+    if (err) {
+      Free(&err);
+    }
+  }
+
+  StartPhase("remote_compaction_options_override");
+  {
+    // Test CompactionServiceOptionsOverride API
+    rocksdb_compaction_service_options_override_t* override_opts =
+        rocksdb_compaction_service_options_override_create();
+    CheckCondition(override_opts != NULL);
+
+    // Set up override options
+    rocksdb_compaction_service_options_override_set_env(override_opts, env);
+    rocksdb_compaction_service_options_override_set_comparator(override_opts,
+                                                               cmp);
+
+    // Test file checksum gen factory
+    rocksdb_file_checksum_gen_factory_t* checksum_factory =
+        rocksdb_file_checksum_gen_crc32c_factory_create();
+    CheckCondition(checksum_factory != NULL);
+    rocksdb_compaction_service_options_override_set_file_checksum_gen_factory(
+        override_opts, checksum_factory);
+
+    // Test SST partitioner factory
+    rocksdb_sst_partitioner_factory_t* partitioner_factory =
+        rocksdb_sst_partitioner_fixed_prefix_factory_create(4);
+    CheckCondition(partitioner_factory != NULL);
+    rocksdb_compaction_service_options_override_set_sst_partitioner_factory(
+        override_opts, partitioner_factory);
+
+    // Test merge operator
+    rocksdb_compaction_service_options_override_set_merge_operator(
+        override_opts, NULL);
+
+    // Test compaction filter
+    rocksdb_compaction_service_options_override_set_compaction_filter(
+        override_opts, NULL);
+
+    // Test prefix extractor
+    rocksdb_compaction_service_options_override_set_prefix_extractor(
+        override_opts, NULL);
+
+    // Test table factory - block based
+    rocksdb_block_based_table_options_t* table_opts =
+        rocksdb_block_based_options_create();
+    rocksdb_compaction_service_options_override_set_block_based_table_factory(
+        override_opts, table_opts);
+    rocksdb_block_based_options_destroy(table_opts);
+
+    // Test statistics via options
+    rocksdb_options_t* stats_opts = rocksdb_options_create();
+    rocksdb_options_enable_statistics(stats_opts);
+    rocksdb_compaction_service_options_override_set_statistics(override_opts,
+                                                               stats_opts);
+    rocksdb_options_destroy(stats_opts);
+
+    // Test info log
+    rocksdb_logger_t* logger =
+        rocksdb_logger_create_stderr_logger(1, "test_prefix");
+    rocksdb_compaction_service_options_override_set_info_log(override_opts,
+                                                             logger);
+    rocksdb_logger_destroy(logger);
+
+    // Test options map
+    rocksdb_compaction_service_options_override_set_option(
+        override_opts, "max_bytes_for_level_base", "67108864");
+
+    // Cleanup
+    rocksdb_file_checksum_gen_factory_destroy(checksum_factory);
+    rocksdb_sst_partitioner_factory_destroy(partitioner_factory);
+    rocksdb_compaction_service_options_override_destroy(override_opts);
+  }
+
+  StartPhase("factory_options_on_regular_options");
+  {
+    // Test that the new factory types work with regular rocksdb_options_t
+    rocksdb_options_t* test_opts = rocksdb_options_create();
+
+    // Test file checksum gen factory on regular options
+    rocksdb_file_checksum_gen_factory_t* checksum_factory =
+        rocksdb_file_checksum_gen_crc32c_factory_create();
+    CheckCondition(checksum_factory != NULL);
+    rocksdb_options_set_file_checksum_gen_factory(test_opts, checksum_factory);
+
+    // Test SST partitioner factory on regular options
+    rocksdb_sst_partitioner_factory_t* partitioner_factory =
+        rocksdb_sst_partitioner_fixed_prefix_factory_create(8);
+    CheckCondition(partitioner_factory != NULL);
+    rocksdb_options_set_sst_partitioner_factory(test_opts, partitioner_factory);
+
+    // Cleanup
+    rocksdb_file_checksum_gen_factory_destroy(checksum_factory);
+    rocksdb_sst_partitioner_factory_destroy(partitioner_factory);
+    rocksdb_options_destroy(test_opts);
+  }
+
+  StartPhase("remote_compaction_null_callback_handling");
+  {
+    // Test that NULL callback returns are handled gracefully
+    // This simulates a failure in the remote compaction service
+    rocksdb_compactionservice_t* null_service =
+        rocksdb_compactionservice_create(NULL, NULL, NullSchedule,
+                                         "NullTestService", NULL, NULL, NULL);
+
+    rocksdb_options_t* null_opts = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(null_opts, 1);
+    rocksdb_options_set_compaction_service(null_opts, null_service);
+
+    const char* null_db = "rocksdb_c_test_null_service";
+
+    rocksdb_t* null_db_handle = rocksdb_open(null_opts, null_db, &err);
+    CheckNoError(err);
+
+    // Write data and trigger compaction
+    for (int i = 0; i < 100; i++) {
+      char key[20], val[50];
+      snprintf(key, sizeof(key), "key%d", i);
+      snprintf(val, sizeof(val), "val%d", i);
+      rocksdb_put(null_db_handle, woptions, key, strlen(key), val, strlen(val),
+                  &err);
+      CheckNoError(err);
+    }
+
+    // This should fall back to local compaction (not crash)
+    rocksdb_compact_range(null_db_handle, NULL, 0, NULL, 0);
+
+    // Data should still be readable
+    CheckGet(null_db_handle, roptions, "key50", "val50");
+
+    rocksdb_close(null_db_handle);
+    rocksdb_destroy_db(null_opts, null_db, &err);
+    rocksdb_options_destroy(null_opts);
+  }
+
+  StartPhase("remote_compaction_canceled_flag");
+  {
+    // Test atomic cancellation flag API
+    unsigned char* canceled = rocksdb_open_and_compact_canceled_create();
+    CheckCondition(canceled != NULL);
+
+    // Set cancellation
+    rocksdb_open_and_compact_canceled_set(canceled, 1);
+
+    // Use with OpenAndCompactOptions
+    rocksdb_open_and_compact_options_t* oac_opts =
+        rocksdb_open_and_compact_options_create();
+    rocksdb_open_and_compact_options_set_canceled(oac_opts, canceled);
+    rocksdb_open_and_compact_options_set_allow_resumption(oac_opts, 1);
+
+    // Cleanup
+    rocksdb_open_and_compact_options_destroy(oac_opts);
+    rocksdb_open_and_compact_canceled_destroy(canceled);
+  }
+
+  StartPhase("sst_file_manager");
+  {
+    rocksdb_sst_file_manager_t* sst_file_manager;
+    sst_file_manager = rocksdb_sst_file_manager_create(env);
+    rocksdb_sst_file_manager_set_delete_rate_bytes_per_second(sst_file_manager,
+                                                              1);
+    rocksdb_sst_file_manager_set_max_trash_db_ratio(sst_file_manager, 0.75);
+
+    CheckCondition(1 ==
+                   rocksdb_sst_file_manager_get_delete_rate_bytes_per_second(
+                       sst_file_manager));
+    CheckCondition(0.75 == rocksdb_sst_file_manager_get_max_trash_db_ratio(
+                               sst_file_manager));
+
+    rocksdb_sst_file_manager_destroy(sst_file_manager);
+  }
+
   StartPhase("cancel_all_background_work");
   rocksdb_cancel_all_background_work(db, 1);
 
diff --git a/db/column_family.cc b/db/column_family.cc
index 2c1ad930ab01..bbf9f8210b31 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -110,23 +110,48 @@ void GetInternalTblPropCollFactory(
   }
 }
 
+Status CheckCompressionSupportedWithManager(
+    CompressionType type, UnownedPtr<CompressionManager> mgr) {
+  if (mgr) {
+    if (!mgr->SupportsCompressionType(type)) {
+      return Status::NotSupported("Compression type " +
+                                  CompressionTypeToString(type) +
+                                  " is not recognized/supported by this "
+                                  "version of CompressionManager " +
+                                  mgr->GetId());
+    }
+  } else {
+    if (!CompressionTypeSupported(type)) {
+      if (type <= kLastBuiltinCompression) {
+        return Status::InvalidArgument("Compression type " +
+                                       CompressionTypeToString(type) +
+                                       " is not linked with the binary.");
+      } else {
+        return Status::NotSupported(
+            "Compression type " + CompressionTypeToString(type) +
+            " is not recognized/supported by built-in CompressionManager.");
+      }
+    }
+  }
+  return Status::OK();
+}
+
 Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
   if (!cf_options.compression_per_level.empty()) {
     for (size_t level = 0; level < cf_options.compression_per_level.size();
          ++level) {
-      if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
-        return Status::InvalidArgument(
-            "Compression type " +
-            CompressionTypeToString(cf_options.compression_per_level[level]) +
-            " is not linked with the binary.");
+      Status s = CheckCompressionSupportedWithManager(
+          cf_options.compression_per_level[level],
+          cf_options.compression_manager.get());
+      if (!s.ok()) {
+        return s;
       }
     }
   } else {
-    if (!CompressionTypeSupported(cf_options.compression)) {
-      return Status::InvalidArgument(
-          "Compression type " +
-          CompressionTypeToString(cf_options.compression) +
-          " is not linked with the binary.");
+    Status s = CheckCompressionSupportedWithManager(
+        cf_options.compression, cf_options.compression_manager.get());
+    if (!s.ok()) {
+      return s;
     }
   }
   if (cf_options.compression_opts.zstd_max_train_bytes > 0) {
@@ -168,7 +193,8 @@ Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) {
   }
   if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) {
     return Status::InvalidArgument(
-        "Memtable doesn't allow concurrent writes (allow_concurrent_memtable_write)");
+        "Memtable doesn't allow concurrent writes "
+        "(allow_concurrent_memtable_write)");
   }
   return Status::OK();
 }
@@ -199,8 +225,9 @@ const uint64_t kDefaultTtl = 0xfffffffffffffffe;
 const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe;
 }  // anonymous namespace
 
-ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
-                                    const ColumnFamilyOptions& src) {
+ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options,
+                                      bool read_only,
+                                      const ColumnFamilyOptions& src) {
   ColumnFamilyOptions result = src;
   size_t clamp_max = std::conditional<
       sizeof(size_t) == 4, std::integral_constant<size_t, 0xffffffff>,
@@ -239,6 +266,10 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
 
     result.min_write_buffer_number_to_merge = 1;
   }
+  if (result.disallow_memtable_writes) {
+    // A simple memtable that enforces MarkReadOnly (unlike skip list)
+    result.memtable_factory = std::make_shared<VectorRepFactory>();
+  }
 
   if (result.num_levels < 1) {
     result.num_levels = 1;
@@ -249,22 +280,18 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
   }
 
   if (result.compaction_style == kCompactionStyleUniversal &&
-      db_options.allow_ingest_behind && result.num_levels < 3) {
+      (db_options.allow_ingest_behind || result.cf_allow_ingest_behind) &&
+      result.num_levels < 3) {
     result.num_levels = 3;
   }
 
   if (result.max_write_buffer_number < 2) {
     result.max_write_buffer_number = 2;
   }
-  // fall back max_write_buffer_number_to_maintain if
-  // max_write_buffer_size_to_maintain is not set
   if (result.max_write_buffer_size_to_maintain < 0) {
     result.max_write_buffer_size_to_maintain =
         result.max_write_buffer_number *
         static_cast<int64_t>(result.write_buffer_size);
-  } else if (result.max_write_buffer_size_to_maintain == 0 &&
-             result.max_write_buffer_number_to_maintain < 0) {
-    result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
   }
   // bloom filter size shouldn't exceed 1/4 of memtable size.
   if (result.memtable_prefix_bloom_size_ratio > 0.25) {
@@ -374,7 +401,13 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
   }
 
   if (result.max_compaction_bytes == 0) {
-    result.max_compaction_bytes = result.target_file_size_base * 25;
+    // For FIFO with use_kv_ratio_compaction, leave max_compaction_bytes as 0
+    // to signal "auto-calculate target from capacity and SST/blob ratio."
+    // When explicitly set by the user, it overrides the auto-calculated target.
+    if (result.compaction_style != kCompactionStyleFIFO ||
+        !result.compaction_options_fifo.use_kv_ratio_compaction) {
+      result.max_compaction_bytes = result.target_file_size_base * 25;
+    }
   }
 
   bool is_block_based_table = (result.table_factory->IsInstanceOf(
@@ -435,6 +468,33 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
     result.periodic_compaction_seconds = 0;
   }
 
+  if (read_only && (result.preserve_internal_time_seconds > 0 ||
+                    result.preclude_last_level_data_seconds > 0)) {
+    // With no writes coming in, we don't need periodic SeqnoToTime entries.
+    // Existing SST files may or may not have that info associated with them.
+    ROCKS_LOG_WARN(
+        db_options.info_log.get(),
+        "preserve_internal_time_seconds and preclude_last_level_data_seconds "
+        "are ignored in read-only DB");
+    result.preserve_internal_time_seconds = 0;
+    result.preclude_last_level_data_seconds = 0;
+  }
+
+  if (read_only) {
+    if (result.memtable_op_scan_flush_trigger) {
+      ROCKS_LOG_WARN(db_options.info_log.get(),
+                     "option memtable_op_scan_flush_trigger is sanitized to "
+                     "0(disabled) for read only DB.");
+      result.memtable_op_scan_flush_trigger = 0;
+    }
+    if (result.memtable_avg_op_scan_flush_trigger) {
+      ROCKS_LOG_WARN(
+          db_options.info_log.get(),
+          "option memtable_avg_op_scan_flush_trigger is sanitized to "
+          "0(disabled) for read only DB.");
+      result.memtable_avg_op_scan_flush_trigger = 0;
+    }
+  }
   return result;
 }
 
@@ -492,6 +552,17 @@ void SuperVersion::Init(
   imm->Ref();
   current->Ref();
   refs.store(1, std::memory_order_relaxed);
+
+  // There should be at least one mapping entry iff time tracking is enabled.
+#ifndef NDEBUG
+  MinAndMaxPreserveSeconds preserve_info{mutable_cf_options};
+  if (preserve_info.IsEnabled()) {
+    assert(seqno_to_time_mapping);
+    assert(!seqno_to_time_mapping->Empty());
+  } else {
+    assert(seqno_to_time_mapping == nullptr);
+  }
+#endif  // NDEBUG
 }
 
 namespace {
@@ -530,7 +601,7 @@ ColumnFamilyData::ColumnFamilyData(
     const FileOptions* file_options, ColumnFamilySet* column_family_set,
     BlockCacheTracer* const block_cache_tracer,
     const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
-    const std::string& db_session_id)
+    const std::string& db_session_id, bool read_only)
     : id_(id),
       name_(name),
       dummy_versions_(_dummy_versions),
@@ -540,7 +611,7 @@ ColumnFamilyData::ColumnFamilyData(
       dropped_(false),
       flush_skip_reschedule_(false),
       internal_comparator_(cf_options.comparator),
-      initial_cf_options_(SanitizeOptions(db_options, cf_options)),
+      initial_cf_options_(SanitizeCfOptions(db_options, read_only, cf_options)),
       ioptions_(db_options, initial_cf_options_),
       mutable_cf_options_(initial_cf_options_),
       is_delete_range_supported_(
@@ -548,7 +619,6 @@ ColumnFamilyData::ColumnFamilyData(
       write_buffer_manager_(write_buffer_manager),
       mem_(nullptr),
       imm_(ioptions_.min_write_buffer_number_to_merge,
-           ioptions_.max_write_buffer_number_to_maintain,
            ioptions_.max_write_buffer_size_to_maintain),
       super_version_(nullptr),
       super_version_number_(0),
@@ -1179,10 +1249,12 @@ Compaction* ColumnFamilyData::PickCompaction(
     const MutableCFOptions& mutable_options,
     const MutableDBOptions& mutable_db_options,
     const std::vector<SequenceNumber>& existing_snapshots,
-    const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer) {
+    const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+    bool require_max_output_level) {
   auto* result = compaction_picker_->PickCompaction(
       GetName(), mutable_options, mutable_db_options, existing_snapshots,
-      snapshot_checker, current_->storage_info(), log_buffer);
+      snapshot_checker, current_->storage_info(), log_buffer,
+      GetFullHistoryTsLow(), require_max_output_level);
   if (result != nullptr) {
     result->FinalizeInputInfo(current_);
   }
@@ -1266,11 +1338,11 @@ Compaction* ColumnFamilyData::CompactRange(
     const InternalKey* begin, const InternalKey* end,
     InternalKey** compaction_end, bool* conflict,
     uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
-  auto* result = compaction_picker_->CompactRange(
+  auto* result = compaction_picker_->PickCompactionForCompactRange(
       GetName(), mutable_cf_options, mutable_db_options,
       current_->storage_info(), input_level, output_level,
       compact_range_options, begin, end, compaction_end, conflict,
-      max_file_num_to_ignore, trim_ts);
+      max_file_num_to_ignore, trim_ts, GetFullHistoryTsLow());
   if (result != nullptr) {
     result->FinalizeInputInfo(current_);
   }
@@ -1339,20 +1411,17 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
   return false;
 }
 
-void ColumnFamilyData::InstallSuperVersion(SuperVersionContext* sv_context,
-                                           InstrumentedMutex* db_mutex) {
+void ColumnFamilyData::InstallSuperVersion(
+    SuperVersionContext* sv_context, InstrumentedMutex* db_mutex,
+    std::optional<std::shared_ptr<SeqnoToTimeMapping>>
+        new_seqno_to_time_mapping) {
   db_mutex->AssertHeld();
-  return InstallSuperVersion(sv_context, mutable_cf_options_);
-}
 
-void ColumnFamilyData::InstallSuperVersion(
-    SuperVersionContext* sv_context,
-    const MutableCFOptions& mutable_cf_options) {
   SuperVersion* new_superversion = sv_context->new_superversion.release();
-  new_superversion->mutable_cf_options = mutable_cf_options;
+  new_superversion->mutable_cf_options = GetLatestMutableCFOptions();
   new_superversion->Init(this, mem_, imm_.current(), current_,
-                         sv_context->new_seqno_to_time_mapping
-                             ? std::move(sv_context->new_seqno_to_time_mapping)
+                         new_seqno_to_time_mapping.has_value()
+                             ? std::move(new_seqno_to_time_mapping.value())
                          : super_version_
                              ? super_version_->ShareSeqnoToTimeMapping()
                              : nullptr);
@@ -1365,7 +1434,7 @@ void ColumnFamilyData::InstallSuperVersion(
     // currently RecalculateWriteStallConditions() treats it as further slowing
     // down is needed.
     super_version_->write_stall_condition =
-        RecalculateWriteStallConditions(mutable_cf_options);
+        RecalculateWriteStallConditions(new_superversion->mutable_cf_options);
   } else {
     super_version_->write_stall_condition =
         old_superversion->write_stall_condition;
@@ -1378,8 +1447,9 @@ void ColumnFamilyData::InstallSuperVersion(
     ResetThreadLocalSuperVersions();
 
     if (old_superversion->mutable_cf_options.write_buffer_size !=
-        mutable_cf_options.write_buffer_size) {
-      mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
+        new_superversion->mutable_cf_options.write_buffer_size) {
+      mem_->UpdateWriteBufferSize(
+          new_superversion->mutable_cf_options.write_buffer_size);
     }
     if (old_superversion->write_stall_condition !=
         new_superversion->write_stall_condition) {
@@ -1499,6 +1569,34 @@ Status ColumnFamilyData::ValidateOptions(
         "FIFO compaction only supported with max_open_files = -1.");
   }
 
+  if (cf_options.compaction_options_fifo.use_kv_ratio_compaction) {
+    if (cf_options.compaction_style != kCompactionStyleFIFO) {
+      return Status::InvalidArgument(
+          "use_kv_ratio_compaction is only supported with FIFO compaction "
+          "style.");
+    }
+    if (!cf_options.compaction_options_fifo.allow_compaction) {
+      return Status::InvalidArgument(
+          "use_kv_ratio_compaction requires allow_compaction = true. "
+          "allow_compaction enables intra-L0 compaction, and "
+          "use_kv_ratio_compaction selects the picking strategy.");
+    }
+    if (cf_options.compaction_options_fifo.max_data_files_size == 0) {
+      return Status::InvalidArgument(
+          "use_kv_ratio_compaction requires max_data_files_size > 0 to "
+          "compute the target compacted file size from data capacity.");
+    }
+  }
+
+  if (cf_options.compaction_options_fifo.max_data_files_size > 0 &&
+      cf_options.compaction_options_fifo.max_data_files_size <
+          cf_options.compaction_options_fifo.max_table_files_size) {
+    return Status::InvalidArgument(
+        "max_data_files_size (total data = SST + blob) must be >= "
+        "max_table_files_size (SST only) when non-zero, since total data "
+        "always includes SST data.");
+  }
+
   std::vector<uint32_t> supported{0, 1, 2, 4, 8};
   if (std::find(supported.begin(), supported.end(),
                 cf_options.memtable_protection_bytes_per_key) ==
@@ -1570,6 +1668,8 @@ Status ColumnFamilyData::SetOptions(
   Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map,
                                            &cf_opts);
   if (s.ok()) {
+    // FIXME: we should call SanitizeOptions() too or consolidate it with
+    // ValidateOptions().
     s = ValidateOptions(db_opts, cf_opts);
   }
   if (s.ok()) {
@@ -1680,7 +1780,8 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
       dummy_cfd_(new ColumnFamilyData(
           ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr,
           nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr,
-          block_cache_tracer, io_tracer, db_id, db_session_id)),
+          block_cache_tracer, io_tracer, db_id, db_session_id,
+          /*read_only*/ true)),
       default_cfd_cache_(nullptr),
       db_name_(dbname),
       db_options_(db_options),
@@ -1752,12 +1853,12 @@ size_t ColumnFamilySet::NumberOfColumnFamilies() const {
 // under a DB mutex AND write thread
 ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
     const std::string& name, uint32_t id, Version* dummy_versions,
-    const ColumnFamilyOptions& options) {
+    const ColumnFamilyOptions& options, bool read_only) {
   assert(column_families_.find(name) == column_families_.end());
   ColumnFamilyData* new_cfd = new ColumnFamilyData(
       id, name, dummy_versions, table_cache_, write_buffer_manager_, options,
       *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_,
-      db_id_, db_session_id_);
+      db_id_, db_session_id_, read_only);
   column_families_.insert({name, id});
   column_family_data_.insert({id, new_cfd});
   auto ucmp = new_cfd->user_comparator();
diff --git a/db/column_family.h b/db/column_family.h
index 51ad803b9002..60b3f15fa6c0 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -281,8 +281,9 @@ Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options);
 Status CheckCFPathsSupported(const DBOptions& db_options,
                              const ColumnFamilyOptions& cf_options);
 
-ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
-                                    const ColumnFamilyOptions& src);
+ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options,
+                                      bool read_only,
+                                      const ColumnFamilyOptions& src);
 // Wrap user defined table properties collector factories `from cf_options`
 // into internal ones in internal_tbl_prop_coll_factories. Add a system internal
 // one too.
@@ -384,14 +385,17 @@ class ColumnFamilyData {
   Version* dummy_versions() { return dummy_versions_; }
   Version* current() { return current_; }  // REQUIRE: DB mutex held
   void SetCurrent(Version* _current);      // REQUIRE: DB mutex held
-  uint64_t GetNumLiveVersions() const;    // REQUIRE: DB mutex held
-  uint64_t GetTotalSstFilesSize() const;  // REQUIRE: DB mutex held
-  uint64_t GetLiveSstFilesSize() const;   // REQUIRE: DB mutex held
-  uint64_t GetTotalBlobFileSize() const;  // REQUIRE: DB mutex held
+  uint64_t GetNumLiveVersions() const;     // REQUIRE: DB mutex held
+  uint64_t GetTotalSstFilesSize() const;   // REQUIRE: DB mutex held
+  uint64_t GetLiveSstFilesSize() const;    // REQUIRE: DB mutex held
+  uint64_t GetTotalBlobFileSize() const;   // REQUIRE: DB mutex held
   // REQUIRE: DB mutex held
   void SetMemtable(MemTable* new_mem) {
     AssignMemtableID(new_mem);
     mem_ = new_mem;
+    if (ioptions_.disallow_memtable_writes) {
+      mem_->MarkImmutable();
+    }
   }
 
   void AssignMemtableID(ReadOnlyMemTable* new_imm) {
@@ -420,7 +424,8 @@ class ColumnFamilyData {
       const MutableCFOptions& mutable_options,
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& existing_snapshots,
-      const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer);
+      const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+      bool require_max_output_level = false);
 
   // Check if the passed range overlap with any running compactions.
   // REQUIRES: DB mutex held
@@ -487,15 +492,11 @@ class ColumnFamilyData {
   uint64_t GetSuperVersionNumberRelaxed() const {
     return super_version_number_.load(std::memory_order_relaxed);
   }
-  // will return a pointer to SuperVersion* if previous SuperVersion
-  // if its reference count is zero and needs deletion or nullptr if not
-  // As argument takes a pointer to allocated SuperVersion to enable
-  // the clients to allocate SuperVersion outside of mutex.
-  // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
-  void InstallSuperVersion(SuperVersionContext* sv_context,
-                           const MutableCFOptions& mutable_cf_options);
+  // Only intended for use by DBImpl::InstallSuperVersion() and variants
   void InstallSuperVersion(SuperVersionContext* sv_context,
-                           InstrumentedMutex* db_mutex);
+                           InstrumentedMutex* db_mutex,
+                           std::optional<std::shared_ptr<SeqnoToTimeMapping>>
+                               new_seqno_to_time_mapping = {});
 
   void ResetThreadLocalSuperVersions();
 
@@ -537,6 +538,12 @@ class ColumnFamilyData {
     assert(!ts_low.empty());
     const Comparator* ucmp = user_comparator();
     assert(ucmp);
+    // Guard against resurrected full_history_ts_low persisted in MANIFEST
+    // from previous DB sessions. This could happen if UDT was enabled and then
+    // disabled.
+    if (ucmp->timestamp_size() == 0) {
+      return;
+    }
     if (full_history_ts_low_.empty() ||
         ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) {
       full_history_ts_low_ = std::move(ts_low);
@@ -544,6 +551,11 @@ class ColumnFamilyData {
   }
 
   const std::string& GetFullHistoryTsLow() const {
+    const Comparator* ucmp = user_comparator();
+    assert(ucmp);
+    if (ucmp->timestamp_size() == 0) {
+      assert(full_history_ts_low_.empty());
+    }
     return full_history_ts_low_;
   }
 
@@ -588,18 +600,21 @@ class ColumnFamilyData {
     return (mem_->IsEmpty() ? 0 : 1) + imm_.NumNotFlushed();
   }
 
+  // thread-safe, DB mutex not needed.
+  bool AllowIngestBehind() const {
+    return ioptions_.cf_allow_ingest_behind || ioptions_.allow_ingest_behind;
+  }
+
  private:
   friend class ColumnFamilySet;
-  ColumnFamilyData(uint32_t id, const std::string& name,
-                   Version* dummy_versions, Cache* table_cache,
-                   WriteBufferManager* write_buffer_manager,
-                   const ColumnFamilyOptions& options,
-                   const ImmutableDBOptions& db_options,
-                   const FileOptions* file_options,
-                   ColumnFamilySet* column_family_set,
-                   BlockCacheTracer* const block_cache_tracer,
-                   const std::shared_ptr<IOTracer>& io_tracer,
-                   const std::string& db_id, const std::string& db_session_id);
+  ColumnFamilyData(
+      uint32_t id, const std::string& name, Version* dummy_versions,
+      Cache* table_cache, WriteBufferManager* write_buffer_manager,
+      const ColumnFamilyOptions& options, const ImmutableDBOptions& db_options,
+      const FileOptions* file_options, ColumnFamilySet* column_family_set,
+      BlockCacheTracer* const block_cache_tracer,
+      const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
+      const std::string& db_session_id, bool read_only);
 
   std::vector<std::string> GetDbPaths() const;
 
@@ -761,7 +776,8 @@ class ColumnFamilySet {
 
   ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
                                        Version* dummy_version,
-                                       const ColumnFamilyOptions& options);
+                                       const ColumnFamilyOptions& options,
+                                       bool read_only);
 
   const UnorderedMap<uint32_t, size_t>& GetRunningColumnFamiliesTimestampSize()
       const {
diff --git a/db/column_family_test.cc b/db/column_family_test.cc
index 29ff2d15adbf..7cb505179c38 100644
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@@ -72,7 +72,6 @@ class ColumnFamilyTestBase : public testing::Test {
     env_->skip_fsync_ = true;
     dbname_ = test::PerThreadDBPath("column_family_test");
     db_options_.create_if_missing = true;
-    db_options_.fail_if_options_file_error = true;
     db_options_.env = env_;
   }
 
@@ -119,8 +118,7 @@ class ColumnFamilyTestBase : public testing::Test {
 
     for (int i = 0; i < n; i++) {
       if (flush_every != 0 && i != 0 && i % flush_every == 0) {
-        DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-        dbi->TEST_FlushMemTable();
+        dbfull()->TEST_FlushMemTable();
       }
 
       int keyi = base + i;
@@ -178,8 +176,7 @@ class ColumnFamilyTestBase : public testing::Test {
     }
     handles_.clear();
     names_.clear();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
   }
 
   Status TryOpen(std::vector<std::string> cf,
@@ -219,7 +216,7 @@ class ColumnFamilyTestBase : public testing::Test {
 
   void Open() { Open({"default"}); }
 
-  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   int GetProperty(int cf, std::string property) {
     std::string value;
@@ -271,7 +268,8 @@ class ColumnFamilyTestBase : public testing::Test {
       // them.
       ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
           ConfigOptions(), desc.options,
-          SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt)));
+          SanitizeCfOptions(dbfull()->immutable_db_options(),
+                            /*read_only*/ false, current_cf_opt)));
       cfi++;
     }
   }
@@ -500,7 +498,7 @@ class ColumnFamilyTestBase : public testing::Test {
   ColumnFamilyOptions column_family_options_;
   DBOptions db_options_;
   std::string dbname_;
-  DB* db_ = nullptr;
+  std::unique_ptr<DB> db_;
   EnvCounter* env_;
   std::shared_ptr<Env> env_guard_;
   Random rnd_;
@@ -517,7 +515,7 @@ class ColumnFamilyTest
 INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest,
                         testing::Values(test::kDefaultFormatVersion));
 INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest,
-                        testing::Values(kLatestFormatVersion));
+                        testing::Values(kLatestBbtFormatVersion));
 
 TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
   for (int iter = 0; iter < 3; ++iter) {
@@ -707,8 +705,8 @@ INSTANTIATE_TEST_CASE_P(
                     std::make_tuple(test::kDefaultFormatVersion, false)));
 INSTANTIATE_TEST_CASE_P(
     FormatLatest, FlushEmptyCFTestWithParam,
-    testing::Values(std::make_tuple(kLatestFormatVersion, true),
-                    std::make_tuple(kLatestFormatVersion, false)));
+    testing::Values(std::make_tuple(kLatestBbtFormatVersion, true),
+                    std::make_tuple(kLatestBbtFormatVersion, false)));
 
 TEST_P(ColumnFamilyTest, AddDrop) {
   Open();
@@ -2175,7 +2173,7 @@ TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
   ASSERT_TRUE(has_cf2_sst);
 
   ASSERT_OK(Flush(0));
-  ASSERT_EQ(0, dbfull()->TEST_total_log_size());
+  ASSERT_EQ(0, dbfull()->TEST_wals_total_size());
   Close();
 }
 
@@ -2232,7 +2230,7 @@ TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) {
   ASSERT_EQ(my_fs->options_files_created.load(), 2);
 }
 
-TEST_P(ColumnFamilyTest, SanitizeOptions) {
+TEST_P(ColumnFamilyTest, SanitizeCfOptions) {
   DBOptions db_options;
   for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) {
     for (int l = 0; l <= 2; l++) {
@@ -2248,8 +2246,8 @@ TEST_P(ColumnFamilyTest, SanitizeOptions) {
             original.write_buffer_size =
                 l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k;
 
-            ColumnFamilyOptions result =
-                SanitizeOptions(ImmutableDBOptions(db_options), original);
+            ColumnFamilyOptions result = SanitizeCfOptions(
+                ImmutableDBOptions(db_options), /*read_only*/ false, original);
             ASSERT_TRUE(result.level0_stop_writes_trigger >=
                         result.level0_slowdown_writes_trigger);
             ASSERT_TRUE(result.level0_slowdown_writes_trigger >=
@@ -3542,11 +3540,10 @@ TEST_P(ColumnFamilyTest, MultipleCFPathsTest) {
 
   // Re-open and verify the keys.
   Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
   for (int cf = 1; cf != 3; ++cf) {
     ReadOptions read_options;
     read_options.readahead_size = 0;
-    auto it = dbi->NewIterator(read_options, handles_[cf]);
+    auto it = db_->NewIterator(read_options, handles_[cf]);
     for (it->SeekToFirst(); it->Valid(); it->Next()) {
       ASSERT_OK(it->status());
       Slice key(it->key());
@@ -3636,7 +3633,7 @@ TEST(ColumnFamilyTest, ValidateMemtableKVChecksumOption) {
 // the behavior of manual flush is that it skips retaining UDTs.
 class ColumnFamilyRetainUDTTest : public ColumnFamilyTestBase {
  public:
-  ColumnFamilyRetainUDTTest() : ColumnFamilyTestBase(kLatestFormatVersion) {}
+  ColumnFamilyRetainUDTTest() : ColumnFamilyTestBase(kLatestBbtFormatVersion) {}
 
   void SetUp() override {
     db_options_.allow_concurrent_memtable_write = false;
@@ -3886,7 +3883,7 @@ TEST_F(ManualFlushSkipRetainUDTTest, FlushRemovesStaleEntries) {
       static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
   for (int version = 0; version < 100; version++) {
     if (version == 50) {
-      ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable(cfd));
+      ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfd));
     }
     ASSERT_OK(
         Put(0, "foo", EncodeAsUint64(version), "v" + std::to_string(version)));
diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc
index d037f53accb9..62669bc1bdb2 100644
--- a/db/compact_files_test.cc
+++ b/db/compact_files_test.cc
@@ -75,10 +75,9 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
   options.level0_file_num_compaction_trigger = kLevel0Trigger;
   options.compression = kNoCompression;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
-  Status s = DB::Open(options, db_name_, &db);
-  assert(s.ok());
+  ASSERT_OK(DB::Open(options, db_name_, &db));
   assert(db);
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
@@ -114,7 +113,6 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
     }
   }
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
-  delete db;
 }
 
 TEST_F(CompactFilesTest, MultipleLevel) {
@@ -128,11 +126,11 @@ TEST_F(CompactFilesTest, MultipleLevel) {
   FlushedFileCollector* collector = new FlushedFileCollector();
   options.listeners.emplace_back(collector);
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
-  ASSERT_NE(db, nullptr);
+  ASSERT_NE(db.get(), nullptr);
 
   // create couple files in L0, L3, L4 and L5
   for (int i = 5; i > 2; --i) {
@@ -141,7 +139,8 @@ TEST_F(CompactFilesTest, MultipleLevel) {
     ASSERT_OK(db->Flush(FlushOptions()));
     // Ensure background work is fully finished including listener callbacks
     // before accessing listener state.
-    ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(db.get())->TEST_WaitForBackgroundWork());
     auto l0_files = collector->GetFlushedFiles();
     ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i));
 
@@ -191,8 +190,6 @@ TEST_F(CompactFilesTest, MultipleLevel) {
   ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5));
   SyncPoint::GetInstance()->DisableProcessing();
   thread.join();
-
-  delete db;
 }
 
 TEST_F(CompactFilesTest, ObsoleteFiles) {
@@ -212,11 +209,11 @@ TEST_F(CompactFilesTest, ObsoleteFiles) {
   FlushedFileCollector* collector = new FlushedFileCollector();
   options.listeners.emplace_back(collector);
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
-  ASSERT_NE(db, nullptr);
+  ASSERT_NE(db.get(), nullptr);
 
   // create couple files
   for (int i = 1000; i < 2000; ++i) {
@@ -226,13 +223,12 @@ TEST_F(CompactFilesTest, ObsoleteFiles) {
 
   auto l0_files = collector->GetFlushedFiles();
   ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForCompact());
+  ASSERT_OK(static_cast_with_check<DBImpl>(db.get())->TEST_WaitForCompact());
 
   // verify all compaction input files are deleted
   for (const auto& fname : l0_files) {
     ASSERT_EQ(Status::NotFound(), env_->FileExists(fname));
   }
-  delete db;
 }
 
 TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
@@ -251,10 +247,9 @@ TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
   FlushedFileCollector* collector = new FlushedFileCollector();
   options.listeners.emplace_back(collector);
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
-  Status s = DB::Open(options, db_name_, &db);
-  assert(s.ok());
+  ASSERT_OK(DB::Open(options, db_name_, &db));
   assert(db);
 
   // create couple files
@@ -262,19 +257,20 @@ TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
     ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
                       std::string(1000, 'a' + (i % 26))));
   }
-  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+  ASSERT_OK(
+      static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
   auto l0_files_1 = collector->GetFlushedFiles();
   collector->ClearFlushedFiles();
   for (int i = 0; i < 500; ++i) {
     ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
                       std::string(1000, 'a' + (i % 26))));
   }
-  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+  ASSERT_OK(
+      static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
   auto l0_files_2 = collector->GetFlushedFiles();
   ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0));
   ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0));
   // no assertion failure
-  delete db;
 }
 
 TEST_F(CompactFilesTest, CapturingPendingFiles) {
@@ -289,7 +285,7 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) {
   FlushedFileCollector* collector = new FlushedFileCollector();
   options.listeners.emplace_back(collector);
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
@@ -303,7 +299,8 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) {
 
   // Ensure background work is fully finished including listener callbacks
   // before accessing listener state.
-  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+  ASSERT_OK(
+      static_cast_with_check<DBImpl>(db.get())->TEST_WaitForBackgroundWork());
   auto l0_files = collector->GetFlushedFiles();
   EXPECT_EQ(5, l0_files.size());
 
@@ -327,13 +324,12 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) {
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 
-  delete db;
+  db.reset();
 
   // Make sure we can reopen the DB.
   s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
   assert(db);
-  delete db;
 }
 
 TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
@@ -365,12 +361,12 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
   options.create_if_missing = true;
   options.compaction_filter = cf.get();
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
 
-  cf->SetDB(db);
+  cf->SetDB(db.get());
 
   // Write one L0 file
   ASSERT_OK(db->Put(WriteOptions(), "K1", "V1"));
@@ -384,8 +380,6 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
     ASSERT_OK(
         db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), {fname}, 0));
   }
-
-  delete db;
 }
 
 TEST_F(CompactFilesTest, SentinelCompressionType) {
@@ -413,7 +407,7 @@ TEST_F(CompactFilesTest, SentinelCompressionType) {
     options.create_if_missing = true;
     FlushedFileCollector* collector = new FlushedFileCollector();
     options.listeners.emplace_back(collector);
-    DB* db = nullptr;
+    std::unique_ptr<DB> db;
     ASSERT_OK(DB::Open(options, db_name_, &db));
 
     ASSERT_OK(db->Put(WriteOptions(), "key", "val"));
@@ -421,7 +415,8 @@ TEST_F(CompactFilesTest, SentinelCompressionType) {
 
     // Ensure background work is fully finished including listener callbacks
     // before accessing listener state.
-    ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(db.get())->TEST_WaitForBackgroundWork());
     auto l0_files = collector->GetFlushedFiles();
     ASSERT_EQ(1, l0_files.size());
 
@@ -433,14 +428,18 @@ TEST_F(CompactFilesTest, SentinelCompressionType) {
     ROCKSDB_NAMESPACE::TablePropertiesCollection all_tables_props;
     ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props));
     for (const auto& name_and_table_props : all_tables_props) {
-      ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression),
-                name_and_table_props.second->compression_name);
+      // As of format_version 7, more elaborate information is encoded into the
+      // compression_name property
+      ASSERT_EQ("BuiltinV2;02;", name_and_table_props.second->compression_name);
     }
-    delete db;
   }
 }
 
 TEST_F(CompactFilesTest, CompressionWithBlockAlign) {
+  if (!Snappy_Supported()) {
+    ROCKSDB_GTEST_SKIP("Test requires Snappy support");
+    return;
+  }
   Options options;
   options.compression = CompressionType::kNoCompression;
   options.create_if_missing = true;
@@ -457,11 +456,7 @@ TEST_F(CompactFilesTest, CompressionWithBlockAlign) {
   }
 
   std::unique_ptr<DB> db;
-  {
-    DB* _db = nullptr;
-    ASSERT_OK(DB::Open(options, db_name_, &_db));
-    db.reset(_db);
-  }
+  ASSERT_OK(DB::Open(options, db_name_, &db));
 
   ASSERT_OK(db->Put(WriteOptions(), "key", "val"));
   ASSERT_OK(db->Flush(FlushOptions()));
@@ -500,7 +495,7 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) {
   FlushedFileCollector* collector = new FlushedFileCollector();
   options.listeners.emplace_back(collector);
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
@@ -511,7 +506,8 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) {
     ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
                       std::string(1000, 'a' + (i % 26))));
   }
-  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+  ASSERT_OK(
+      static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
   auto l0_files_1 = collector->GetFlushedFiles();
   CompactionOptions co;
   co.compression = CompressionType::kLZ4Compression;
@@ -527,7 +523,228 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) {
   ASSERT_EQ(compaction_job_info.output_level, 0);
   ASSERT_OK(compaction_job_info.status);
   // no assertion failure
-  delete db;
+}
+
+// Helper function to generate zero-padded keys
+// e.g., MakeKey("a", 5) -> "a05", MakeKey("b", 42) -> "b42"
+static std::string MakeKey(const std::string& prefix, int index) {
+  return prefix + (index < 10 ? "0" : "") + std::to_string(index);
+}
+
+TEST_F(CompactFilesTest, TrivialMoveNonOverlappingFiles) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.level_compaction_dynamic_level_bytes = false;
+
+  std::unique_ptr<DB> db;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db.get(), nullptr);
+
+  // Create 3 non-overlapping files in L0
+  // File 1: keys [a00-a99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("a", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // File 2: keys [b00-b99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("b", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // File 3: keys [c00-c99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("c", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Verify files are in L0
+  ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[0].files.size(), 3);
+  ASSERT_EQ(meta.levels[1].files.size(), 0);
+
+  // Get L0 files
+  std::vector<std::string> l0_files;
+  for (const auto& file : meta.levels[0].files) {
+    l0_files.push_back(file.db_path + "/" + file.name);
+  }
+
+  CompactionOptions compact_option;
+  compact_option.allow_trivial_move = true;
+  // Compact all L0 files to L1 (non-overlapping in L1)
+  ASSERT_OK(db->CompactFiles(compact_option, l0_files, 1));
+
+  // Verify files are now in L1
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[0].files.size(), 0);
+  ASSERT_EQ(meta.levels[1].files.size(), 3);
+
+  // Get the first file from L1 (should be the one with keys a00-a99)
+  std::string l1_file_to_move;
+  std::vector<std::string> l1_files_to_move_later;
+  uint64_t l1_file_number = 0;
+  for (const auto& file : meta.levels[1].files) {
+    if (file.smallestkey[0] == 'a') {
+      l1_file_to_move = file.db_path + "/" + file.name;
+      l1_file_number = file.file_number;
+    } else {
+      l1_files_to_move_later.push_back(file.db_path + "/" + file.name);
+    }
+  }
+  ASSERT_FALSE(l1_file_to_move.empty());
+
+  // Set up sync point to verify trivial move path is taken
+  bool trivial_move_executed = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::CompactFilesImpl:TrivialMove",
+      [&](void* /*arg*/) { trivial_move_executed = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Move the file from L1 to L6 - this should be a trivial move
+  // because the file doesn't overlap with anything in L6
+  std::vector<std::string> files_to_move = {l1_file_to_move};
+  ASSERT_OK(db->CompactFiles(compact_option, files_to_move, 6));
+
+  // Verify trivial move was executed
+  ASSERT_TRUE(trivial_move_executed);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Verify the file is now in L6
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 2);  // Two files remain in L1
+  ASSERT_EQ(meta.levels[6].files.size(), 1);  // One file in L6
+
+  // Verify it's the correct file in L6
+  bool found_file_in_l6 = false;
+  for (const auto& file : meta.levels[6].files) {
+    if (file.file_number == l1_file_number) {
+      found_file_in_l6 = true;
+      // Verify key range hasn't changed
+      ASSERT_EQ(file.smallestkey[0], 'a');
+      ASSERT_EQ(file.largestkey[0], 'a');
+      break;
+    }
+  }
+  ASSERT_TRUE(found_file_in_l6);
+
+  // Move the other 2 files from L1 to L6, with allow_trivial_move set to false.
+  // This will trigger a normal compaction, so the 2 files will be compacted
+  // into a single file in L6.
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), l1_files_to_move_later, 6));
+
+  // Verify files in L6
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 0);  // Zero files remain in L1
+  ASSERT_EQ(meta.levels[6].files.size(), 2);  // Two file in L6
+
+  // Verify data integrity - all keys should still be readable
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("a", i);
+    std::string value;
+    ASSERT_OK(db->Get(ReadOptions(), key, &value));
+    ASSERT_EQ(value, "value_" + key);
+  }
+}
+
+TEST_F(CompactFilesTest, TrivialMoveBlockedByOverlap) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.num_levels = 7;
+
+  std::unique_ptr<DB> db;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db.get(), nullptr);
+
+  // Create a file in L6 with keys [m00-m99] (wide range)
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("m", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Get L0 file
+  ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  std::vector<std::string> l0_files;
+  for (const auto& file : meta.levels[0].files) {
+    l0_files.push_back(file.db_path + "/" + file.name);
+  }
+
+  CompactionOptions compact_option;
+  compact_option.allow_trivial_move = true;
+
+  // Move to L6
+  ASSERT_OK(db->CompactFiles(compact_option, l0_files, 6));
+
+  // Now create a file in L1 with overlapping keys [m50-m60]
+  for (int i = 50; i <= 60; i++) {
+    std::string key = "m" + std::to_string(i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "updated_value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Get the L0 file
+  db->GetColumnFamilyMetaData(&meta);
+  std::vector<std::string> l0_files_2;
+  for (const auto& file : meta.levels[0].files) {
+    l0_files_2.push_back(file.db_path + "/" + file.name);
+  }
+
+  // Move to L1
+  ASSERT_OK(db->CompactFiles(compact_option, l0_files_2, 1));
+
+  // Get the L1 file
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 1);
+  std::string l1_file =
+      meta.levels[1].files[0].db_path + "/" + meta.levels[1].files[0].name;
+
+  // Set up sync point to verify full compaction path is taken
+  bool trivial_move_executed = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::CompactFilesImpl:TrivialMove",
+      [&](void* /*arg*/) { trivial_move_executed = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Try to move from L1 to L6 - this should NOT be a trivial move
+  // because the file overlaps with the existing file in L6
+  ASSERT_OK(db->CompactFiles(compact_option, {l1_file}, 6));
+
+  // Verify trivial move was NOT executed (full compaction happened)
+  ASSERT_FALSE(trivial_move_executed);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Verify the result - should have merged data in L6
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 0);  // L1 should be empty
+  // L6 should have the merged file (may be 1 file if merged, or 2 if not)
+  ASSERT_GE(meta.levels[6].files.size(), 1);
+
+  // Verify updated values are present
+  for (int i = 50; i <= 60; i++) {
+    std::string key = "m" + std::to_string(i);
+    std::string value;
+    ASSERT_OK(db->Get(ReadOptions(), key, &value));
+    ASSERT_EQ(value, "updated_value_" + key);
+  }
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc
index 313e2998aecd..9609f17c80f0 100644
--- a/db/compaction/compaction.cc
+++ b/db/compaction/compaction.cc
@@ -281,12 +281,13 @@ Compaction::Compaction(
     std::vector<CompactionInputFiles> _inputs, int _output_level,
     uint64_t _target_file_size, uint64_t _max_compaction_bytes,
     uint32_t _output_path_id, CompressionType _compression,
-    CompressionOptions _compression_opts, Temperature _output_temperature,
-    uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
+    CompressionOptions _compression_opts,
+    Temperature _output_temperature_override, uint32_t _max_subcompactions,
+    std::vector<FileMetaData*> _grandparents,
     std::optional<SequenceNumber> _earliest_snapshot,
-    const SnapshotChecker* _snapshot_checker, bool _manual_compaction,
-    const std::string& _trim_ts, double _score, bool _deletion_compaction,
-    bool l0_files_might_overlap, CompactionReason _compaction_reason,
+    const SnapshotChecker* _snapshot_checker,
+    CompactionReason _compaction_reason, const std::string& _trim_ts,
+    double _score, bool l0_files_might_overlap,
     BlobGarbageCollectionPolicy _blob_garbage_collection_policy,
     double _blob_garbage_collection_age_cutoff)
     : input_vstorage_(vstorage),
@@ -303,8 +304,10 @@ Compaction::Compaction(
       output_path_id_(_output_path_id),
       output_compression_(_compression),
       output_compression_opts_(_compression_opts),
-      output_temperature_(_output_temperature),
-      deletion_compaction_(_deletion_compaction),
+      output_temperature_override_(_output_temperature_override),
+      deletion_compaction_(_compaction_reason == CompactionReason::kFIFOTtl ||
+                           _compaction_reason ==
+                               CompactionReason::kFIFOMaxSize),
       l0_files_might_overlap_(l0_files_might_overlap),
       inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
       grandparents_(std::move(_grandparents)),
@@ -321,7 +324,8 @@ Compaction::Compaction(
               ? false
               : IsBottommostLevel(output_level_, vstorage, inputs_)),
       is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
-      is_manual_compaction_(_manual_compaction),
+      is_manual_compaction_(_compaction_reason ==
+                            CompactionReason::kManualCompaction),
       trim_ts_(_trim_ts),
       is_trivial_move_(false),
       compaction_reason_(_compaction_reason),
@@ -338,20 +342,17 @@ Compaction::Compaction(
                   _blob_garbage_collection_age_cutoff > 1
               ? mutable_cf_options().blob_garbage_collection_age_cutoff
               : _blob_garbage_collection_age_cutoff),
-      penultimate_level_(
-          // For simplicity, we don't support the concept of "penultimate level"
+      proximal_level_(
+          // For simplicity, we don't support the concept of "proximal level"
           // with `CompactionReason::kExternalSstIngestion` and
           // `CompactionReason::kRefitLevel`
           _compaction_reason == CompactionReason::kExternalSstIngestion ||
                   _compaction_reason == CompactionReason::kRefitLevel
               ? Compaction::kInvalidLevel
-              : EvaluatePenultimateLevel(vstorage, mutable_cf_options_,
-                                         immutable_options_, start_level_,
-                                         output_level_)) {
+              : EvaluateProximalLevel(vstorage, mutable_cf_options_,
+                                      immutable_options_, start_level_,
+                                      output_level_)) {
   MarkFilesBeingCompacted(true);
-  if (is_manual_compaction_) {
-    compaction_reason_ = CompactionReason::kManualCompaction;
-  }
   if (max_subcompactions_ == 0) {
     max_subcompactions_ = _mutable_db_options.max_subcompactions;
   }
@@ -405,10 +406,10 @@ Compaction::Compaction(
     }
   }
 
-  PopulatePenultimateLevelOutputRange();
+  PopulateProximalLevelOutputRange();
 }
 
-void Compaction::PopulatePenultimateLevelOutputRange() {
+void Compaction::PopulateProximalLevelOutputRange() {
   if (!SupportsPerKeyPlacement()) {
     assert(keep_in_last_level_through_seqno_ == kMaxSequenceNumber);
     return;
@@ -417,46 +418,42 @@ void Compaction::PopulatePenultimateLevelOutputRange() {
   // exclude the last level, the range of all input levels is the safe range
   // of keys that can be moved up.
   int exclude_level = number_levels_ - 1;
-  penultimate_output_range_type_ = PenultimateOutputRangeType::kNonLastRange;
+  proximal_output_range_type_ = ProximalOutputRangeType::kNonLastRange;
 
-  // For universal compaction, the penultimate_output_range could be extended if
-  // all penultimate level files are included in the compaction (which includes
-  // the case that the penultimate level is empty).
+  // For universal compaction, the proximal_output_range could be extended if
+  // all proximal level files are included in the compaction (which includes
+  // the case that the proximal level is empty).
   if (immutable_options_.compaction_style == kCompactionStyleUniversal) {
     exclude_level = kInvalidLevel;
-    penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
-    std::set<uint64_t> penultimate_inputs;
+    proximal_output_range_type_ = ProximalOutputRangeType::kFullRange;
+    std::set<uint64_t> proximal_inputs;
     for (const auto& input_lvl : inputs_) {
-      if (input_lvl.level == penultimate_level_) {
+      if (input_lvl.level == proximal_level_) {
         for (const auto& file : input_lvl.files) {
-          penultimate_inputs.emplace(file->fd.GetNumber());
+          proximal_inputs.emplace(file->fd.GetNumber());
         }
       }
     }
-    auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
-    for (const auto& file : penultimate_files) {
-      if (penultimate_inputs.find(file->fd.GetNumber()) ==
-          penultimate_inputs.end()) {
+    auto proximal_files = input_vstorage_->LevelFiles(proximal_level_);
+    for (const auto& file : proximal_files) {
+      if (proximal_inputs.find(file->fd.GetNumber()) == proximal_inputs.end()) {
         exclude_level = number_levels_ - 1;
-        penultimate_output_range_type_ =
-            PenultimateOutputRangeType::kNonLastRange;
+        proximal_output_range_type_ = ProximalOutputRangeType::kNonLastRange;
         break;
       }
     }
   }
 
-  // FIXME: should make use of `penultimate_output_range_type_`.
+  // FIXME: should make use of `proximal_output_range_type_`.
   // FIXME: when last level's input range does not overlap with
-  //  penultimate level, and penultimate level input is empty,
-  //  this call will not set penultimate_level_smallest_ or
-  //  penultimate_level_largest_. No keys will be compacted up.
-  GetBoundaryInternalKeys(input_vstorage_, inputs_,
-                          &penultimate_level_smallest_,
-                          &penultimate_level_largest_, exclude_level);
-
-  if (penultimate_output_range_type_ !=
-      PenultimateOutputRangeType::kFullRange) {
-    // If not full range in penultimate level, must keep everything already
+  //  proximal level, and proximal level input is empty,
+  //  this call will not set proximal_level_smallest_ or
+  //  proximal_level_largest_. No keys will be compacted up.
+  GetBoundaryInternalKeys(input_vstorage_, inputs_, &proximal_level_smallest_,
+                          &proximal_level_largest_, exclude_level);
+
+  if (proximal_output_range_type_ != ProximalOutputRangeType::kFullRange) {
+    // If not full range in proximal level, must keep everything already
     // in the last level there, because moving it back up might cause
     // overlap/placement issues that are difficult to resolve properly in the
     // presence of range deletes
@@ -486,23 +483,23 @@ Compaction::~Compaction() {
 }
 
 bool Compaction::SupportsPerKeyPlacement() const {
-  return penultimate_level_ != kInvalidLevel;
+  return proximal_level_ != kInvalidLevel;
 }
 
-int Compaction::GetPenultimateLevel() const { return penultimate_level_; }
+int Compaction::GetProximalLevel() const { return proximal_level_; }
 
 // smallest_key and largest_key include timestamps if user-defined timestamp is
 // enabled.
-bool Compaction::OverlapPenultimateLevelOutputRange(
+bool Compaction::OverlapProximalLevelOutputRange(
     const Slice& smallest_key, const Slice& largest_key) const {
   if (!SupportsPerKeyPlacement()) {
     return false;
   }
 
-  // See FIXME in Compaction::PopulatePenultimateLevelOutputRange().
+  // See FIXME in Compaction::PopulateProximalLevelOutputRange().
   // We do not compact any key up in this case.
-  if (penultimate_level_smallest_.size() == 0 ||
-      penultimate_level_largest_.size() == 0) {
+  if (proximal_level_smallest_.size() == 0 ||
+      proximal_level_largest_.size() == 0) {
     return false;
   }
 
@@ -510,13 +507,13 @@ bool Compaction::OverlapPenultimateLevelOutputRange(
       input_vstorage_->InternalComparator()->user_comparator();
 
   return ucmp->CompareWithoutTimestamp(
-             smallest_key, penultimate_level_largest_.user_key()) <= 0 &&
+             smallest_key, proximal_level_largest_.user_key()) <= 0 &&
          ucmp->CompareWithoutTimestamp(
-             largest_key, penultimate_level_smallest_.user_key()) >= 0;
+             largest_key, proximal_level_smallest_.user_key()) >= 0;
 }
 
 // key includes timestamp if user-defined timestamp is enabled.
-void Compaction::TEST_AssertWithinPenultimateLevelOutputRange(
+void Compaction::TEST_AssertWithinProximalLevelOutputRange(
     const Slice& user_key, bool expect_failure) const {
 #ifdef NDEBUG
   (void)user_key;
@@ -524,15 +521,15 @@ void Compaction::TEST_AssertWithinPenultimateLevelOutputRange(
 #else
   assert(SupportsPerKeyPlacement());
 
-  assert(penultimate_level_smallest_.size() > 0);
-  assert(penultimate_level_largest_.size() > 0);
+  assert(proximal_level_smallest_.size() > 0);
+  assert(proximal_level_largest_.size() > 0);
 
   auto* cmp = input_vstorage_->user_comparator();
 
   // op_type of a key can change during compaction, e.g. Merge -> Put.
-  if (!(cmp->Compare(user_key, penultimate_level_smallest_.user_key()) >= 0)) {
+  if (!(cmp->Compare(user_key, proximal_level_smallest_.user_key()) >= 0)) {
     assert(expect_failure);
-  } else if (!(cmp->Compare(user_key, penultimate_level_largest_.user_key()) <=
+  } else if (!(cmp->Compare(user_key, proximal_level_largest_.user_key()) <=
                0)) {
     assert(expect_failure);
   } else {
@@ -651,6 +648,8 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(
     return true;
   } else if (output_level_ != 0 &&
              cfd_->ioptions().compaction_style == kCompactionStyleLevel) {
+    // TODO: apply the optimization here to other compaction styles and
+    // compaction/flush to L0.
     // Maybe use binary search to find right entry instead of linear search?
     const Comparator* user_cmp = cfd_->user_comparator();
     for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
@@ -1018,7 +1017,7 @@ uint64_t Compaction::MinInputFileEpochNumber() const {
   return min_epoch_number;
 }
 
-int Compaction::EvaluatePenultimateLevel(
+int Compaction::EvaluateProximalLevel(
     const VersionStorageInfo* vstorage,
     const MutableCFOptions& mutable_cf_options,
     const ImmutableOptions& immutable_options, const int start_level,
@@ -1033,21 +1032,21 @@ int Compaction::EvaluatePenultimateLevel(
     return kInvalidLevel;
   }
 
-  int penultimate_level = output_level - 1;
-  assert(penultimate_level < immutable_options.num_levels);
-  if (penultimate_level <= 0) {
+  int proximal_level = output_level - 1;
+  assert(proximal_level < immutable_options.num_levels);
+  if (proximal_level <= 0) {
     return kInvalidLevel;
   }
 
-  // If the penultimate level is not within input level -> output level range
-  // check if the penultimate output level is empty, if it's empty, it could
-  // also be locked for the penultimate output.
+  // If the proximal level is not within input level -> output level range
+  // check if the proximal output level is empty, if it's empty, it could
+  // also be locked for the proximal output.
   // TODO: ideally, it only needs to check if there's a file within the
   //  compaction output key range. For simplicity, it just check if there's any
-  //  file on the penultimate level.
+  //  file on the proximal level.
   if (start_level == immutable_options.num_levels - 1 &&
       (immutable_options.compaction_style != kCompactionStyleUniversal ||
-       !vstorage->LevelFiles(penultimate_level).empty())) {
+       !vstorage->LevelFiles(proximal_level).empty())) {
     return kInvalidLevel;
   }
 
@@ -1061,7 +1060,7 @@ int Compaction::EvaluatePenultimateLevel(
     return kInvalidLevel;
   }
 
-  return penultimate_level;
+  return proximal_level;
 }
 
 void Compaction::FilterInputsForCompactionIterator() {
@@ -1130,4 +1129,17 @@ void Compaction::FilterInputsForCompactionIterator() {
   }
 }
 
+Temperature Compaction::GetOutputTemperature(bool is_proximal_level) const {
+  if (output_temperature_override_ != Temperature::kUnknown) {
+    return output_temperature_override_;
+  }
+
+  if (is_last_level() && !is_proximal_level &&
+      mutable_cf_options_.last_level_temperature != Temperature::kUnknown) {
+    return mutable_cf_options_.last_level_temperature;
+  }
+
+  return mutable_cf_options_.default_write_temperature;
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h
index 534b13c6a8f8..44eb876ac71a 100644
--- a/db/compaction/compaction.h
+++ b/db/compaction/compaction.h
@@ -90,25 +90,25 @@ class Compaction {
              uint64_t target_file_size, uint64_t max_compaction_bytes,
              uint32_t output_path_id, CompressionType compression,
              CompressionOptions compression_opts,
-             Temperature output_temperature, uint32_t max_subcompactions,
+             Temperature output_temperature_override,
+             uint32_t max_subcompactions,
              std::vector<FileMetaData*> grandparents,
              std::optional<SequenceNumber> earliest_snapshot,
              const SnapshotChecker* snapshot_checker,
-             bool manual_compaction = false, const std::string& trim_ts = "",
-             double score = -1, bool deletion_compaction = false,
+             CompactionReason compaction_reason,
+             const std::string& trim_ts = "", double score = -1,
              bool l0_files_might_overlap = true,
-             CompactionReason compaction_reason = CompactionReason::kUnknown,
              BlobGarbageCollectionPolicy blob_garbage_collection_policy =
                  BlobGarbageCollectionPolicy::kUseDefault,
              double blob_garbage_collection_age_cutoff = -1);
 
-  // The type of the penultimate level output range
-  enum class PenultimateOutputRangeType : int {
-    kNotSupported,  // it cannot output to the penultimate level
-    kFullRange,     // any data could be output to the penultimate level
+  // The type of the proximal level output range
+  enum class ProximalOutputRangeType : int {
+    kNotSupported,  // it cannot output to the proximal level
+    kFullRange,     // any data could be output to the proximal level
     kNonLastRange,  // only the keys within non_last_level compaction inputs can
-                    // be outputted to the penultimate level
-    kDisabled,      // no data can be outputted to the penultimate level
+                    // be outputted to the proximal level
+    kDisabled,      // no data can be outputted to the proximal level
   };
 
   // No copying allowed
@@ -180,6 +180,10 @@ class Compaction {
   const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
 
   // Returns the LevelFilesBrief of the specified compaction input level.
+  // Note that if the compaction includes standalone range deletion file,
+  // this function returns the result after filtering out input files covered
+  // by the range deletion file.
+  // Use inputs() if you want to get the original input files.
   const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
     return &input_levels_[compaction_input_level];
   }
@@ -283,6 +287,13 @@ class Compaction {
   // are non-overlapping and can be trivially moved.
   bool is_trivial_move() const { return is_trivial_move_; }
 
+  bool is_trivial_copy_compaction() const {
+    return immutable_options_.compaction_style == kCompactionStyleFIFO &&
+           compaction_reason_ == CompactionReason::kChangeTemperature &&
+           mutable_cf_options_.compaction_options_fifo
+               .allow_trivial_copy_when_change_temperature;
+  }
+
   // How many total levels are there?
   int number_levels() const { return number_levels_; }
 
@@ -370,29 +381,29 @@ class Compaction {
 
   Slice GetLargestUserKey() const { return largest_user_key_; }
 
-  PenultimateOutputRangeType GetPenultimateOutputRangeType() const {
-    return penultimate_output_range_type_;
+  ProximalOutputRangeType GetProximalOutputRangeType() const {
+    return proximal_output_range_type_;
   }
 
   // Return true if the compaction supports per_key_placement
   bool SupportsPerKeyPlacement() const;
 
-  // Get per_key_placement penultimate output level, which is `last_level - 1`
+  // Get per_key_placement proximal output level, which is `last_level - 1`
   // if per_key_placement feature is supported. Otherwise, return -1.
-  int GetPenultimateLevel() const;
+  int GetProximalLevel() const;
 
-  // Return true if the given range is overlap with penultimate level output
+  // Return true if the given range is overlap with proximal level output
   // range.
   // Both smallest_key and largest_key include timestamps if user-defined
   // timestamp is enabled.
-  bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
-                                          const Slice& largest_key) const;
+  bool OverlapProximalLevelOutputRange(const Slice& smallest_key,
+                                       const Slice& largest_key) const;
 
-  // For testing purposes, check that a key is within penultimate level
+  // For testing purposes, check that a key is within proximal level
   // output range for per_key_placement feature, which is safe to place the key
-  // to the penultimate level. Different compaction strategies have different
+  // to the proximal level. Different compaction strategies have different
   // rules. `user_key` includes timestamp if user-defined timestamp is enabled.
-  void TEST_AssertWithinPenultimateLevelOutputRange(
+  void TEST_AssertWithinProximalLevelOutputRange(
       const Slice& user_key, bool expect_failure = false) const;
 
   CompactionReason compaction_reason() const { return compaction_reason_; }
@@ -403,7 +414,11 @@ class Compaction {
 
   uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
 
-  Temperature output_temperature() const { return output_temperature_; }
+  // Order of precedence for temperature:
+  // 1. Override temp if not kUnknown
+  // 2. Temperature of the last level files if applicable
+  // 3. Default write temperature
+  Temperature GetOutputTemperature(bool is_proximal_level = false) const;
 
   uint32_t max_subcompactions() const { return max_subcompactions_; }
 
@@ -441,20 +456,25 @@ class Compaction {
 
   static constexpr int kInvalidLevel = -1;
 
-  // Evaluate penultimate output level. If the compaction supports
-  // per_key_placement feature, it returns the penultimate level number.
+  // Evaluate proximal output level. If the compaction supports
+  // per_key_placement feature, it returns the proximal level number.
   // Otherwise, it's set to kInvalidLevel (-1), which means
-  // output_to_penultimate_level is not supported.
-  // Note: even the penultimate level output is supported (PenultimateLevel !=
+  // output_to_proximal_level is not supported.
+  // Note: even the proximal level output is supported (ProximalLevel !=
   // kInvalidLevel), some key range maybe unsafe to be outputted to the
-  // penultimate level. The safe key range is populated by
-  // `PopulatePenultimateLevelOutputRange()`.
-  // Which could potentially disable all penultimate level output.
-  static int EvaluatePenultimateLevel(
-      const VersionStorageInfo* vstorage,
-      const MutableCFOptions& mutable_cf_options,
-      const ImmutableOptions& immutable_options, const int start_level,
-      const int output_level);
+  // proximal level. The safe key range is populated by
+  // `PopulateProximalLevelOutputRange()`.
+  // Which could potentially disable all proximal level output.
+  static int EvaluateProximalLevel(const VersionStorageInfo* vstorage,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   const ImmutableOptions& immutable_options,
+                                   const int start_level,
+                                   const int output_level);
+
+  static bool OutputToNonZeroMaxOutputLevel(int output_level,
+                                            int max_output_level) {
+    return output_level > 0 && output_level == max_output_level;
+  }
 
   // If some data cannot be safely migrated "up" the LSM tree due to a change
   // in the preclude_last_level_data_seconds setting, this indicates a sequence
@@ -482,10 +502,10 @@ class Compaction {
       InternalKey* smallest_key, InternalKey* largest_key,
       int exclude_level = -1);
 
-  // populate penultimate level output range, which will be used to determine if
-  // a key is safe to output to the penultimate level (details see
-  // `Compaction::WithinPenultimateLevelOutputRange()`.
-  void PopulatePenultimateLevelOutputRange();
+  // populate proximal level output range, which will be used to determine if
+  // a key is safe to output to the proximal level (details see
+  // `Compaction::WithinProximalLevelOutputRange()`.
+  void PopulateProximalLevelOutputRange();
 
   // If oldest snapshot is specified at Compaction construction time, we have
   // an opportunity to optimize inputs for compaction iterator for this case:
@@ -530,7 +550,7 @@ class Compaction {
   const uint32_t output_path_id_;
   CompressionType output_compression_;
   CompressionOptions output_compression_opts_;
-  Temperature output_temperature_;
+  Temperature output_temperature_override_;
   // If true, then the compaction can be done by simply deleting input files.
   const bool deletion_compaction_;
   // should it split the output file using the compact cursor?
@@ -616,20 +636,20 @@ class Compaction {
 
   // only set when per_key_placement feature is enabled, -1 (kInvalidLevel)
   // means not supported.
-  const int penultimate_level_;
+  const int proximal_level_;
 
-  // Key range for penultimate level output
+  // Key range for proximal level output
   // includes timestamp if user-defined timestamp is enabled.
-  // penultimate_output_range_type_ shows the range type
-  InternalKey penultimate_level_smallest_;
-  InternalKey penultimate_level_largest_;
-  PenultimateOutputRangeType penultimate_output_range_type_ =
-      PenultimateOutputRangeType::kNotSupported;
+  // proximal_output_range_type_ shows the range type
+  InternalKey proximal_level_smallest_;
+  InternalKey proximal_level_largest_;
+  ProximalOutputRangeType proximal_output_range_type_ =
+      ProximalOutputRangeType::kNotSupported;
 };
 
 #ifndef NDEBUG
 // Helper struct only for tests, which contains the data to decide if a key
-// should be output to the penultimate level.
+// should be output to the proximal level.
 // TODO: remove this when the public feature knob is available
 struct PerKeyPlacementContext {
   const int level;
@@ -637,16 +657,16 @@ struct PerKeyPlacementContext {
   const Slice value;
   const SequenceNumber seq_num;
 
-  bool& output_to_penultimate_level;
+  bool& output_to_proximal_level;
 
   PerKeyPlacementContext(int _level, Slice _key, Slice _value,
                          SequenceNumber _seq_num,
-                         bool& _output_to_penultimate_level)
+                         bool& _output_to_proximal_level)
       : level(_level),
         key(_key),
         value(_value),
         seq_num(_seq_num),
-        output_to_penultimate_level(_output_to_penultimate_level) {}
+        output_to_proximal_level(_output_to_proximal_level) {}
 };
 #endif /* !NDEBUG */
 
diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc
index dc441817c6cc..e76490225c26 100644
--- a/db/compaction/compaction_iterator.cc
+++ b/db/compaction/compaction_iterator.cc
@@ -28,7 +28,7 @@ CompactionIterator::CompactionIterator(
     SequenceNumber earliest_snapshot,
     SequenceNumber earliest_write_conflict_snapshot,
     SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
-    Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+    Env* env, bool report_detailed_time,
     CompactionRangeDelAggregator* range_del_agg,
     BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
     bool enforce_single_del_contracts,
@@ -42,8 +42,8 @@ CompactionIterator::CompactionIterator(
     : CompactionIterator(
           input, cmp, merge_helper, last_sequence, snapshots, earliest_snapshot,
           earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
-          report_detailed_time, expect_valid_internal_key, range_del_agg,
-          blob_file_builder, allow_data_in_errors, enforce_single_del_contracts,
+          report_detailed_time, range_del_agg, blob_file_builder,
+          allow_data_in_errors, enforce_single_del_contracts,
           manual_compaction_canceled,
           compaction ? std::make_unique<RealCompaction>(compaction) : nullptr,
           must_count_input_entries, compaction_filter, shutting_down, info_log,
@@ -55,7 +55,7 @@ CompactionIterator::CompactionIterator(
     SequenceNumber earliest_snapshot,
     SequenceNumber earliest_write_conflict_snapshot,
     SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
-    Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+    Env* env, bool report_detailed_time,
     CompactionRangeDelAggregator* range_del_agg,
     BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
     bool enforce_single_del_contracts,
@@ -76,16 +76,14 @@ CompactionIterator::CompactionIterator(
       env_(env),
       clock_(env_->GetSystemClock().get()),
       report_detailed_time_(report_detailed_time),
-      expect_valid_internal_key_(expect_valid_internal_key),
       range_del_agg_(range_del_agg),
       blob_file_builder_(blob_file_builder),
       compaction_(std::move(compaction)),
       compaction_filter_(compaction_filter),
       shutting_down_(shutting_down),
       manual_compaction_canceled_(manual_compaction_canceled),
-      bottommost_level_(!compaction_ ? false
-                                     : compaction_->bottommost_level() &&
-                                           !compaction_->allow_ingest_behind()),
+      bottommost_level_(compaction_ && compaction_->bottommost_level() &&
+                        !compaction_->allow_ingest_behind()),
       // snapshots_ cannot be nullptr, but we will assert later in the body of
       // the constructor.
       visible_at_tip_(snapshots_ ? snapshots_->empty() : false),
@@ -161,6 +159,7 @@ void CompactionIterator::Next() {
       // MergeUntil stops when it encounters a corrupt key and does not
       // include them in the result, so we expect the keys here to be valid.
       if (!s.ok()) {
+        // FIXME: should fail compaction after this fatal logging.
         ROCKS_LOG_FATAL(
             info_log_, "Invalid ikey %s in compaction. %s",
             allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
@@ -464,18 +463,9 @@ void CompactionIterator::NextFromInput() {
     if (!pik_status.ok()) {
       iter_stats_.num_input_corrupt_records++;
 
-      // If `expect_valid_internal_key_` is false, return the corrupted key
-      // and let the caller decide what to do with it.
-      if (expect_valid_internal_key_) {
-        status_ = pik_status;
-        return;
-      }
-      key_ = current_key_.SetInternalKey(key_);
-      has_current_user_key_ = false;
-      current_user_key_sequence_ = kMaxSequenceNumber;
-      current_user_key_snapshot_ = 0;
-      validity_info_.SetValid(ValidContext::kParseKeyError);
-      break;
+      // Always fail compaction when encountering corrupted internal keys
+      status_ = pik_status;
+      return;
     }
     TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
     if (is_range_del_) {
@@ -642,7 +632,8 @@ void CompactionIterator::NextFromInput() {
     } else if (ikey_.type == kTypeSingleDeletion) {
       // We can compact out a SingleDelete if:
       // 1) We encounter the corresponding PUT -OR- we know that this key
-      //    doesn't appear past this output level
+      //    doesn't appear past this output level and  we are not in
+      //    ingest_behind mode.
       // =AND=
       // 2) We've already returned a record in this snapshot -OR-
       //    there are no earlier earliest_write_conflict_snapshot.
@@ -731,6 +722,8 @@ void CompactionIterator::NextFromInput() {
             "CompactionIterator::NextFromInput:SingleDelete:1",
             const_cast<Compaction*>(c));
         if (last_key_seq_zeroed_) {
+          // Drop SD and the next key since they are both in the last
+          // snapshot (since last key has seqno zeroed).
           ++iter_stats_.num_record_drop_hidden;
           ++iter_stats_.num_record_drop_obsolete;
           assert(bottommost_level_);
@@ -841,7 +834,7 @@ void CompactionIterator::NextFromInput() {
         // iteration. If the next key is corrupt, we return before the
         // comparison, so the value of has_current_user_key does not matter.
         has_current_user_key_ = false;
-        if (compaction_ != nullptr &&
+        if (compaction_ != nullptr && !compaction_->allow_ingest_behind() &&
             DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
             compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
                                                        &level_ptrs_) &&
@@ -854,6 +847,9 @@ void CompactionIterator::NextFromInput() {
             ++iter_stats_.num_optimized_del_drop_obsolete;
           }
         } else if (last_key_seq_zeroed_) {
+          // Sequence number zeroing requires bottommost_level_, which is
+          // false with ingest_behind.
+          assert(!compaction_->allow_ingest_behind());
           // Skip.
           ++iter_stats_.num_record_drop_hidden;
           ++iter_stats_.num_record_drop_obsolete;
@@ -870,6 +866,7 @@ void CompactionIterator::NextFromInput() {
     } else if (last_sequence != kMaxSequenceNumber &&
                (last_snapshot == current_user_key_snapshot_ ||
                 last_snapshot < current_user_key_snapshot_)) {
+      // rule (A):
       // If the earliest snapshot is which this key is visible in
       // is the same as the visibility of a previous instance of the
       // same key, then this kv is not visible in any snapshot.
@@ -878,6 +875,15 @@ void CompactionIterator::NextFromInput() {
       // Note: Dropping this key will not affect TransactionDB write-conflict
       // checking since there has already been a record returned for this key
       // in this snapshot.
+      // When ingest_behind is enabled, it's ok that we drop an overwritten
+      // Delete here. The overwritting key still covers whatever that will be
+      // ingested. Note that we will not drop SingleDelete here as SingleDelte
+      // is handled entirely in its own if clause. This is important, see
+      // example: from new to old: SingleDelete_1, PUT_1, SingleDelete_2, PUT_2,
+      // where all operations are on the same key and PUT_2 is ingested with
+      // ingest_behind=true. If SingleDelete_2 is dropped due to being compacted
+      // together with PUT_1, and then PUT_1 is compacted away together with
+      // SingleDelete_1, PUT_2 can incorrectly becomes visible.
       if (last_sequence < current_user_key_sequence_) {
         ROCKS_LOG_FATAL(info_log_,
                         "key %s, last_sequence (%" PRIu64
@@ -887,12 +893,13 @@ void CompactionIterator::NextFromInput() {
         assert(false);
       }
 
-      ++iter_stats_.num_record_drop_hidden;  // rule (A)
+      ++iter_stats_.num_record_drop_hidden;
       AdvanceInputIter();
     } else if (compaction_ != nullptr &&
                (ikey_.type == kTypeDeletion ||
                 (ikey_.type == kTypeDeletionWithTimestamp &&
                  cmp_with_history_ts_low_ < 0)) &&
+               !compaction_->allow_ingest_behind() &&
                DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
                compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
                                                           &level_ptrs_)) {
@@ -928,11 +935,13 @@ void CompactionIterator::NextFromInput() {
                 (ikey_.type == kTypeDeletionWithTimestamp &&
                  cmp_with_history_ts_low_ < 0)) &&
                bottommost_level_) {
+      assert(compaction_);
+      assert(!compaction_->allow_ingest_behind());  // bottommost_level_ is true
       // Handle the case where we have a delete key at the bottom most level
       // We can skip outputting the key iff there are no subsequent puts for
       // this key
-      assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel(
-                                 ikey_.user_key, &level_ptrs_));
+      assert(compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+                                                        &level_ptrs_));
       ParsedInternalKey next_ikey;
       AdvanceInputIter();
 #ifndef NDEBUG
@@ -974,6 +983,12 @@ void CompactionIterator::NextFromInput() {
                 (compaction_ != nullptr &&
                  compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
                                                             &level_ptrs_)))) {
+      // FIXME: it's possible that we are setting sequence number to 0 as
+      // preferred sequence number here. If cf_ingest_behind is enabled, this
+      // may fail ingestions since they expect all keys above the last level
+      // to have non-zero sequence number. We should probably not allow seqno
+      // zeroing here.
+      //
       // This section that attempts to swap preferred sequence number will not
       // be invoked if this is a CompactionIterator created for flush, since
       // `compaction_` will be nullptr and it's not bottommost either.
@@ -1105,17 +1120,15 @@ void CompactionIterator::NextFromInput() {
     }
   }
 
-  if (!Valid() && IsShuttingDown()) {
-    status_ = Status::ShutdownInProgress();
-  }
-
-  if (IsPausingManualCompaction()) {
-    status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
-  }
-
-  // Propagate corruption status from memtable itereator
-  if (!input_.Valid() && input_.status().IsCorruption()) {
-    status_ = input_.status();
+  if (status_.ok()) {
+    if (!Valid() && IsShuttingDown()) {
+      status_ = Status::ShutdownInProgress();
+    } else if (IsPausingManualCompaction()) {
+      status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    } else if (!input_.Valid() && input_.status().IsCorruption()) {
+      // Propagate corruption status from memtable iterator
+      status_ = input_.status();
+    }
   }
 }
 
@@ -1274,11 +1287,11 @@ void CompactionIterator::PrepareOutput() {
     //
     // Can we do the same for levels above bottom level as long as
     // KeyNotExistsBeyondOutputLevel() return true?
-    if (Valid() && compaction_ != nullptr &&
-        !compaction_->allow_ingest_behind() && bottommost_level_ &&
+    if (Valid() && bottommost_level_ &&
         DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
         ikey_.type != kTypeMerge && current_key_committed_ &&
         ikey_.sequence <= preserve_seqno_after_ && !is_range_del_) {
+      assert(compaction_ != nullptr && !compaction_->allow_ingest_behind());
       if (ikey_.type == kTypeDeletion ||
           (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
         ROCKS_LOG_FATAL(
@@ -1297,14 +1310,14 @@ void CompactionIterator::PrepareOutput() {
             validity_info_.rep);
         assert(false);
       }
-      ikey_.sequence = 0;
-      last_key_seq_zeroed_ = true;
-      TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq",
-                               &ikey_);
+
+      bool zeroed_seqno = false;
       if (!timestamp_size_) {
         current_key_.UpdateInternalKey(0, ikey_.type);
+        zeroed_seqno = true;
       } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) {
-        // We can also zero out timestamp for better compression.
+        // For UDT, the seqno and timestamp could only be zeroed out after the
+        // key is below history_ts_low_.
         // For the same user key (excluding timestamp), the timestamp-based
         // history can be collapsed to save some space if the timestamp is
         // older than *full_history_ts_low_.
@@ -1312,6 +1325,14 @@ void CompactionIterator::PrepareOutput() {
         const Slice ts_slice = kTsMin;
         ikey_.SetTimestamp(ts_slice);
         current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice);
+        zeroed_seqno = true;
+      }
+
+      if (zeroed_seqno) {
+        ikey_.sequence = 0;
+        last_key_seq_zeroed_ = true;
+        TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq",
+                                 &ikey_);
       }
     }
   }
diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h
index c3e4942ac342..a851e35f93d5 100644
--- a/db/compaction/compaction_iterator.h
+++ b/db/compaction/compaction_iterator.h
@@ -145,7 +145,8 @@ class CompactionIterator {
     }
 
     bool allow_ingest_behind() const override {
-      return compaction_->immutable_options().allow_ingest_behind;
+      return compaction_->immutable_options().cf_allow_ingest_behind ||
+             compaction_->immutable_options().allow_ingest_behind;
     }
 
     bool allow_mmap_reads() const override {
@@ -182,17 +183,27 @@ class CompactionIterator {
     const Compaction* compaction_;
   };
 
-  // @param must_count_input_entries  if true, `NumInputEntryScanned()` will
-  // return the number of input keys scanned. If false, `NumInputEntryScanned()`
-  // will return this number if no Seek was called on `input`. User should call
-  // `HasNumInputEntryScanned()` first in this case.
+  // @param must_count_input_entries Controls input entry counting accuracy vs
+  // performance:
+  //   - If true: `NumInputEntryScanned()` always returns the exact count of
+  //   input keys
+  //     scanned. The iterator will use sequential `Next()` calls instead of
+  //     `Seek()` to maintain count accuracy as `Seek()` will not count the
+  //     skipped input entries, which is slower but guarantees correctness.
+  //   - If false: `NumInputEntryScanned()` returns the count only if no
+  //   `Seek()` operations
+  //     were performed on the input iterator. When compaction filters request
+  //     skipping ranges of keys or other optimizations trigger seek operations,
+  //     the count becomes unreliable. Always call `HasNumInputEntryScanned()`
+  //     first to verify if the count is accurate before using
+  //     `NumInputEntryScanned()`.
   CompactionIterator(
       InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
       SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
       SequenceNumber earliest_snapshot,
       SequenceNumber earliest_write_conflict_snapshot,
       SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
-      Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+      Env* env, bool report_detailed_time,
       CompactionRangeDelAggregator* range_del_agg,
       BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
       bool enforce_single_del_contracts,
@@ -212,7 +223,7 @@ class CompactionIterator {
                      SequenceNumber earliest_write_conflict_snapshot,
                      SequenceNumber job_snapshot,
                      const SnapshotChecker* snapshot_checker, Env* env,
-                     bool report_detailed_time, bool expect_valid_internal_key,
+                     bool report_detailed_time,
                      CompactionRangeDelAggregator* range_del_agg,
                      BlobFileBuilder* blob_file_builder,
                      bool allow_data_in_errors,
@@ -254,7 +265,21 @@ class CompactionIterator {
   }
   const CompactionIterationStats& iter_stats() const { return iter_stats_; }
   bool HasNumInputEntryScanned() const { return input_.HasNumItered(); }
+
+  // This method should only be used when `HasNumInputEntryScanned()` returns
+  // true, unless `must_count_input_entries=true` was specified during iterator
+  // creation (which ensures the count is always accurate).
   uint64_t NumInputEntryScanned() const { return input_.NumItered(); }
+
+  // Returns true if the current valid key was already scanned/counted during
+  // a lookahead operation in a previous iteration.
+  //
+  // REQUIRED: Valid() must be true
+  bool IsCurrentKeyAlreadyScanned() const {
+    assert(Valid());
+    return at_next_ || merge_out_iter_.Valid();
+  }
+
   Status InputStatus() const { return input_.status(); }
 
   bool IsDeleteRangeSentinelKey() const { return is_range_del_; }
@@ -347,7 +372,6 @@ class CompactionIterator {
   Env* env_;
   SystemClock* clock_;
   const bool report_detailed_time_;
-  const bool expect_valid_internal_key_;
   CompactionRangeDelAggregator* range_del_agg_;
   BlobFileBuilder* blob_file_builder_;
   std::unique_ptr<CompactionProxy> compaction_;
@@ -417,13 +441,15 @@ class CompactionIterator {
   // NextFromInput()).
   ParsedInternalKey ikey_;
 
-  // Stores whether ikey_.user_key is valid. If set to false, the user key is
-  // not compared against the current key in the underlying iterator.
+  // Stores whether current_user_key_ is valid. If so, current_user_key_
+  // stores the user key of the last key seen by the iterator.
+  // If false, treat the next key to read as a new user key.
   bool has_current_user_key_ = false;
   // If false, the iterator holds a copy of the current compaction iterator
   // output (or current key in the underlying iterator during NextFromInput()).
   bool at_next_ = false;
 
+  // A copy of the current internal key.
   IterKey current_key_;
   Slice current_user_key_;
   std::string curr_ts_;
@@ -433,8 +459,9 @@ class CompactionIterator {
   // True if the iterator has already returned a record for the current key.
   bool has_outputted_key_ = false;
 
-  // truncated the value of the next key and output it without applying any
-  // compaction rules.  This is used for outputting a put after a single delete.
+  // Truncate the value of the next key and output it without applying any
+  // compaction rules. This is an optimization for outputting a put after
+  // a single delete. See more in `NextFromInput()` under Optimization 3.
   bool clear_and_output_next_key_ = false;
 
   MergeOutputIterator merge_out_iter_;
diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc
index 974a4e1ff837..5ede0f4e1623 100644
--- a/db/compaction/compaction_iterator_test.cc
+++ b/db/compaction/compaction_iterator_test.cc
@@ -294,7 +294,7 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
         snapshots_.empty() ? kMaxSequenceNumber : snapshots_.at(0),
         earliest_write_conflict_snapshot, kMaxSequenceNumber,
         snapshot_checker_.get(), Env::Default(),
-        false /* report_detailed_time */, false, range_del_agg_.get(),
+        false /* report_detailed_time */, range_del_agg_.get(),
         nullptr /* blob_file_builder */, true /*allow_data_in_errors*/,
         true /*enforce_single_del_contracts*/,
         /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_,
@@ -374,8 +374,7 @@ TEST_P(CompactionIteratorTest, EmptyResult) {
   ASSERT_FALSE(c_iter_->Valid());
 }
 
-// If there is a corruption after a single deletion, the corrupted key should
-// be preserved.
+// If there is a corruption after a single deletion, the compaction should fail.
 TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
   InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
                  test::KeyStr("a", 3, kTypeValue, true),
@@ -386,14 +385,10 @@ TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
   ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion),
             c_iter_->key().ToString());
   c_iter_->Next();
-  ASSERT_TRUE(c_iter_->Valid());
-  ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString());
-  c_iter_->Next();
-  ASSERT_TRUE(c_iter_->Valid());
-  ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString());
-  c_iter_->Next();
-  ASSERT_OK(c_iter_->status());
+  // The iterator should now fail when encountering the corrupted key
   ASSERT_FALSE(c_iter_->Valid());
+  ASSERT_FALSE(c_iter_->status().ok());
+  ASSERT_TRUE(c_iter_->status().IsCorruption());
 }
 
 // Tests compatibility of TimedPut and SingleDelete. TimedPut should act as if
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 0ea74891e40d..8092a26069be 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -51,7 +51,9 @@
 #include "rocksdb/status.h"
 #include "rocksdb/table.h"
 #include "rocksdb/utilities/options_type.h"
+#include "table/format.h"
 #include "table/merging_iterator.h"
+#include "table/meta_blocks.h"
 #include "table/table_builder.h"
 #include "table/unique_id_impl.h"
 #include "test_util/sync_point.h"
@@ -109,16 +111,16 @@ const char* GetCompactionReasonString(CompactionReason compaction_reason) {
   }
 }
 
-const char* GetCompactionPenultimateOutputRangeTypeString(
-    Compaction::PenultimateOutputRangeType range_type) {
+const char* GetCompactionProximalOutputRangeTypeString(
+    Compaction::ProximalOutputRangeType range_type) {
   switch (range_type) {
-    case Compaction::PenultimateOutputRangeType::kNotSupported:
+    case Compaction::ProximalOutputRangeType::kNotSupported:
       return "NotSupported";
-    case Compaction::PenultimateOutputRangeType::kFullRange:
+    case Compaction::ProximalOutputRangeType::kFullRange:
       return "FullRange";
-    case Compaction::PenultimateOutputRangeType::kNonLastRange:
+    case Compaction::ProximalOutputRangeType::kNonLastRange:
       return "NonLastRange";
-    case Compaction::PenultimateOutputRangeType::kDisabled:
+    case Compaction::ProximalOutputRangeType::kDisabled:
       return "Disabled";
     default:
       assert(false);
@@ -126,6 +128,10 @@ const char* GetCompactionPenultimateOutputRangeTypeString(
   }
 }
 
+// Static constant for compaction abort flag - always false, used for
+// compaction service jobs that don't support abort signaling
+const std::atomic<int> CompactionJob::kCompactionAbortedFalse{0};
+
 CompactionJob::CompactionJob(
     int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
     const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
@@ -133,21 +139,18 @@ CompactionJob::CompactionJob(
     LogBuffer* log_buffer, FSDirectory* db_directory,
     FSDirectory* output_directory, FSDirectory* blob_output_directory,
     Statistics* stats, InstrumentedMutex* db_mutex,
-    ErrorHandler* db_error_handler,
-    std::vector<SequenceNumber> existing_snapshots,
-    SequenceNumber earliest_write_conflict_snapshot,
-    const SnapshotChecker* snapshot_checker, JobContext* job_context,
+    ErrorHandler* db_error_handler, JobContext* job_context,
     std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
     bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname,
     CompactionJobStats* compaction_job_stats, Env::Priority thread_pri,
     const std::shared_ptr<IOTracer>& io_tracer,
     const std::atomic<bool>& manual_compaction_canceled,
-    const std::string& db_id, const std::string& db_session_id,
-    std::string full_history_ts_low, std::string trim_ts,
-    BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
-    int* bg_bottom_compaction_scheduled)
+    const std::atomic<int>& compaction_aborted, const std::string& db_id,
+    const std::string& db_session_id, std::string full_history_ts_low,
+    std::string trim_ts, BlobFileCompletionCallback* blob_callback,
+    int* bg_compaction_scheduled, int* bg_bottom_compaction_scheduled)
     : compact_(new CompactionState(compaction)),
-      compaction_stats_(compaction->compaction_reason(), 1),
+      internal_stats_(compaction->compaction_reason(), 1),
       db_options_(db_options),
       mutable_db_options_copy_(mutable_db_options),
       log_buffer_(log_buffer),
@@ -155,7 +158,7 @@ CompactionJob::CompactionJob(
       stats_(stats),
       bottommost_level_(false),
       write_hint_(Env::WLTH_NOT_SET),
-      compaction_job_stats_(compaction_job_stats),
+      job_stats_(compaction_job_stats),
       job_id_(job_id),
       dbname_(dbname),
       db_id_(db_id),
@@ -169,16 +172,16 @@ CompactionJob::CompactionJob(
       versions_(versions),
       shutting_down_(shutting_down),
       manual_compaction_canceled_(manual_compaction_canceled),
+      compaction_aborted_(compaction_aborted),
       db_directory_(db_directory),
       blob_output_directory_(blob_output_directory),
       db_mutex_(db_mutex),
       db_error_handler_(db_error_handler),
-      existing_snapshots_(std::move(existing_snapshots)),
-      earliest_snapshot_(existing_snapshots_.empty()
-                             ? kMaxSequenceNumber
-                             : existing_snapshots_.at(0)),
-      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
-      snapshot_checker_(snapshot_checker),
+      // job_context cannot be nullptr, but we will assert later in the body of
+      // the constructor.
+      earliest_snapshot_(job_context
+                             ? job_context->GetEarliestSnapshotSequence()
+                             : kMaxSequenceNumber),
       job_context_(job_context),
       table_cache_(std::move(table_cache)),
       event_logger_(event_logger),
@@ -191,8 +194,10 @@ CompactionJob::CompactionJob(
       extra_num_subcompaction_threads_reserved_(0),
       bg_compaction_scheduled_(bg_compaction_scheduled),
       bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
-  assert(compaction_job_stats_ != nullptr);
+  assert(job_stats_ != nullptr);
   assert(log_buffer_ != nullptr);
+  assert(job_context);
+  assert(job_context->snapshot_context_initialized);
 
   const auto* cfd = compact_->compaction->column_family_data();
   ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking);
@@ -224,10 +229,9 @@ void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
       ThreadStatus::COMPACTION_PROP_FLAGS,
       compaction->is_manual_compaction() +
           (compaction->deletion_compaction() << 1));
-
+  auto total_input_bytes = compaction->CalculateTotalInputSize();
   ThreadStatusUtil::SetThreadOperationProperty(
-      ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
-      compaction->CalculateTotalInputSize());
+      ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, total_input_bytes);
 
   IOSTATS_RESET(bytes_written);
   IOSTATS_RESET(bytes_read);
@@ -240,14 +244,25 @@ void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
   // to ensure GetThreadList() can always show them all together.
   ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
 
-  compaction_job_stats_->is_manual_compaction =
-      compaction->is_manual_compaction();
-  compaction_job_stats_->is_full_compaction = compaction->is_full_compaction();
+  job_stats_->is_manual_compaction = compaction->is_manual_compaction();
+  job_stats_->is_full_compaction = compaction->is_full_compaction();
+  // populate compaction stats num_input_files and total_num_of_bytes
+  size_t num_input_files = 0;
+  for (int input_level = 0;
+       input_level < static_cast<int>(compaction->num_input_levels());
+       ++input_level) {
+    const LevelFilesBrief* flevel = compaction->input_levels(input_level);
+    num_input_files += flevel->num_files;
+  }
+  job_stats_->CompactionJobStats::num_input_files = num_input_files;
+  job_stats_->total_input_bytes = total_input_bytes;
 }
 
 void CompactionJob::Prepare(
     std::optional<std::pair<std::optional<Slice>, std::optional<Slice>>>
-        known_single_subcompact) {
+        known_single_subcompact,
+    const CompactionProgress& compaction_progress,
+    log::Writer* compaction_progress_writer) {
   db_mutex_->AssertHeld();
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_COMPACTION_PREPARE);
@@ -260,7 +275,8 @@ void CompactionJob::Prepare(
   assert(storage_info);
   assert(storage_info->NumLevelFiles(compact_->compaction->level()) > 0);
 
-  write_hint_ = storage_info->CalculateSSTWriteHint(c->output_level());
+  write_hint_ = storage_info->CalculateSSTWriteHint(
+      c->output_level(), db_options_.calculate_sst_write_lifetime_hint_set);
   bottommost_level_ = c->bottommost_level();
 
   if (!known_single_subcompact.has_value() && c->ShouldFormSubcompactions()) {
@@ -296,13 +312,15 @@ void CompactionJob::Prepare(
                                               /*sub_job_id*/ 0);
   }
 
+  MaybeAssignCompactionProgressAndWriter(compaction_progress,
+                                         compaction_progress_writer);
+
   // collect all seqno->time information from the input files which will be used
   // to encode seqno->time to the output files.
   SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber;
   SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber;
   uint64_t preserve_time_duration =
-      std::max(c->mutable_cf_options().preserve_internal_time_seconds,
-               c->mutable_cf_options().preclude_last_level_data_seconds);
+      MinAndMaxPreserveSeconds(c->mutable_cf_options()).max_preserve_seconds;
 
   if (preserve_time_duration > 0) {
     const ReadOptions read_options(Env::IOActivity::kCompaction);
@@ -379,8 +397,8 @@ void CompactionJob::Prepare(
   }
   // Now combine what we would like to preclude from last level with what we
   // can safely support without dangerously moving data back up the LSM tree,
-  // to get the final seqno threshold for penultimate vs. last. In particular,
-  // when the reserved output key range for the penultimate level does not
+  // to get the final seqno threshold for proximal vs. last. In particular,
+  // when the reserved output key range for the proximal level does not
   // include the entire last level input key range, we need to keep entries
   // already in the last level there. (Even allowing within-range entries to
   // move back up could cause problems with range tombstones. Perhaps it
@@ -389,12 +407,31 @@ void CompactionJob::Prepare(
   // tracking and complexity to CompactionIterator that is probably not
   // worthwhile overall. Correctness is also more clear when splitting by
   // seqno threshold.)
-  penultimate_after_seqno_ = std::max(preclude_last_level_min_seqno,
-                                      c->GetKeepInLastLevelThroughSeqno());
+  proximal_after_seqno_ = std::max(preclude_last_level_min_seqno,
+                                   c->GetKeepInLastLevelThroughSeqno());
 
   options_file_number_ = versions_->options_file_number();
 }
 
+void CompactionJob::MaybeAssignCompactionProgressAndWriter(
+    const CompactionProgress& compaction_progress,
+    log::Writer* compaction_progress_writer) {
+  // LIMITATION: Only supports resuming single subcompaction for now
+  if (compact_->sub_compact_states.size() != 1) {
+    return;
+  }
+
+  if (!compaction_progress.empty()) {
+    assert(compaction_progress.size() == 1);
+    SubcompactionState* sub_compact = &compact_->sub_compact_states[0];
+    const SubcompactionProgress& subcompaction_progress =
+        compaction_progress[0];
+    sub_compact->SetSubcompactionProgress(subcompaction_progress);
+  }
+
+  compaction_progress_writer_ = compaction_progress_writer;
+}
+
 uint64_t CompactionJob::GetSubcompactionsLimit() {
   return extra_num_subcompaction_threads_reserved_ +
          std::max(
@@ -667,16 +704,18 @@ void CompactionJob::GenSubcompactionBoundaries() {
                extra_num_subcompaction_threads_reserved_));
 }
 
-Status CompactionJob::Run() {
+void CompactionJob::InitializeCompactionRun() {
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_COMPACTION_RUN);
   TEST_SYNC_POINT("CompactionJob::Run():Start");
   log_buffer_->FlushBufferToLog();
   LogCompaction();
+}
 
+void CompactionJob::RunSubcompactions() {
+  TEST_SYNC_POINT("CompactionJob::RunSubcompactions:BeforeStart");
   const size_t num_threads = compact_->sub_compact_states.size();
   assert(num_threads > 0);
-  const uint64_t start_micros = db_options_.clock->NowMicros();
   compact_->compaction->GetOrInitInputTableProperties();
 
   // Launch a thread for each of subcompactions 1...num_threads-1
@@ -695,25 +734,108 @@ Status CompactionJob::Run() {
   for (auto& thread : thread_pool) {
     thread.join();
   }
+  RemoveEmptyOutputs();
+
+  ReleaseSubcompactionResources();
+  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources");
+}
 
-  compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
+void CompactionJob::UpdateTimingStats(uint64_t start_micros) {
+  internal_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
 
   for (auto& state : compact_->sub_compact_states) {
-    compaction_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
-    state.RemoveLastEmptyOutput();
+    internal_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
   }
 
   RecordTimeToHistogram(stats_, COMPACTION_TIME,
-                        compaction_stats_.stats.micros);
+                        internal_stats_.output_level_stats.micros);
   RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
-                        compaction_stats_.stats.cpu_micros);
+                        internal_stats_.output_level_stats.cpu_micros);
+}
 
-  TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+void CompactionJob::RemoveEmptyOutputs() {
+  for (auto& state : compact_->sub_compact_states) {
+    state.RemoveLastEmptyOutput();
+  }
+}
+
+void CompactionJob::CleanupAbortedSubcompactions() {
+  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
 
-  // Check if any thread encountered an error during execution
+  uint64_t total_sst_files_deleted = 0;
+  uint64_t total_blob_files_deleted = 0;
+
+  // Track the first file deletion error to report at the end
+  Status first_error;
+  int deletion_errors = 0;
+
+  // Mark all subcompactions as aborted and delete their output files
+  for (auto& sub_compact : compact_->sub_compact_states) {
+    // Mark this subcompaction as aborted
+    sub_compact.status =
+        Status::Incomplete(Status::SubCode::kCompactionAborted);
+
+    // Delete all files (SST and blob) tracked during compaction.
+    // GetOutputFilePaths() contains ALL file paths created, including
+    // in-progress files that may have been removed from outputs_ or
+    // blob_file_additions_.
+    for (const bool is_proximal_level : {false, true}) {
+      if (is_proximal_level &&
+          !compact_->compaction->SupportsPerKeyPlacement()) {
+        continue;
+      }
+      for (const std::string& file_path :
+           sub_compact.Outputs(is_proximal_level)->GetOutputFilePaths()) {
+        Status s = env_->DeleteFile(file_path);
+        if (s.ok()) {
+          // Count SST vs blob files by checking extension
+          if (file_path.find(".sst") != std::string::npos) {
+            total_sst_files_deleted++;
+          } else if (file_path.find(".blob") != std::string::npos) {
+            total_blob_files_deleted++;
+          }
+        } else if (!s.IsNotFound()) {
+          if (first_error.ok()) {
+            first_error = s;
+          }
+          deletion_errors++;
+        }
+      }
+    }
+    sub_compact.CleanupOutputs();
+  }
+
+  if (stats_) {
+    RecordTick(stats_, COMPACTION_ABORTED);
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "[%s] [JOB %d] Compaction aborted: deleted %" PRIu64
+                 " SST files and %" PRIu64 " blob files",
+                 cfd->GetName().c_str(), job_id_, total_sst_files_deleted,
+                 total_blob_files_deleted);
+
+  if (!first_error.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "[%s] [JOB %d] Cleanup completed with %d file deletion "
+                    "errors. First error: %s",
+                    cfd->GetName().c_str(), job_id_, deletion_errors,
+                    first_error.ToString().c_str());
+  }
+}
+
+bool CompactionJob::HasNewBlobFiles() const {
+  for (const auto& state : compact_->sub_compact_states) {
+    if (state.Current().HasBlobFileAdditions()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+Status CompactionJob::CollectSubcompactionErrors() {
   Status status;
   IOStatus io_s;
-  bool wrote_new_blob_files = false;
 
   for (const auto& state : compact_->sub_compact_states) {
     if (!state.status.ok()) {
@@ -721,127 +843,161 @@ Status CompactionJob::Run() {
       io_s = state.io_status;
       break;
     }
-
-    if (state.Current().HasBlobFileAdditions()) {
-      wrote_new_blob_files = true;
-    }
   }
 
   if (io_status_.ok()) {
     io_status_ = io_s;
   }
-  if (status.ok()) {
-    constexpr IODebugContext* dbg = nullptr;
 
-    if (output_directory_) {
-      io_s = output_directory_->FsyncWithDirOptions(
-          IOOptions(), dbg,
-          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
-    }
+  return status;
+}
 
-    if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
-        blob_output_directory_ != output_directory_) {
-      io_s = blob_output_directory_->FsyncWithDirOptions(
-          IOOptions(), dbg,
-          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
-    }
+Status CompactionJob::SyncOutputDirectories() {
+  Status status;
+  IOStatus io_s;
+  constexpr IODebugContext* dbg = nullptr;
+  const bool wrote_new_blob_files = HasNewBlobFiles();
+  if (output_directory_) {
+    io_s = output_directory_->FsyncWithDirOptions(
+        IOOptions(), dbg,
+        DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+  }
+
+  if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
+      blob_output_directory_ != output_directory_) {
+    io_s = blob_output_directory_->FsyncWithDirOptions(
+        IOOptions(), dbg,
+        DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
   }
+
   if (io_status_.ok()) {
     io_status_ = io_s;
   }
   if (status.ok()) {
     status = io_s;
   }
-  if (status.ok()) {
-    thread_pool.clear();
-    std::vector<const CompactionOutputs::Output*> files_output;
-    for (const auto& state : compact_->sub_compact_states) {
-      for (const auto& output : state.GetOutputs()) {
-        files_output.emplace_back(&output);
-      }
-    }
-    ColumnFamilyData* cfd = compact_->compaction->column_family_data();
-    std::atomic<size_t> next_file_idx(0);
-    auto verify_table = [&](Status& output_status) {
-      while (true) {
-        size_t file_idx = next_file_idx.fetch_add(1);
-        if (file_idx >= files_output.size()) {
-          break;
-        }
-        // Verify that the table is usable
-        // We set for_compaction to false and don't
-        // OptimizeForCompactionTableRead here because this is a special case
-        // after we finish the table building No matter whether
-        // use_direct_io_for_flush_and_compaction is true, we will regard this
-        // verification as user reads since the goal is to cache it here for
-        // further user reads
-        ReadOptions verify_table_read_options(Env::IOActivity::kCompaction);
-        verify_table_read_options.rate_limiter_priority =
-            GetRateLimiterPriority();
-        InternalIterator* iter = cfd->table_cache()->NewIterator(
-            verify_table_read_options, file_options_,
-            cfd->internal_comparator(), files_output[file_idx]->meta,
-            /*range_del_agg=*/nullptr,
-            compact_->compaction->mutable_cf_options(),
-            /*table_reader_ptr=*/nullptr,
-            cfd->internal_stats()->GetFileReadHist(
-                compact_->compaction->output_level()),
-            TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
-            /*skip_filters=*/false, compact_->compaction->output_level(),
-            MaxFileSizeForL0MetaPin(compact_->compaction->mutable_cf_options()),
-            /*smallest_compaction_key=*/nullptr,
-            /*largest_compaction_key=*/nullptr,
-            /*allow_unprepared_value=*/false);
-        auto s = iter->status();
-
-        if (s.ok() && paranoid_file_checks_) {
-          OutputValidator validator(cfd->internal_comparator(),
-                                    /*_enable_hash=*/true);
-          for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-            s = validator.Add(iter->key(), iter->value());
-            if (!s.ok()) {
-              break;
-            }
-          }
-          if (s.ok()) {
-            s = iter->status();
+
+  return status;
+}
+
+Status CompactionJob::VerifyOutputFiles() {
+  Status status;
+  std::vector<port::Thread> thread_pool;
+  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+  VerifyOutputFlags verify_output_flags =
+      compact_->compaction->mutable_cf_options().verify_output_flags;
+
+  // For backward compatibility
+  if (paranoid_file_checks_) {
+    verify_output_flags |= VerifyOutputFlags::kVerifyIteration;
+    verify_output_flags |= VerifyOutputFlags::kEnableForLocalCompaction;
+    verify_output_flags |= VerifyOutputFlags::kEnableForRemoteCompaction;
+  }
+
+  auto verify_table = [&](SubcompactionState& subcompaction_state) {
+    for (const auto& output_file : subcompaction_state.GetOutputs()) {
+      // Verify that the table is usable
+      // We set for_compaction to false and don't
+      // OptimizeForCompactionTableRead here because this is a special case
+      // after we finish the table building No matter whether
+      // use_direct_io_for_flush_and_compaction is true, we will regard this
+      // verification as user reads since the goal is to cache it here for
+      // further user reads
+      ReadOptions verify_table_read_options(Env::IOActivity::kCompaction);
+      verify_table_read_options.verify_checksums = true;
+      verify_table_read_options.readahead_size =
+          file_options_for_read_.compaction_readahead_size;
+
+      std::unique_ptr<TableReader> table_reader_guard;
+      TableReader* table_reader_ptr = table_reader_guard.get();
+      verify_table_read_options.rate_limiter_priority =
+          GetRateLimiterPriority();
+      InternalIterator* iter = cfd->table_cache()->NewIterator(
+          verify_table_read_options, file_options_, cfd->internal_comparator(),
+          output_file.meta,
+          /*range_del_agg=*/nullptr, compact_->compaction->mutable_cf_options(),
+          /*table_reader_ptr=*/&table_reader_ptr,
+          cfd->internal_stats()->GetFileReadHist(
+              compact_->compaction->output_level()),
+          TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
+          /*skip_filters=*/false, compact_->compaction->output_level(),
+          MaxFileSizeForL0MetaPin(compact_->compaction->mutable_cf_options()),
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr,
+          /*allow_unprepared_value=*/false);
+      auto s = iter->status();
+      if (s.ok()) {
+        // Check for remote/local compaction and verify_output_flags flags
+        const bool should_verify =
+            (subcompaction_state.compaction_job_stats.is_remote_compaction &&
+             !!(verify_output_flags &
+                VerifyOutputFlags::kEnableForRemoteCompaction)) ||
+            (!subcompaction_state.compaction_job_stats.is_remote_compaction &&
+             !!(verify_output_flags &
+                VerifyOutputFlags::kEnableForLocalCompaction));
+
+        if (should_verify) {
+          const bool should_verify_block_checksum =
+              !!(verify_output_flags & VerifyOutputFlags::kVerifyBlockChecksum);
+          const bool should_verify_iteration =
+              !!(verify_output_flags & VerifyOutputFlags::kVerifyIteration);
+          if (should_verify_block_checksum) {
+            assert(table_reader_ptr != nullptr);
+            // If verifying iteration as well, verify meta blocks here only to
+            // avoid redundant checks on data blocks
+            s = table_reader_ptr->VerifyChecksum(
+                verify_table_read_options, TableReaderCaller::kCompaction,
+                /*meta_blocks_only=*/should_verify_iteration);
           }
-          if (s.ok() &&
-              !validator.CompareValidator(files_output[file_idx]->validator)) {
-            s = Status::Corruption("Paranoid checksums do not match");
+          if (s.ok() && should_verify_iteration) {
+            OutputValidator validator(cfd->internal_comparator(),
+                                      /*_enable_hash=*/true);
+            for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+              s = validator.Add(iter->key(), iter->value());
+              if (!s.ok()) {
+                break;
+              }
+            }
+            if (s.ok()) {
+              s = iter->status();
+            }
+            if (s.ok() && !validator.CompareValidator(output_file.validator)) {
+              s = Status::Corruption(
+                  "Key-value checksum of compaction output doesn't match what "
+                  "was computed when written");
+            }
           }
         }
+      }
 
-        delete iter;
+      delete iter;
 
-        if (!s.ok()) {
-          output_status = s;
-          break;
-        }
+      if (!s.ok()) {
+        subcompaction_state.status = s;
+        break;
       }
-    };
-    for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
-      thread_pool.emplace_back(
-          verify_table, std::ref(compact_->sub_compact_states[i].status));
-    }
-    verify_table(compact_->sub_compact_states[0].status);
-    for (auto& thread : thread_pool) {
-      thread.join();
     }
+  };
+  for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+    thread_pool.emplace_back(verify_table,
+                             std::ref(compact_->sub_compact_states[i]));
+  }
+  verify_table(compact_->sub_compact_states[0]);
+  for (auto& thread : thread_pool) {
+    thread.join();
+  }
 
-    for (const auto& state : compact_->sub_compact_states) {
-      if (!state.status.ok()) {
-        status = state.status;
-        break;
-      }
+  for (const auto& state : compact_->sub_compact_states) {
+    if (!state.status.ok()) {
+      status = state.status;
+      break;
     }
   }
 
-  ReleaseSubcompactionResources();
-  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0");
-  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1");
+  return status;
+}
 
-  TablePropertiesCollection tp;
+void CompactionJob::SetOutputTableProperties() {
   for (const auto& state : compact_->sub_compact_states) {
     for (const auto& output : state.GetOutputs()) {
       auto fn =
@@ -851,56 +1007,109 @@ Status CompactionJob::Run() {
                                                      output.table_properties);
     }
   }
+}
 
+void CompactionJob::AggregateSubcompactionOutputAndJobStats() {
   // Before the compaction starts, is_remote_compaction was set to true if
   // compaction_service is set. We now know whether each sub_compaction was
   // done remotely or not. Reset is_remote_compaction back to false and allow
   // AggregateCompactionStats() to set the right value.
-  compaction_job_stats_->is_remote_compaction = false;
+  job_stats_->is_remote_compaction = false;
 
   // Finish up all bookkeeping to unify the subcompaction results.
-  compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
-  uint64_t num_input_range_del = 0;
-  bool ok = UpdateCompactionStats(&num_input_range_del);
-  // (Sub)compactions returned ok, do sanity check on the number of input keys.
-  if (status.ok() && ok && compaction_job_stats_->has_num_input_records) {
-    size_t ts_sz = compact_->compaction->column_family_data()
-                       ->user_comparator()
-                       ->timestamp_size();
-    // When trim_ts_ is non-empty, CompactionIterator takes
-    // HistoryTrimmingIterator as input iterator and sees a trimmed view of
-    // input keys. So the number of keys it processed is not suitable for
-    // verification here.
-    // TODO: support verification when trim_ts_ is non-empty.
-    if (!(ts_sz > 0 && !trim_ts_.empty())) {
-      assert(compaction_stats_.stats.num_input_records > 0);
-      // TODO: verify the number of range deletion entries.
-      uint64_t expected =
-          compaction_stats_.stats.num_input_records - num_input_range_del;
-      uint64_t actual = compaction_job_stats_->num_input_records;
-      if (expected != actual) {
-        char scratch[2345];
-        compact_->compaction->Summary(scratch, sizeof(scratch));
-        std::string msg =
-            "Compaction number of input keys does not match "
-            "number of keys processed. Expected " +
-            std::to_string(expected) + " but processed " +
-            std::to_string(actual) + ". Compaction summary: " + scratch;
-        ROCKS_LOG_WARN(
-            db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s",
-            compact_->compaction->column_family_data()->GetName().c_str(),
-            job_context_->job_id, msg.c_str());
-        if (db_options_.compaction_verify_record_count) {
-          status = Status::Corruption(msg);
-        }
-      }
+  compact_->AggregateCompactionStats(internal_stats_, *job_stats_);
+}
+
+Status CompactionJob::VerifyCompactionRecordCounts(
+    bool stats_built_from_input_table_prop, uint64_t num_input_range_del) {
+  Status status;
+  if (stats_built_from_input_table_prop &&
+      job_stats_->has_accurate_num_input_records) {
+    status = VerifyInputRecordCount(num_input_range_del);
+    if (!status.ok()) {
+      return status;
+    }
+  }
+
+  const auto& mutable_cf_options = compact_->compaction->mutable_cf_options();
+  if ((mutable_cf_options.table_factory->IsInstanceOf(
+           TableFactory::kBlockBasedTableName()) ||
+       mutable_cf_options.table_factory->IsInstanceOf(
+           TableFactory::kPlainTableName()))) {
+    status = VerifyOutputRecordCount();
+    if (!status.ok()) {
+      return status;
     }
   }
+  return status;
+}
+
+void CompactionJob::FinalizeCompactionRun(
+    const Status& input_status, bool stats_built_from_input_table_prop,
+    uint64_t num_input_range_del) {
+  if (stats_built_from_input_table_prop) {
+    UpdateCompactionJobInputStatsFromInternalStats(internal_stats_,
+                                                   num_input_range_del);
+  }
+  UpdateCompactionJobOutputStatsFromInternalStats(input_status,
+                                                  internal_stats_);
   RecordCompactionIOStats();
+
   LogFlush(db_options_.info_log);
   TEST_SYNC_POINT("CompactionJob::Run():End");
-  compact_->status = status;
-  TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet", &status);
+  compact_->status = input_status;
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet",
+                           const_cast<Status*>(&input_status));
+}
+
+Status CompactionJob::Run() {
+  InitializeCompactionRun();
+
+  const uint64_t start_micros = db_options_.clock->NowMicros();
+
+  RunSubcompactions();
+
+  UpdateTimingStats(start_micros);
+
+  TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+
+  Status status = CollectSubcompactionErrors();
+
+  // If compaction was aborted or manually paused, clean up any output files
+  // from completed subcompactions to prevent orphaned files on disk.
+  // Skip cleanup for resumable compaction (when progress writer is set)
+  // because the output files are needed for resumption.
+  if ((status.IsCompactionAborted() || status.IsManualCompactionPaused()) &&
+      compaction_progress_writer_ == nullptr) {
+    CleanupAbortedSubcompactions();
+  }
+
+  if (status.ok()) {
+    status = SyncOutputDirectories();
+  }
+
+  if (status.ok()) {
+    status = VerifyOutputFiles();
+  }
+
+  if (status.ok()) {
+    SetOutputTableProperties();
+  }
+
+  AggregateSubcompactionOutputAndJobStats();
+
+  uint64_t num_input_range_del = 0;
+  bool stats_built_from_input_table_prop =
+      UpdateInternalStatsFromInputFiles(&num_input_range_del);
+
+  if (status.ok()) {
+    status = VerifyCompactionRecordCounts(stats_built_from_input_table_prop,
+                                          num_input_range_del);
+  }
+
+  FinalizeCompactionRun(status, stats_built_from_input_table_prop,
+                        num_input_range_del);
+
   return status;
 }
 
@@ -917,7 +1126,7 @@ Status CompactionJob::Install(bool* compaction_released) {
 
   int output_level = compact_->compaction->output_level();
   cfd->internal_stats()->AddCompactionStats(output_level, thread_pri_,
-                                            compaction_stats_);
+                                            internal_stats_);
 
   if (status.ok()) {
     status = InstallCompactionResults(compaction_released);
@@ -928,7 +1137,7 @@ Status CompactionJob::Install(bool* compaction_released) {
 
   VersionStorageInfo::LevelSummaryStorage tmp;
   auto vstorage = cfd->current()->storage_info();
-  const auto& stats = compaction_stats_.stats;
+  const auto& stats = internal_stats_.output_level_stats;
 
   double read_write_amp = 0.0;
   double write_amp = 0.0;
@@ -994,19 +1203,20 @@ Status CompactionJob::Install(bool* compaction_released) {
         blob_files.back()->GetBlobFileNumber());
   }
 
-  if (compaction_stats_.has_penultimate_level_output) {
-    ROCKS_LOG_BUFFER(
-        log_buffer_,
-        "[%s] has Penultimate Level output: %" PRIu64
-        ", level %d, number of files: %" PRIu64 ", number of records: %" PRIu64,
-        column_family_name.c_str(),
-        compaction_stats_.penultimate_level_stats.bytes_written,
-        compact_->compaction->GetPenultimateLevel(),
-        compaction_stats_.penultimate_level_stats.num_output_files,
-        compaction_stats_.penultimate_level_stats.num_output_records);
+  if (internal_stats_.has_proximal_level_output) {
+    ROCKS_LOG_BUFFER(log_buffer_,
+                     "[%s] has Proximal Level output: %" PRIu64
+                     ", level %d, number of files: %" PRIu64
+                     ", number of records: %" PRIu64,
+                     column_family_name.c_str(),
+                     internal_stats_.proximal_level_stats.bytes_written,
+                     compact_->compaction->GetProximalLevel(),
+                     internal_stats_.proximal_level_stats.num_output_files,
+                     internal_stats_.proximal_level_stats.num_output_records);
   }
 
-  UpdateCompactionJobStats(stats);
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionJob::Install:AfterUpdateCompactionJobStats", job_stats_);
 
   auto stream = event_logger_->LogToBuffer(log_buffer_, 8192);
   stream << "job" << job_id_ << "event" << "compaction_finished"
@@ -1028,17 +1238,16 @@ Status CompactionJob::Install(bool* compaction_released) {
          << CompressionTypeToString(compact_->compaction->output_compression());
 
   stream << "num_single_delete_mismatches"
-         << compaction_job_stats_->num_single_del_mismatch;
+         << job_stats_->num_single_del_mismatch;
   stream << "num_single_delete_fallthrough"
-         << compaction_job_stats_->num_single_del_fallthru;
+         << job_stats_->num_single_del_fallthru;
 
   if (measure_io_stats_) {
-    stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
-    stream << "file_range_sync_nanos"
-           << compaction_job_stats_->file_range_sync_nanos;
-    stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+    stream << "file_write_nanos" << job_stats_->file_write_nanos;
+    stream << "file_range_sync_nanos" << job_stats_->file_range_sync_nanos;
+    stream << "file_fsync_nanos" << job_stats_->file_fsync_nanos;
     stream << "file_prepare_write_nanos"
-           << compaction_job_stats_->file_prepare_write_nanos;
+           << job_stats_->file_prepare_write_nanos;
   }
 
   stream << "lsm_state";
@@ -1056,16 +1265,16 @@ Status CompactionJob::Install(bool* compaction_released) {
     stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
   }
 
-  if (compaction_stats_.has_penultimate_level_output) {
+  if (internal_stats_.has_proximal_level_output) {
     InternalStats::CompactionStats& pl_stats =
-        compaction_stats_.penultimate_level_stats;
-    stream << "penultimate_level_num_output_files" << pl_stats.num_output_files;
-    stream << "penultimate_level_bytes_written" << pl_stats.bytes_written;
-    stream << "penultimate_level_num_output_records"
+        internal_stats_.proximal_level_stats;
+    stream << "proximal_level_num_output_files" << pl_stats.num_output_files;
+    stream << "proximal_level_bytes_written" << pl_stats.bytes_written;
+    stream << "proximal_level_num_output_records"
            << pl_stats.num_output_records;
-    stream << "penultimate_level_num_output_files_blob"
+    stream << "proximal_level_num_output_files_blob"
            << pl_stats.num_output_files_blob;
-    stream << "penultimate_level_bytes_written_blob"
+    stream << "proximal_level_bytes_written_blob"
            << pl_stats.bytes_written_blob;
   }
 
@@ -1124,59 +1333,62 @@ void CompactionJob::NotifyOnSubcompactionCompleted(
   }
 }
 
-void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
-  assert(sub_compact);
-  assert(sub_compact->compaction);
+bool CompactionJob::ShouldUseLocalCompaction(SubcompactionState* sub_compact) {
   if (db_options_.compaction_service) {
     CompactionServiceJobStatus comp_status =
         ProcessKeyValueCompactionWithCompactionService(sub_compact);
-    if (comp_status == CompactionServiceJobStatus::kSuccess ||
-        comp_status == CompactionServiceJobStatus::kFailure) {
-      return;
+    if (comp_status != CompactionServiceJobStatus::kUseLocal) {
+      return false;
     }
     // fallback to local compaction
     assert(comp_status == CompactionServiceJobStatus::kUseLocal);
     sub_compact->compaction_job_stats.is_remote_compaction = false;
   }
+  return true;
+}
 
-  uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
+CompactionJob::CompactionIOStatsSnapshot CompactionJob::InitializeIOStats() {
+  CompactionIOStatsSnapshot io_stats;
 
-  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+  if (measure_io_stats_) {
+    io_stats.prev_perf_level = GetPerfLevel();
+    SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+    io_stats.prev_write_nanos = IOSTATS(write_nanos);
+    io_stats.prev_fsync_nanos = IOSTATS(fsync_nanos);
+    io_stats.prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+    io_stats.prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+    io_stats.prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+    io_stats.prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+  }
+
+  return io_stats;
+}
+
+Status CompactionJob::SetupAndValidateCompactionFilter(
+    SubcompactionState* sub_compact,
+    const CompactionFilter* configured_compaction_filter,
+    const CompactionFilter*& compaction_filter,
+    std::unique_ptr<CompactionFilter>& compaction_filter_from_factory) {
+  compaction_filter = configured_compaction_filter;
 
-  // Create compaction filter and fail the compaction if
-  // IgnoreSnapshots() = false because it is not supported anymore
-  const CompactionFilter* compaction_filter = cfd->ioptions().compaction_filter;
-  std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
   if (compaction_filter == nullptr) {
     compaction_filter_from_factory =
         sub_compact->compaction->CreateCompactionFilter();
     compaction_filter = compaction_filter_from_factory.get();
   }
+
   if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) {
-    sub_compact->status = Status::NotSupported(
+    return Status::NotSupported(
         "CompactionFilter::IgnoreSnapshots() = false is not supported "
         "anymore.");
-    return;
   }
 
-  NotifyOnSubcompactionBegin(sub_compact);
-
-  // This is assigned after creation of SubcompactionState to simplify that
-  // creation across both CompactionJob and CompactionServiceCompactionJob
-  sub_compact->AssignRangeDelAggregator(
-      std::make_unique<CompactionRangeDelAggregator>(
-          &cfd->internal_comparator(), existing_snapshots_,
-          &full_history_ts_low_, &trim_ts_));
-
-  // TODO: since we already use C++17, should use
-  // std::optional<const Slice> instead.
-  const std::optional<Slice> start = sub_compact->start;
-  const std::optional<Slice> end = sub_compact->end;
-
-  std::optional<Slice> start_without_ts;
-  std::optional<Slice> end_without_ts;
+  return Status::OK();
+}
 
-  ReadOptions read_options;
+void CompactionJob::InitializeReadOptionsAndBoundaries(
+    const size_t ts_sz, ReadOptions& read_options,
+    SubcompactionKeyBoundaries& boundaries) {
   read_options.verify_checksums = true;
   read_options.fill_cache = false;
   read_options.rate_limiter_priority = GetRateLimiterPriority();
@@ -1187,242 +1399,245 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   // (b) CompactionFilter::Decision::kRemoveAndSkipUntil.
   read_options.total_order_seek = true;
 
-  const WriteOptions write_options(Env::IOPriority::IO_LOW,
-                                   Env::IOActivity::kCompaction);
-
   // Remove the timestamps from boundaries because boundaries created in
   // GenSubcompactionBoundaries doesn't strip away the timestamp.
-  size_t ts_sz = cfd->user_comparator()->timestamp_size();
-  if (start.has_value()) {
-    read_options.iterate_lower_bound = &(*start);
+  if (boundaries.start.has_value()) {
+    read_options.iterate_lower_bound = &(*boundaries.start);
     if (ts_sz > 0) {
-      start_without_ts = StripTimestampFromUserKey(*start, ts_sz);
-      read_options.iterate_lower_bound = &(*start_without_ts);
+      boundaries.start_without_ts =
+          StripTimestampFromUserKey(*boundaries.start, ts_sz);
+      read_options.iterate_lower_bound = &(*boundaries.start_without_ts);
     }
   }
-  if (end.has_value()) {
-    read_options.iterate_upper_bound = &(*end);
+  if (boundaries.end.has_value()) {
+    read_options.iterate_upper_bound = &(*boundaries.end);
     if (ts_sz > 0) {
-      end_without_ts = StripTimestampFromUserKey(*end, ts_sz);
-      read_options.iterate_upper_bound = &(*end_without_ts);
+      boundaries.end_without_ts =
+          StripTimestampFromUserKey(*boundaries.end, ts_sz);
+      read_options.iterate_upper_bound = &(*boundaries.end_without_ts);
     }
   }
 
-  // Although the v2 aggregator is what the level iterator(s) know about,
-  // the AddTombstones calls will be propagated down to the v1 aggregator.
-  std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator(
-      read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
-      file_options_for_read_, start, end));
-  InternalIterator* input = raw_input.get();
-
-  IterKey start_ikey;
-  IterKey end_ikey;
-  Slice start_slice;
-  Slice end_slice;
-  Slice start_user_key{};
-  Slice end_user_key{};
-
-  static constexpr char kMaxTs[] =
-      "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
-  Slice ts_slice;
-  std::string max_ts;
   if (ts_sz > 0) {
-    if (ts_sz <= strlen(kMaxTs)) {
-      ts_slice = Slice(kMaxTs, ts_sz);
+    if (ts_sz <= strlen(boundaries.kMaxTs)) {
+      boundaries.ts_slice = Slice(boundaries.kMaxTs, ts_sz);
     } else {
-      max_ts = std::string(ts_sz, '\xff');
-      ts_slice = Slice(max_ts);
+      boundaries.max_ts = std::string(ts_sz, '\xff');
+      boundaries.ts_slice = Slice(boundaries.max_ts);
     }
   }
-
-  if (start.has_value()) {
-    start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
+  if (boundaries.start.has_value()) {
+    boundaries.start_ikey.SetInternalKey(*boundaries.start, kMaxSequenceNumber,
+                                         kValueTypeForSeek);
     if (ts_sz > 0) {
-      start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
-                                   &ts_slice);
+      boundaries.start_ikey.UpdateInternalKey(
+          kMaxSequenceNumber, kValueTypeForSeek, &boundaries.ts_slice);
     }
-    start_slice = start_ikey.GetInternalKey();
-    start_user_key = start_ikey.GetUserKey();
+    boundaries.start_internal_key = boundaries.start_ikey.GetInternalKey();
+    boundaries.start_user_key = boundaries.start_ikey.GetUserKey();
   }
-  if (end.has_value()) {
-    end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek);
+  if (boundaries.end.has_value()) {
+    boundaries.end_ikey.SetInternalKey(*boundaries.end, kMaxSequenceNumber,
+                                       kValueTypeForSeek);
     if (ts_sz > 0) {
-      end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
-                                 &ts_slice);
+      boundaries.end_ikey.UpdateInternalKey(
+          kMaxSequenceNumber, kValueTypeForSeek, &boundaries.ts_slice);
     }
-    end_slice = end_ikey.GetInternalKey();
-    end_user_key = end_ikey.GetUserKey();
+    boundaries.end_internal_key = boundaries.end_ikey.GetInternalKey();
+    boundaries.end_user_key = boundaries.end_ikey.GetUserKey();
   }
+}
 
-  std::unique_ptr<InternalIterator> clip;
-  if (start.has_value() || end.has_value()) {
-    clip = std::make_unique<ClippingIterator>(
-        raw_input.get(), start.has_value() ? &start_slice : nullptr,
-        end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
-    input = clip.get();
-  }
+InternalIterator* CompactionJob::CreateInputIterator(
+    SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+    SubcompactionInternalIterators& iterators,
+    SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options) {
+  const size_t ts_sz = cfd->user_comparator()->timestamp_size();
+  InitializeReadOptionsAndBoundaries(ts_sz, read_options, boundaries);
+
+  // This is assigned after creation of SubcompactionState to simplify that
+  // creation across both CompactionJob and CompactionServiceCompactionJob
+  sub_compact->AssignRangeDelAggregator(
+      std::make_unique<CompactionRangeDelAggregator>(
+          &cfd->internal_comparator(), job_context_->snapshot_seqs,
+          &full_history_ts_low_, &trim_ts_));
+
+  // Although the v2 aggregator is what the level iterator(s) know about,
+  // the AddTombstones calls will be propagated down to the v1 aggregator.
+  iterators.raw_input =
+      std::unique_ptr<InternalIterator>(versions_->MakeInputIterator(
+          read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
+          file_options_for_read_, boundaries.start, boundaries.end));
+  InternalIterator* input = iterators.raw_input.get();
 
-  std::unique_ptr<InternalIterator> blob_counter;
+  if (boundaries.start.has_value() || boundaries.end.has_value()) {
+    iterators.clip = std::make_unique<ClippingIterator>(
+        iterators.raw_input.get(),
+        boundaries.start.has_value() ? &boundaries.start_internal_key : nullptr,
+        boundaries.end.has_value() ? &boundaries.end_internal_key : nullptr,
+        &cfd->internal_comparator());
+    input = iterators.clip.get();
+  }
 
   if (sub_compact->compaction->DoesInputReferenceBlobFiles()) {
     BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter();
-    blob_counter = std::make_unique<BlobCountingIterator>(input, meter);
-    input = blob_counter.get();
+    iterators.blob_counter =
+        std::make_unique<BlobCountingIterator>(input, meter);
+    input = iterators.blob_counter.get();
   }
 
-  std::unique_ptr<InternalIterator> trim_history_iter;
   if (ts_sz > 0 && !trim_ts_.empty()) {
-    trim_history_iter = std::make_unique<HistoryTrimmingIterator>(
+    iterators.trim_history_iter = std::make_unique<HistoryTrimmingIterator>(
         input, cfd->user_comparator(), trim_ts_);
-    input = trim_history_iter.get();
+    input = iterators.trim_history_iter.get();
   }
 
-  input->SeekToFirst();
-
-  AutoThreadOperationStageUpdater stage_updater(
-      ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
-
-  // I/O measurement variables
-  PerfLevel prev_perf_level = PerfLevel::kEnableTime;
-  const uint64_t kRecordStatsEvery = 1000;
-  uint64_t prev_write_nanos = 0;
-  uint64_t prev_fsync_nanos = 0;
-  uint64_t prev_range_sync_nanos = 0;
-  uint64_t prev_prepare_write_nanos = 0;
-  uint64_t prev_cpu_write_nanos = 0;
-  uint64_t prev_cpu_read_nanos = 0;
-  if (measure_io_stats_) {
-    prev_perf_level = GetPerfLevel();
-    SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
-    prev_write_nanos = IOSTATS(write_nanos);
-    prev_fsync_nanos = IOSTATS(fsync_nanos);
-    prev_range_sync_nanos = IOSTATS(range_sync_nanos);
-    prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
-    prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
-    prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
-  }
-
-  MergeHelper merge(
-      env_, cfd->user_comparator(), cfd->ioptions().merge_operator.get(),
-      compaction_filter, db_options_.info_log.get(),
-      false /* internal key corruption is expected */,
-      existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
-      snapshot_checker_, compact_->compaction->level(), db_options_.stats);
+  return input;
+}
 
+void CompactionJob::CreateBlobFileBuilder(
+    SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+    std::unique_ptr<BlobFileBuilder>& blob_file_builder,
+    const WriteOptions& write_options) {
   const auto& mutable_cf_options =
       sub_compact->compaction->mutable_cf_options();
 
-  std::vector<std::string> blob_file_paths;
-
-  // TODO: BlobDB to support output_to_penultimate_level compaction, which needs
+  // TODO: BlobDB to support output_to_proximal_level compaction, which needs
   //  2 builders, so may need to move to `CompactionOutputs`
-  std::unique_ptr<BlobFileBuilder> blob_file_builder(
-      (mutable_cf_options.enable_blob_files &&
-       sub_compact->compaction->output_level() >=
-           mutable_cf_options.blob_file_starting_level)
-          ? new BlobFileBuilder(
-                versions_, fs_.get(),
-                &sub_compact->compaction->immutable_options(),
-                &mutable_cf_options, &file_options_, &write_options, db_id_,
-                db_session_id_, job_id_, cfd->GetID(), cfd->GetName(),
-                write_hint_, io_tracer_, blob_callback_,
-                BlobFileCreationReason::kCompaction, &blob_file_paths,
-                sub_compact->Current().GetBlobFileAdditionsPtr())
-          : nullptr);
+  if (mutable_cf_options.enable_blob_files &&
+      sub_compact->compaction->output_level() >=
+          mutable_cf_options.blob_file_starting_level) {
+    blob_file_builder = std::make_unique<BlobFileBuilder>(
+        versions_, fs_.get(), &sub_compact->compaction->immutable_options(),
+        &mutable_cf_options, &file_options_, &write_options, db_id_,
+        db_session_id_, job_id_, cfd->GetID(), cfd->GetName(), write_hint_,
+        io_tracer_, blob_callback_, BlobFileCreationReason::kCompaction,
+        sub_compact->Current().GetOutputFilePathsPtr(),
+        sub_compact->Current().GetBlobFileAdditionsPtr());
+  } else {
+    blob_file_builder = nullptr;
+  }
+}
 
-  TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
-  TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:1",
-                           static_cast<void*>(const_cast<std::atomic<bool>*>(
-                               &manual_compaction_canceled_)));
+std::unique_ptr<CompactionIterator> CompactionJob::CreateCompactionIterator(
+    SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+    InternalIterator* input, const CompactionFilter* compaction_filter,
+    MergeHelper& merge, std::unique_ptr<BlobFileBuilder>& blob_file_builder,
+    const WriteOptions& write_options) {
+  CreateBlobFileBuilder(sub_compact, cfd, blob_file_builder, write_options);
 
   const std::string* const full_history_ts_low =
       full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_;
-  const SequenceNumber job_snapshot_seq =
-      job_context_ ? job_context_->GetJobSnapshotSequence()
-                   : kMaxSequenceNumber;
+  assert(job_context_);
 
-  auto c_iter = std::make_unique<CompactionIterator>(
+  return std::make_unique<CompactionIterator>(
       input, cfd->user_comparator(), &merge, versions_->LastSequence(),
-      &existing_snapshots_, earliest_snapshot_,
-      earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_,
-      env_, ShouldReportDetailedTime(env_, stats_),
-      /*expect_valid_internal_key=*/true, sub_compact->RangeDelAgg(),
+      &(job_context_->snapshot_seqs), earliest_snapshot_,
+      job_context_->earliest_write_conflict_snapshot,
+      job_context_->GetJobSnapshotSequence(), job_context_->snapshot_checker,
+      env_, ShouldReportDetailedTime(env_, stats_), sub_compact->RangeDelAgg(),
       blob_file_builder.get(), db_options_.allow_data_in_errors,
       db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
       sub_compact->compaction
           ->DoesInputReferenceBlobFiles() /* must_count_input_entries */,
       sub_compact->compaction, compaction_filter, shutting_down_,
       db_options_.info_log, full_history_ts_low, preserve_seqno_after_);
-  c_iter->SeekToFirst();
-
-  const auto& c_iter_stats = c_iter->iter_stats();
+}
 
-  // define the open and close functions for the compaction files, which will be
-  // used open/close output files when needed.
+std::pair<CompactionFileOpenFunc, CompactionFileCloseFunc>
+CompactionJob::CreateFileHandlers(SubcompactionState* sub_compact,
+                                  SubcompactionKeyBoundaries& boundaries) {
   const CompactionFileOpenFunc open_file_func =
       [this, sub_compact](CompactionOutputs& outputs) {
         return this->OpenCompactionOutputFile(sub_compact, outputs);
       };
 
+  const Slice* start_user_key =
+      sub_compact->start.has_value() ? &boundaries.start_user_key : nullptr;
+  const Slice* end_user_key =
+      sub_compact->end.has_value() ? &boundaries.end_user_key : nullptr;
+
   const CompactionFileCloseFunc close_file_func =
       [this, sub_compact, start_user_key, end_user_key](
-          CompactionOutputs& outputs, const Status& status,
-          const Slice& next_table_min_key) {
+          const Status& status,
+          const ParsedInternalKey& prev_iter_output_internal_key,
+          const Slice& next_table_min_key, const CompactionIterator* c_iter,
+          CompactionOutputs& outputs) {
         return this->FinishCompactionOutputFile(
-            status, sub_compact, outputs, next_table_min_key,
-            sub_compact->start.has_value() ? &start_user_key : nullptr,
-            sub_compact->end.has_value() ? &end_user_key : nullptr);
+            status, prev_iter_output_internal_key, next_table_min_key,
+            start_user_key, end_user_key, c_iter, sub_compact, outputs);
       };
 
+  return {open_file_func, close_file_func};
+}
+
+Status CompactionJob::ProcessKeyValue(
+    SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+    CompactionIterator* c_iter, const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func, uint64_t& prev_cpu_micros) {
+  // Cron interval for periodic operations: stats update, abort check,
+  // and sync points. Uses 1024 (power of 2) for efficient bitwise check.
+  const uint64_t kCronEveryMask = (1 << 10) - 1;
+  [[maybe_unused]] const std::optional<const Slice> end = sub_compact->end;
+
+  // Check for abort signal before starting key processing
+  if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kCompactionAborted);
+  }
+
   Status status;
+  IterKey prev_iter_output_key;
+  ParsedInternalKey prev_iter_output_internal_key;
+
   TEST_SYNC_POINT_CALLBACK(
       "CompactionJob::ProcessKeyValueCompaction()::Processing",
       static_cast<void*>(const_cast<Compaction*>(sub_compact->compaction)));
-  uint64_t last_cpu_micros = prev_cpu_micros;
-  while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
-    // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
-    // returns true.
+
+  while (status.ok() && !cfd->IsDropped() && c_iter->Valid() &&
+         c_iter->status().ok()) {
     assert(!end.has_value() ||
            cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0);
 
-    if (c_iter_stats.num_input_records % kRecordStatsEvery ==
-        kRecordStatsEvery - 1) {
-      RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
-      c_iter->ResetRecordCounts();
-      RecordCompactionIOStats();
-
-      uint64_t cur_cpu_micros = db_options_.clock->CPUMicros();
-      assert(cur_cpu_micros >= last_cpu_micros);
-      RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME,
-                 cur_cpu_micros - last_cpu_micros);
-      last_cpu_micros = cur_cpu_micros;
+    const uint64_t num_records = c_iter->iter_stats().num_input_records;
+
+    // Periodic cron operations: stats update, abort check.
+    if ((num_records & kCronEveryMask) == kCronEveryMask) {
+      // Check for abort signal periodically
+      if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+        status = Status::Incomplete(Status::SubCode::kCompactionAborted);
+        break;
+      }
+
+      UpdateSubcompactionJobStatsIncrementally(
+          c_iter, &sub_compact->compaction_job_stats,
+          db_options_.clock->CPUMicros(), prev_cpu_micros);
     }
 
     const auto& ikey = c_iter->ikey();
-    bool use_penultimate_output = ikey.sequence > penultimate_after_seqno_;
+    bool use_proximal_output = ikey.sequence > proximal_after_seqno_;
+
 #ifndef NDEBUG
     if (sub_compact->compaction->SupportsPerKeyPlacement()) {
-      // Could be overridden by unittest
       PerKeyPlacementContext context(sub_compact->compaction->output_level(),
                                      ikey.user_key, c_iter->value(),
-                                     ikey.sequence, use_penultimate_output);
+                                     ikey.sequence, use_proximal_output);
       TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
                                &context);
-      if (use_penultimate_output) {
-        // Verify that entries sent to the penultimate level are within the
+      if (use_proximal_output) {
+        // Verify that entries sent to the proximal level are within the
         // allowed range (because the input key range of the last level could
-        // be larger than the allowed output key range of the penultimate
+        // be larger than the allowed output key range of the proximal
         // level). This check uses user keys (ignores sequence numbers) because
         // compaction boundaries are a "clean cut" between user keys (see
         // CompactionPicker::ExpandInputsToCleanCut()), which is especially
         // important when preferred sequence numbers has been swapped in for
         // kTypeValuePreferredSeqno / TimedPut.
-        sub_compact->compaction->TEST_AssertWithinPenultimateLevelOutputRange(
+        sub_compact->compaction->TEST_AssertWithinProximalLevelOutputRange(
             c_iter->user_key());
       }
     } else {
-      assert(penultimate_after_seqno_ == kMaxSequenceNumber);
-      assert(!use_penultimate_output);
+      assert(proximal_after_seqno_ == kMaxSequenceNumber);
+      assert(!use_proximal_output);
     }
 #endif  // NDEBUG
 
@@ -1431,8 +1646,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
     // and `close_file_func`.
     // TODO: it would be better to have the compaction file open/close moved
     // into `CompactionOutputs` which has the output file information.
-    status = sub_compact->AddToOutput(*c_iter, use_penultimate_output,
-                                      open_file_func, close_file_func);
+    status = sub_compact->AddToOutput(*c_iter, use_proximal_output,
+                                      open_file_func, close_file_func,
+                                      prev_iter_output_internal_key);
     if (!status.ok()) {
       break;
     }
@@ -1440,10 +1656,12 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
     TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:2",
                              static_cast<void*>(const_cast<std::atomic<bool>*>(
                                  &manual_compaction_canceled_)));
+
+    prev_iter_output_key.SetInternalKey(c_iter->key(),
+                                        &prev_iter_output_internal_key);
+    prev_iter_output_internal_key.sequence = ikey.sequence;
+    prev_iter_output_internal_key.type = ikey.type;
     c_iter->Next();
-    if (c_iter->status().IsManualCompactionPaused()) {
-      break;
-    }
 
 #ifndef NDEBUG
     bool stop = false;
@@ -1455,13 +1673,33 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
 #endif  // NDEBUG
   }
 
-  // This number may not be accurate when CompactionIterator was created
-  // with `must_count_input_entries=false`.
+  return status;
+}
+
+void CompactionJob::UpdateSubcompactionJobStatsIncrementally(
+    CompactionIterator* c_iter, CompactionJobStats* compaction_job_stats,
+    uint64_t cur_cpu_micros, uint64_t& prev_cpu_micros) {
+  RecordDroppedKeys(c_iter->iter_stats(), compaction_job_stats);
+  c_iter->ResetRecordCounts();
+  RecordCompactionIOStats();
+
+  assert(cur_cpu_micros >= prev_cpu_micros);
+  RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME,
+             cur_cpu_micros - prev_cpu_micros);
+  prev_cpu_micros = cur_cpu_micros;
+}
+
+void CompactionJob::FinalizeSubcompactionJobStats(
+    SubcompactionState* sub_compact, CompactionIterator* c_iter,
+    uint64_t start_cpu_micros, uint64_t prev_cpu_micros,
+    const CompactionIOStatsSnapshot& io_stats) {
+  const CompactionIterationStats& c_iter_stats = c_iter->iter_stats();
+
   assert(!sub_compact->compaction->DoesInputReferenceBlobFiles() ||
          c_iter->HasNumInputEntryScanned());
-  sub_compact->compaction_job_stats.has_num_input_records =
+  sub_compact->compaction_job_stats.has_accurate_num_input_records &=
       c_iter->HasNumInputEntryScanned();
-  sub_compact->compaction_job_stats.num_input_records =
+  sub_compact->compaction_job_stats.num_input_records +=
       c_iter->NumInputEntryScanned();
   sub_compact->compaction_job_stats.num_blobs_read =
       c_iter_stats.num_blobs_read;
@@ -1492,84 +1730,198 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
                c_iter_stats.total_blob_bytes_relocated);
   }
 
-  RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
-  RecordCompactionIOStats();
+  uint64_t cur_cpu_micros = db_options_.clock->CPUMicros();
+
+  // Record final compaction statistics including dropped keys, I/O stats,
+  // and CPU time delta from the last periodic measurement
+  UpdateSubcompactionJobStatsIncrementally(c_iter,
+                                           &sub_compact->compaction_job_stats,
+                                           cur_cpu_micros, prev_cpu_micros);
+
+  // Finalize timing and I/O statistics
+  sub_compact->compaction_job_stats.cpu_micros =
+      cur_cpu_micros - start_cpu_micros + sub_compact->GetWorkerCPUMicros();
+
+  if (measure_io_stats_) {
+    sub_compact->compaction_job_stats.file_write_nanos +=
+        IOSTATS(write_nanos) - io_stats.prev_write_nanos;
+    sub_compact->compaction_job_stats.file_fsync_nanos +=
+        IOSTATS(fsync_nanos) - io_stats.prev_fsync_nanos;
+    sub_compact->compaction_job_stats.file_range_sync_nanos +=
+        IOSTATS(range_sync_nanos) - io_stats.prev_range_sync_nanos;
+    sub_compact->compaction_job_stats.file_prepare_write_nanos +=
+        IOSTATS(prepare_write_nanos) - io_stats.prev_prepare_write_nanos;
+    sub_compact->compaction_job_stats.cpu_micros -=
+        (IOSTATS(cpu_write_nanos) - io_stats.prev_cpu_write_nanos +
+         IOSTATS(cpu_read_nanos) - io_stats.prev_cpu_read_nanos) /
+        1000;
+    if (io_stats.prev_perf_level !=
+        PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
+      SetPerfLevel(io_stats.prev_perf_level);
+    }
+  }
+}
 
+Status CompactionJob::FinalizeProcessKeyValueStatus(
+    ColumnFamilyData* cfd, InternalIterator* input_iter,
+    CompactionIterator* c_iter, Status status) {
   if (status.ok() && cfd->IsDropped()) {
     status =
         Status::ColumnFamilyDropped("Column family dropped during compaction");
   }
-  if ((status.ok() || status.IsColumnFamilyDropped()) &&
-      shutting_down_->load(std::memory_order_relaxed)) {
+  if (status.ok() && shutting_down_->load(std::memory_order_relaxed)) {
     status = Status::ShutdownInProgress("Database shutdown");
   }
-  if ((status.ok() || status.IsColumnFamilyDropped()) &&
+  if (status.ok() &&
       (manual_compaction_canceled_.load(std::memory_order_relaxed))) {
     status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
   }
   if (status.ok()) {
-    status = input->status();
+    status = input_iter->status();
   }
   if (status.ok()) {
     status = c_iter->status();
   }
 
+  return status;
+}
+
+Status CompactionJob::CleanupCompactionFiles(
+    SubcompactionState* sub_compact, Status status,
+    const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func) {
   // Call FinishCompactionOutputFile() even if status is not ok: it needs to
   // close the output files. Open file function is also passed, in case there's
   // only range-dels, no file was opened, to save the range-dels, it need to
   // create a new output file.
-  status = sub_compact->CloseCompactionFiles(status, open_file_func,
-                                             close_file_func);
+  return sub_compact->CloseCompactionFiles(status, open_file_func,
+                                           close_file_func);
+}
 
+Status CompactionJob::FinalizeBlobFiles(SubcompactionState* sub_compact,
+                                        BlobFileBuilder* blob_file_builder,
+                                        Status status) {
   if (blob_file_builder) {
     if (status.ok()) {
       status = blob_file_builder->Finish();
     } else {
       blob_file_builder->Abandon(status);
     }
-    blob_file_builder.reset();
     sub_compact->Current().UpdateBlobStats();
   }
 
-  uint64_t cur_cpu_micros = db_options_.clock->CPUMicros();
-  sub_compact->compaction_job_stats.cpu_micros =
-      cur_cpu_micros - prev_cpu_micros;
-  RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME,
-             cur_cpu_micros - last_cpu_micros);
+  return status;
+}
 
-  if (measure_io_stats_) {
-    sub_compact->compaction_job_stats.file_write_nanos +=
-        IOSTATS(write_nanos) - prev_write_nanos;
-    sub_compact->compaction_job_stats.file_fsync_nanos +=
-        IOSTATS(fsync_nanos) - prev_fsync_nanos;
-    sub_compact->compaction_job_stats.file_range_sync_nanos +=
-        IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
-    sub_compact->compaction_job_stats.file_prepare_write_nanos +=
-        IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
-    sub_compact->compaction_job_stats.cpu_micros -=
-        (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos +
-         IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) /
-        1000;
-    if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
-      SetPerfLevel(prev_perf_level);
-    }
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+  TEST_SYNC_POINT("CompactionJob::ProcessKeyValueCompaction:Start");
+  assert(sub_compact);
+  assert(sub_compact->compaction);
+
+  if (!ShouldUseLocalCompaction(sub_compact)) {
+    return;
   }
+
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
+
+  const uint64_t start_cpu_micros = db_options_.clock->CPUMicros();
+  uint64_t prev_cpu_micros = start_cpu_micros;
+  const CompactionIOStatsSnapshot io_stats = InitializeIOStats();
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+  const CompactionFilter* compaction_filter;
+  std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+  Status filter_status = SetupAndValidateCompactionFilter(
+      sub_compact, cfd->ioptions().compaction_filter, compaction_filter,
+      compaction_filter_from_factory);
+  if (!filter_status.ok()) {
+    sub_compact->status = filter_status;
+    return;
+  }
+
+  NotifyOnSubcompactionBegin(sub_compact);
+
+  SubcompactionKeyBoundaries boundaries(sub_compact->start, sub_compact->end);
+  SubcompactionInternalIterators iterators;
+  ReadOptions read_options;
+  const WriteOptions write_options(Env::IOPriority::IO_LOW,
+                                   Env::IOActivity::kCompaction);
+
+  InternalIterator* input_iter = CreateInputIterator(
+      sub_compact, cfd, iterators, boundaries, read_options);
+
+  assert(input_iter);
+
+  Status status =
+      MaybeResumeSubcompactionProgressOnInputIterator(sub_compact, input_iter);
+
+  if (status.IsNotFound()) {
+    input_iter->SeekToFirst();
+  } else if (!status.ok()) {
+    sub_compact->status = status;
+    return;
+  }
+
+  MergeHelper merge(
+      env_, cfd->user_comparator(), cfd->ioptions().merge_operator.get(),
+      compaction_filter, db_options_.info_log.get(),
+      false /* internal key corruption is expected */,
+      job_context_->GetLatestSnapshotSequence(), job_context_->snapshot_checker,
+      compact_->compaction->level(), db_options_.stats);
+  std::unique_ptr<BlobFileBuilder> blob_file_builder;
+
+  auto c_iter =
+      CreateCompactionIterator(sub_compact, cfd, input_iter, compaction_filter,
+                               merge, blob_file_builder, write_options);
+  assert(c_iter);
+  c_iter->SeekToFirst();
+
+  TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:1",
+                           static_cast<void*>(const_cast<std::atomic<bool>*>(
+                               &manual_compaction_canceled_)));
+
+  auto [open_file_func, close_file_func] =
+      CreateFileHandlers(sub_compact, boundaries);
+
+  status = ProcessKeyValue(sub_compact, cfd, c_iter.get(), open_file_func,
+                           close_file_func, prev_cpu_micros);
+
+  status = FinalizeProcessKeyValueStatus(cfd, input_iter, c_iter.get(), status);
+
+  FinalizeSubcompaction(sub_compact, status, open_file_func, close_file_func,
+                        blob_file_builder.get(), c_iter.get(), input_iter,
+                        start_cpu_micros, prev_cpu_micros, io_stats);
+
+  NotifyOnSubcompactionCompleted(sub_compact);
+}
+
+void CompactionJob::FinalizeSubcompaction(
+    SubcompactionState* sub_compact, Status status,
+    const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func,
+    BlobFileBuilder* blob_file_builder, CompactionIterator* c_iter,
+    [[maybe_unused]] InternalIterator* input_iter, uint64_t start_cpu_micros,
+    uint64_t prev_cpu_micros, const CompactionIOStatsSnapshot& io_stats) {
+  status = CleanupCompactionFiles(sub_compact, status, open_file_func,
+                                  close_file_func);
+  status = FinalizeBlobFiles(sub_compact, blob_file_builder, status);
+
+  FinalizeSubcompactionJobStats(sub_compact, c_iter, start_cpu_micros,
+                                prev_cpu_micros, io_stats);
+
 #ifdef ROCKSDB_ASSERT_STATUS_CHECKED
   if (!status.ok()) {
     if (c_iter) {
       c_iter->status().PermitUncheckedError();
     }
-    if (input) {
-      input->status().PermitUncheckedError();
+    if (input_iter) {
+      input_iter->status().PermitUncheckedError();
     }
   }
 #endif  // ROCKSDB_ASSERT_STATUS_CHECKED
 
-  blob_counter.reset();
-  clip.reset();
-  raw_input.reset();
   sub_compact->status = status;
-  NotifyOnSubcompactionCompleted(sub_compact);
 }
 
 uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const {
@@ -1614,9 +1966,11 @@ void CompactionJob::RecordDroppedKeys(
 }
 
 Status CompactionJob::FinishCompactionOutputFile(
-    const Status& input_status, SubcompactionState* sub_compact,
-    CompactionOutputs& outputs, const Slice& next_table_min_key,
-    const Slice* comp_start_user_key, const Slice* comp_end_user_key) {
+    const Status& input_status,
+    const ParsedInternalKey& prev_iter_output_internal_key,
+    const Slice& next_table_min_key, const Slice* comp_start_user_key,
+    const Slice* comp_end_user_key, const CompactionIterator* c_iter,
+    SubcompactionState* sub_compact, CompactionOutputs& outputs) {
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
   assert(sub_compact != nullptr);
@@ -1634,24 +1988,20 @@ Status CompactionJob::FinishCompactionOutputFile(
   Status s = input_status;
 
   // Add range tombstones
-  auto earliest_snapshot = kMaxSequenceNumber;
-  if (existing_snapshots_.size() > 0) {
-    earliest_snapshot = existing_snapshots_[0];
-  }
   if (s.ok()) {
     // Inclusive lower bound, exclusive upper bound
     std::pair<SequenceNumber, SequenceNumber> keep_seqno_range{
         0, kMaxSequenceNumber};
     if (sub_compact->compaction->SupportsPerKeyPlacement()) {
-      if (outputs.IsPenultimateLevel()) {
-        keep_seqno_range.first = penultimate_after_seqno_;
+      if (outputs.IsProximalLevel()) {
+        keep_seqno_range.first = proximal_after_seqno_;
       } else {
-        keep_seqno_range.second = penultimate_after_seqno_;
+        keep_seqno_range.second = proximal_after_seqno_;
       }
     }
     CompactionIterationStats range_del_out_stats;
     // NOTE1: Use `bottommost_level_ = true` for both bottommost and
-    // output_to_penultimate_level compaction here, as it's only used to decide
+    // output_to_proximal_level compaction here, as it's only used to decide
     // if range dels could be dropped. (Logically, we are taking a single sorted
     // run returned from CompactionIterator and physically splitting it between
     // two output levels.)
@@ -1663,7 +2013,7 @@ Status CompactionJob::FinishCompactionOutputFile(
       s = outputs.AddRangeDels(*sub_compact->RangeDelAgg(), comp_start_user_key,
                                comp_end_user_key, range_del_out_stats,
                                bottommost_level_, cfd->internal_comparator(),
-                               earliest_snapshot, keep_seqno_range,
+                               earliest_snapshot_, keep_seqno_range,
                                next_table_min_key, full_history_ts_low_);
     }
     RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
@@ -1720,14 +2070,11 @@ Status CompactionJob::FinishCompactionOutputFile(
   if (s.ok()) {
     tp = outputs.GetTableProperties();
   }
-
   if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
     // If there is nothing to output, no necessary to generate a sst file.
     // This happens when the output level is bottom level, at the same time
     // the sub_compact output nothing.
-    std::string fname =
-        TableFileName(sub_compact->compaction->immutable_options().cf_paths,
-                      meta->fd.GetNumber(), meta->fd.GetPathId());
+    std::string fname = GetTableFileName(meta->fd.GetNumber());
 
     // TODO(AR) it is not clear if there are any larger implications if
     // DeleteFile fails here
@@ -1797,10 +2144,99 @@ Status CompactionJob::FinishCompactionOutputFile(
     }
   }
 
+  if (s.ok() && ShouldUpdateSubcompactionProgress(sub_compact, c_iter,
+                                                  prev_iter_output_internal_key,
+                                                  next_table_min_key, meta)) {
+    UpdateSubcompactionProgress(c_iter, next_table_min_key, sub_compact);
+    s = PersistSubcompactionProgress(sub_compact);
+  }
   outputs.ResetBuilder();
   return s;
 }
 
+bool CompactionJob::ShouldUpdateSubcompactionProgress(
+    const SubcompactionState* sub_compact, const CompactionIterator* c_iter,
+    const ParsedInternalKey& prev_iter_output_internal_key,
+    const Slice& next_table_min_internal_key, const FileMetaData* meta) const {
+  const auto* cfd = sub_compact->compaction->column_family_data();
+  // No need to update when the progress will not get persisted
+  if (compaction_progress_writer_ == nullptr) {
+    return false;
+  }
+
+  // No need to update for a new empty output
+  if (meta == nullptr) {
+    return false;
+  }
+
+  // TODO(hx235): save progress even on the last output file
+  if (next_table_min_internal_key.empty()) {
+    return false;
+  }
+
+  // LIMITATION: Persisting compaction progress with timestamp
+  // is not supported since the feature of persisting timestamp of the key in
+  // SST files itself is still experimental
+  size_t ts_sz = cfd->user_comparator()->timestamp_size();
+  if (ts_sz > 0) {
+    return false;
+  }
+
+  // LIMITATION: Compaction progress persistence disabled for file boundaries
+  // containing range deletions. Range deletions can span file boundaries,
+  // making it difficult to ensure adjacent output tables have different user
+  // keys. See the last check for why different users keys of adjacent output
+  // tables are needed
+  const ValueType next_table_min_internal_key_type =
+      ExtractValueType(next_table_min_internal_key);
+  const ValueType prev_iter_output_internal_key_type =
+      prev_iter_output_internal_key.user_key.empty()
+          ? ValueType::kTypeValue
+          : prev_iter_output_internal_key.type;
+
+  // Range deletes truncated to align with file boundaries may be output by the
+  // compaction iterator with `ValueType::kTypeMaxValid` instead of the original
+  // type.
+  if ((next_table_min_internal_key_type == ValueType::kTypeRangeDeletion ||
+       next_table_min_internal_key_type == ValueType::kTypeMaxValid) ||
+      (prev_iter_output_internal_key_type == ValueType::kTypeRangeDeletion ||
+       prev_iter_output_internal_key_type == ValueType::kTypeMaxValid)) {
+    return false;
+  }
+
+  // LIMITATION: Compaction progress persistence disabled when adjacent output
+  // tables share the same user key at boundaries. This ensures a simple Seek()
+  // of the next key when resuming can process all versions of a user key
+  const Slice next_table_min_user_key =
+      ExtractUserKey(next_table_min_internal_key);
+  const Slice prev_table_last_user_key =
+      prev_iter_output_internal_key.user_key.empty()
+          ? Slice()
+          : prev_iter_output_internal_key.user_key;
+
+  if (cfd->user_comparator()->EqualWithoutTimestamp(next_table_min_user_key,
+                                                    prev_table_last_user_key)) {
+    return false;
+  }
+
+  // LIMITATION: Don't save progress if the current key has already been scanned
+  // (looked ahead) in the input but not yet output. This can happen with merge
+  // operations, single deletes, and deletes at the bottommost level where
+  // CompactionIterator needs to look ahead to process multiple entries for the
+  // same user key before outputting a result. If we saved progress and resumed
+  // at this boundary, the resumed session would see and process the same input
+  // key again through Seek(), leading to incorrect double-counting in
+  // number of processed input entries and input count verification failure
+  //
+  // TODO(hx235): Offset num_processed_input_records to avoid double counting
+  // instead of disabling progress persistence.
+  if (c_iter->IsCurrentKeyAlreadyScanned()) {
+    return false;
+  }
+
+  return true;
+}
+
 Status CompactionJob::InstallCompactionResults(bool* compaction_released) {
   assert(compact_);
 
@@ -1814,22 +2250,22 @@ Status CompactionJob::InstallCompactionResults(bool* compaction_released) {
 
   {
     Compaction::InputLevelSummaryBuffer inputs_summary;
-    if (compaction_stats_.has_penultimate_level_output) {
+    if (internal_stats_.has_proximal_level_output) {
       ROCKS_LOG_BUFFER(
           log_buffer_,
-          "[%s] [JOB %d] Compacted %s => output_to_penultimate_level: %" PRIu64
+          "[%s] [JOB %d] Compacted %s => output_to_proximal_level: %" PRIu64
           " bytes + last: %" PRIu64 " bytes. Total: %" PRIu64 " bytes",
           compaction->column_family_data()->GetName().c_str(), job_id_,
           compaction->InputLevelSummary(&inputs_summary),
-          compaction_stats_.penultimate_level_stats.bytes_written,
-          compaction_stats_.stats.bytes_written,
-          compaction_stats_.TotalBytesWritten());
+          internal_stats_.proximal_level_stats.bytes_written,
+          internal_stats_.output_level_stats.bytes_written,
+          internal_stats_.TotalBytesWritten());
     } else {
       ROCKS_LOG_BUFFER(log_buffer_,
                        "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
                        compaction->column_family_data()->GetName().c_str(),
                        job_id_, compaction->InputLevelSummary(&inputs_summary),
-                       compaction_stats_.TotalBytesWritten());
+                       internal_stats_.TotalBytesWritten());
     }
   }
 
@@ -1926,6 +2362,10 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
 
   // no need to lock because VersionSet::next_file_number_ is atomic
   uint64_t file_number = versions_->NewFileNumber();
+#ifndef NDEBUG
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionJob::OpenCompactionOutputFile::NewFileNumber", &file_number);
+#endif
   std::string fname = GetTableFileName(file_number);
   // Fire events.
   ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
@@ -1942,21 +2382,18 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
 
   // Pass temperature of the last level files to FileSystem.
   FileOptions fo_copy = file_options_;
-  Temperature temperature = sub_compact->compaction->output_temperature();
-  Temperature last_level_temp =
-      sub_compact->compaction->mutable_cf_options().last_level_temperature;
-  // Here last_level_temperature supersedes default_write_temperature, when
-  // enabled and applicable
-  if (last_level_temp != Temperature::kUnknown &&
-      sub_compact->compaction->is_last_level() &&
-      !outputs.IsPenultimateLevel()) {
-    temperature = last_level_temp;
-  }
+  auto temperature =
+      sub_compact->compaction->GetOutputTemperature(outputs.IsProximalLevel());
   fo_copy.temperature = temperature;
+  fo_copy.write_hint = write_hint_;
 
   Status s;
   IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy);
   s = io_s;
+  if (io_s.ok()) {
+    // Track the SST file path for cleanup on abort.
+    outputs.AddOutputFilePath(fname);
+  }
   if (sub_compact->io_status.ok()) {
     sub_compact->io_status = io_s;
     // Since this error is really a copy of the io_s that is checked below as s,
@@ -2038,7 +2475,9 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
   }
 
   writable_file->SetIOPriority(GetRateLimiterPriority());
-  writable_file->SetWriteLifeTimeHint(write_hint_);
+  // Subsequent attempts to override the hint via SetWriteLifeTimeHint
+  // with the very same value will be ignored by the fs.
+  writable_file->SetWriteLifeTimeHint(fo_copy.write_hint);
   FileTypeSet tmp_set = db_options_.checksum_handoff_file_types;
   writable_file->SetPreallocationBlockSize(static_cast<size_t>(
       sub_compact->compaction->OutputFilePreallocationSize()));
@@ -2063,7 +2502,7 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
       bottommost_level_, TableFileCreationReason::kCompaction,
       0 /* oldest_key_time */, current_time, db_id_, db_session_id_,
       sub_compact->compaction->max_output_file_size(), file_number,
-      penultimate_after_seqno_ /*last_level_inclusive_max_seqno_threshold*/);
+      proximal_after_seqno_ /*last_level_inclusive_max_seqno_threshold*/);
 
   outputs.NewBuilder(tboptions);
 
@@ -2087,16 +2526,43 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
 }
 }  // namespace
 
-bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
+bool CompactionJob::UpdateInternalStatsFromInputFiles(
+    uint64_t* num_input_range_del) {
   assert(compact_);
 
   Compaction* compaction = compact_->compaction;
-  compaction_stats_.stats.num_input_files_in_non_output_levels = 0;
-  compaction_stats_.stats.num_input_files_in_output_level = 0;
+  internal_stats_.output_level_stats.num_input_files_in_non_output_levels = 0;
+  internal_stats_.output_level_stats.num_input_files_in_output_level = 0;
 
   bool has_error = false;
   const ReadOptions read_options(Env::IOActivity::kCompaction);
   const auto& input_table_properties = compaction->GetInputTableProperties();
+
+  // Check all input files for old block-based SST format_version. Why? Old
+  // block-based SST files from roughly version 5.0 to 5.18 could produce
+  // inaccurate num_entries counts due to the evolution of its handling along
+  // with num_range_deletions. We have to disable some paranoid checks when
+  // compacting files from such an old release. However, we don't have great
+  // information to identify those files, so we heuristically over-approximate
+  // that set of files using
+  // (a) format_version < 5, which will be true for any files from RocksDB <
+  // 6.6.0 and should not be true for any recent production files
+  // (b) to avoid including non-block-based SST files (which still use older
+  // format_version markers, and do not support DeleteRange), we also require
+  // the presence of the user property "rocksdb.block.based.table.index.type",
+  // which was added in RocksDB 2.8 and is always present in block-based tables.
+  for (const auto& tp_pair : input_table_properties) {
+    if (tp_pair.second && tp_pair.second->format_version < 5) {
+      // Check for block-based table by looking for its index type property
+      const auto& user_props = tp_pair.second->user_collected_properties;
+      if (user_props.find(BlockBasedTablePropertyNames::kIndexType) !=
+          user_props.end()) {
+        job_stats_->has_accurate_num_input_records = false;
+        break;
+      }
+    }
+  }
+
   for (int input_level = 0;
        input_level < static_cast<int>(compaction->num_input_levels());
        ++input_level) {
@@ -2104,13 +2570,14 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
     size_t num_input_files = flevel->num_files;
     uint64_t* bytes_read;
     if (compaction->level(input_level) != compaction->output_level()) {
-      compaction_stats_.stats.num_input_files_in_non_output_levels +=
+      internal_stats_.output_level_stats.num_input_files_in_non_output_levels +=
           static_cast<int>(num_input_files);
-      bytes_read = &compaction_stats_.stats.bytes_read_non_output_levels;
+      bytes_read =
+          &internal_stats_.output_level_stats.bytes_read_non_output_levels;
     } else {
-      compaction_stats_.stats.num_input_files_in_output_level +=
+      internal_stats_.output_level_stats.num_input_files_in_output_level +=
           static_cast<int>(num_input_files);
-      bytes_read = &compaction_stats_.stats.bytes_read_output_level;
+      bytes_read = &internal_stats_.output_level_stats.bytes_read_output_level;
     }
     for (size_t i = 0; i < num_input_files; ++i) {
       const FileMetaData* file_meta = flevel->files[i].file_metadata;
@@ -2130,7 +2597,8 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
           has_error = true;
         }
       }
-      compaction_stats_.stats.num_input_records += file_input_entries;
+      internal_stats_.output_level_stats.num_input_records +=
+          file_input_entries;
       if (num_input_range_del) {
         *num_input_range_del += file_num_range_del;
       }
@@ -2141,62 +2609,123 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
     size_t num_filtered_input_files = filtered_flevel.size();
     uint64_t* bytes_skipped;
     if (compaction->level(input_level) != compaction->output_level()) {
-      compaction_stats_.stats.num_filtered_input_files_in_non_output_levels +=
+      internal_stats_.output_level_stats
+          .num_filtered_input_files_in_non_output_levels +=
           static_cast<int>(num_filtered_input_files);
-      bytes_skipped = &compaction_stats_.stats.bytes_skipped_non_output_levels;
+      bytes_skipped =
+          &internal_stats_.output_level_stats.bytes_skipped_non_output_levels;
     } else {
-      compaction_stats_.stats.num_filtered_input_files_in_output_level +=
+      internal_stats_.output_level_stats
+          .num_filtered_input_files_in_output_level +=
           static_cast<int>(num_filtered_input_files);
-      bytes_skipped = &compaction_stats_.stats.bytes_skipped_output_level;
+      bytes_skipped =
+          &internal_stats_.output_level_stats.bytes_skipped_output_level;
     }
     for (const FileMetaData* filtered_file_meta : filtered_flevel) {
       *bytes_skipped += filtered_file_meta->fd.GetFileSize();
     }
   }
 
-  assert(compaction_job_stats_);
-  compaction_stats_.stats.bytes_read_blob =
-      compaction_job_stats_->total_blob_bytes_read;
-
-  compaction_stats_.stats.num_dropped_records =
-      compaction_stats_.DroppedRecords();
+  // TODO - find a better place to set these two
+  assert(job_stats_);
+  internal_stats_.output_level_stats.bytes_read_blob =
+      job_stats_->total_blob_bytes_read;
+  internal_stats_.output_level_stats.num_dropped_records =
+      internal_stats_.DroppedRecords();
   return !has_error;
 }
 
-void CompactionJob::UpdateCompactionJobStats(
-    const InternalStats::CompactionStats& stats) const {
-  compaction_job_stats_->elapsed_micros = stats.micros;
-
+void CompactionJob::UpdateCompactionJobInputStatsFromInternalStats(
+    const InternalStats::CompactionStatsFull& internal_stats,
+    uint64_t num_input_range_del) const {
+  assert(job_stats_);
   // input information
-  compaction_job_stats_->total_input_bytes =
-      stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
-  compaction_job_stats_->num_input_records = stats.num_input_records;
-  compaction_job_stats_->num_input_files =
-      stats.num_input_files_in_non_output_levels +
-      stats.num_input_files_in_output_level;
-  compaction_job_stats_->num_input_files_at_output_level =
-      stats.num_input_files_in_output_level;
-  compaction_job_stats_->num_filtered_input_files =
-      stats.num_filtered_input_files_in_non_output_levels +
-      stats.num_filtered_input_files_in_output_level;
-  compaction_job_stats_->num_filtered_input_files_at_output_level =
-      stats.num_filtered_input_files_in_output_level;
-  compaction_job_stats_->total_skipped_input_bytes =
-      stats.bytes_skipped_non_output_levels + stats.bytes_skipped_output_level;
+  job_stats_->total_input_bytes =
+      internal_stats.output_level_stats.bytes_read_non_output_levels +
+      internal_stats.output_level_stats.bytes_read_output_level;
+  job_stats_->num_input_records =
+      internal_stats.output_level_stats.num_input_records - num_input_range_del;
+  job_stats_->num_input_files =
+      internal_stats.output_level_stats.num_input_files_in_non_output_levels +
+      internal_stats.output_level_stats.num_input_files_in_output_level;
+  job_stats_->num_input_files_at_output_level =
+      internal_stats.output_level_stats.num_input_files_in_output_level;
+  job_stats_->num_filtered_input_files =
+      internal_stats.output_level_stats
+          .num_filtered_input_files_in_non_output_levels +
+      internal_stats.output_level_stats
+          .num_filtered_input_files_in_output_level;
+  job_stats_->num_filtered_input_files_at_output_level =
+      internal_stats.output_level_stats
+          .num_filtered_input_files_in_output_level;
+  job_stats_->total_skipped_input_bytes =
+      internal_stats.output_level_stats.bytes_skipped_non_output_levels +
+      internal_stats.output_level_stats.bytes_skipped_output_level;
+
+  if (internal_stats.has_proximal_level_output) {
+    job_stats_->total_input_bytes +=
+        internal_stats.proximal_level_stats.bytes_read_non_output_levels +
+        internal_stats.proximal_level_stats.bytes_read_output_level;
+    job_stats_->num_input_records +=
+        internal_stats.proximal_level_stats.num_input_records;
+    job_stats_->num_input_files +=
+        internal_stats.proximal_level_stats
+            .num_input_files_in_non_output_levels +
+        internal_stats.proximal_level_stats.num_input_files_in_output_level;
+    job_stats_->num_input_files_at_output_level +=
+        internal_stats.proximal_level_stats.num_input_files_in_output_level;
+    job_stats_->num_filtered_input_files +=
+        internal_stats.proximal_level_stats
+            .num_filtered_input_files_in_non_output_levels +
+        internal_stats.proximal_level_stats
+            .num_filtered_input_files_in_output_level;
+    job_stats_->num_filtered_input_files_at_output_level +=
+        internal_stats.proximal_level_stats
+            .num_filtered_input_files_in_output_level;
+    job_stats_->total_skipped_input_bytes +=
+        internal_stats.proximal_level_stats.bytes_skipped_non_output_levels +
+        internal_stats.proximal_level_stats.bytes_skipped_output_level;
+  }
+}
 
-  // output information
-  compaction_job_stats_->total_output_bytes = stats.bytes_written;
-  compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
-  compaction_job_stats_->num_output_records = stats.num_output_records;
-  compaction_job_stats_->num_output_files = stats.num_output_files;
-  compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
+void CompactionJob::UpdateCompactionJobOutputStatsFromInternalStats(
+    const Status& status,
+    const InternalStats::CompactionStatsFull& internal_stats) const {
+  assert(job_stats_);
+  job_stats_->elapsed_micros = internal_stats.output_level_stats.micros;
+  job_stats_->cpu_micros = internal_stats.output_level_stats.cpu_micros;
 
-  if (stats.num_output_files > 0) {
+  // output information
+  job_stats_->total_output_bytes =
+      internal_stats.output_level_stats.bytes_written;
+  job_stats_->total_output_bytes_blob =
+      internal_stats.output_level_stats.bytes_written_blob;
+  job_stats_->num_output_records =
+      internal_stats.output_level_stats.num_output_records;
+  job_stats_->num_output_files =
+      internal_stats.output_level_stats.num_output_files;
+  job_stats_->num_output_files_blob =
+      internal_stats.output_level_stats.num_output_files_blob;
+
+  if (internal_stats.has_proximal_level_output) {
+    job_stats_->total_output_bytes +=
+        internal_stats.proximal_level_stats.bytes_written;
+    job_stats_->total_output_bytes_blob +=
+        internal_stats.proximal_level_stats.bytes_written_blob;
+    job_stats_->num_output_records +=
+        internal_stats.proximal_level_stats.num_output_records;
+    job_stats_->num_output_files +=
+        internal_stats.proximal_level_stats.num_output_files;
+    job_stats_->num_output_files_blob +=
+        internal_stats.proximal_level_stats.num_output_files_blob;
+  }
+
+  if (status.ok() && job_stats_->num_output_files > 0) {
     CopyPrefix(compact_->SmallestUserKey(),
                CompactionJobStats::kMaxPrefixLength,
-               &compaction_job_stats_->smallest_output_key_prefix);
+               &job_stats_->smallest_output_key_prefix);
     CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength,
-               &compaction_job_stats_->largest_output_key_prefix);
+               &job_stats_->largest_output_key_prefix);
   }
 }
 
@@ -2217,8 +2746,8 @@ void CompactionJob::LogCompaction() {
                    cfd->GetName().c_str(), scratch);
     // build event logger report
     auto stream = event_logger_->Log();
-    stream << "job" << job_id_ << "event" << "compaction_started"
-           << "compaction_reason"
+    stream << "job" << job_id_ << "event" << "compaction_started" << "cf_name"
+           << cfd->GetName() << "compaction_reason"
            << GetCompactionReasonString(compaction->compaction_reason());
     for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
       stream << ("files_L" + std::to_string(compaction->level(i)));
@@ -2230,23 +2759,24 @@ void CompactionJob::LogCompaction() {
     }
     stream << "score" << compaction->score() << "input_data_size"
            << compaction->CalculateTotalInputSize() << "oldest_snapshot_seqno"
-           << (existing_snapshots_.empty()
+           << (job_context_->snapshot_seqs.empty()
                    ? int64_t{-1}  // Use -1 for "none"
-                   : static_cast<int64_t>(existing_snapshots_[0]));
+                   : static_cast<int64_t>(
+                         job_context_->GetEarliestSnapshotSequence()));
     if (compaction->SupportsPerKeyPlacement()) {
-      stream << "prenultimate_after_seqno" << penultimate_after_seqno_;
+      stream << "proximal_after_seqno" << proximal_after_seqno_;
       stream << "preserve_seqno_after" << preserve_seqno_after_;
-      stream << "penultimate_output_level" << compaction->GetPenultimateLevel();
-      stream << "penultimate_output_range"
-             << GetCompactionPenultimateOutputRangeTypeString(
-                    compaction->GetPenultimateOutputRangeType());
+      stream << "proximal_output_level" << compaction->GetProximalLevel();
+      stream << "proximal_output_range"
+             << GetCompactionProximalOutputRangeTypeString(
+                    compaction->GetProximalOutputRangeType());
 
-      if (compaction->GetPenultimateOutputRangeType() ==
-          Compaction::PenultimateOutputRangeType::kDisabled) {
+      if (compaction->GetProximalOutputRangeType() ==
+          Compaction::ProximalOutputRangeType::kDisabled) {
         ROCKS_LOG_WARN(
             db_options_.info_log,
-            "[%s] [JOB %d] Penultimate level output is disabled, likely "
-            "because of the range conflict in the penultimate level",
+            "[%s] [JOB %d] Proximal level output is disabled, likely "
+            "because of the range conflict in the proximal level",
             cfd->GetName().c_str(), job_id_);
       }
     }
@@ -2271,4 +2801,409 @@ Env::IOPriority CompactionJob::GetRateLimiterPriority() {
   return Env::IO_LOW;
 }
 
+Status CompactionJob::ReadTablePropertiesDirectly(
+    const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
+    const FileMetaData* file_meta, const ReadOptions& read_options,
+    std::shared_ptr<const TableProperties>* tp) {
+  std::unique_ptr<FSRandomAccessFile> file;
+  std::string file_name = GetTableFileName(file_meta->fd.GetNumber());
+  FileOptions fopts = file_options_;
+  fopts.file_checksum = file_meta->file_checksum;
+  fopts.file_checksum_func_name = file_meta->file_checksum_func_name;
+  Status s = ioptions.fs->NewRandomAccessFile(file_name, fopts, &file,
+                                              nullptr /* dbg */);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(
+          std::move(file), file_name, ioptions.clock, io_tracer_,
+          ioptions.stats, Histograms::SST_READ_MICROS /* hist_type */,
+          nullptr /* file_read_hist */, ioptions.rate_limiter.get(),
+          ioptions.listeners));
+
+  std::unique_ptr<TableProperties> props;
+
+  uint64_t magic_number = kBlockBasedTableMagicNumber;
+
+  const auto* table_factory = moptions.table_factory.get();
+  if (table_factory == nullptr) {
+    return Status::Incomplete("Table factory is not set");
+  } else {
+    const auto& table_factory_name = table_factory->Name();
+    if (table_factory_name == TableFactory::kPlainTableName()) {
+      magic_number = kPlainTableMagicNumber;
+    } else if (table_factory_name == TableFactory::kCuckooTableName()) {
+      magic_number = kCuckooTableMagicNumber;
+    }
+  }
+
+  s = ReadTableProperties(file_reader.get(), file_meta->fd.GetFileSize(),
+                          magic_number, ioptions, read_options, &props);
+  if (!s.ok()) {
+    return s;
+  }
+
+  *tp = std::move(props);
+  return s;
+}
+
+Status CompactionJob::ReadOutputFilesTableProperties(
+    const autovector<FileMetaData>& output_files,
+    const ReadOptions& read_options,
+    std::vector<std::shared_ptr<const TableProperties>>&
+        output_files_table_properties,
+    bool is_proximal_level) {
+  assert(!output_files.empty());
+
+  static const char* level_type =
+      is_proximal_level ? "proximal output" : "output";
+
+  output_files_table_properties.reserve(output_files.size());
+
+  Status s;
+
+  for (const FileMetaData& metadata : output_files) {
+    std::shared_ptr<const TableProperties> tp;
+    s = ReadTablePropertiesDirectly(compact_->compaction->immutable_options(),
+                                    compact_->compaction->mutable_cf_options(),
+                                    &metadata, read_options, &tp);
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(
+          db_options_.info_log,
+          "Failed to read table properties for %s level output file #%" PRIu64
+          ": %s",
+          level_type, metadata.fd.GetNumber(), s.ToString().c_str());
+      return s;
+    }
+
+    if (tp == nullptr) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "Empty table property for %s level output file #%" PRIu64
+                      "",
+                      level_type, metadata.fd.GetNumber());
+
+      s = Status::Corruption("Empty table property for " +
+                             std::string(level_type) +
+                             " level output files during resuming");
+      return s;
+    }
+    output_files_table_properties.push_back(tp);
+  }
+  return s;
+}
+
+void CompactionJob::RestoreCompactionOutputs(
+    const ColumnFamilyData* cfd,
+    const std::vector<std::shared_ptr<const TableProperties>>&
+        output_files_table_properties,
+    SubcompactionProgressPerLevel& subcompaction_progress_per_level,
+    CompactionOutputs* outputs_to_restore) {
+  assert(outputs_to_restore->GetOutputs().size() == 0);
+
+  const auto& output_files = subcompaction_progress_per_level.GetOutputFiles();
+
+  for (size_t i = 0; i < output_files.size(); i++) {
+    FileMetaData file_copy = output_files[i];
+
+    outputs_to_restore->AddOutput(std::move(file_copy),
+                                  cfd->internal_comparator(),
+                                  paranoid_file_checks_, true /* finished */);
+
+    outputs_to_restore->UpdateTableProperties(
+        *output_files_table_properties[i]);
+  }
+
+  outputs_to_restore->SetNumOutputRecords(
+      subcompaction_progress_per_level.GetNumProcessedOutputRecords());
+}
+
+// Attempt to resume compaction from a previously persisted compaction progress.
+//
+// RETURNS:
+// - Status::OK():
+// * Input iterator positioned at next unprocessed key
+// * CompactionOutputs objects fully restored for both output and proximal
+// output levels in SubcompactionState
+// * Compaction job statistics accurately reflect input and output records
+// processed for record count verification
+// * File number generation advanced to prevent conflicts with existing outputs
+// - Status::NotFound(): No valid progress to resume from
+// - Status::Corruption(): Resume key is invalid, beyond input range, or output
+// restoration failed
+Status CompactionJob::MaybeResumeSubcompactionProgressOnInputIterator(
+    SubcompactionState* sub_compact, InternalIterator* input_iter) {
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+  SubcompactionProgress& subcompaction_progress =
+      sub_compact->GetSubcompactionProgressRef();
+
+  if (subcompaction_progress.output_level_progress
+              .GetNumProcessedOutputRecords() == 0 &&
+      subcompaction_progress.proximal_output_level_progress
+              .GetNumProcessedOutputRecords() == 0) {
+    return Status::NotFound("No subcompaction progress to resume");
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Resuming compaction : %s",
+                 cfd->GetName().c_str(), job_id_,
+                 subcompaction_progress.ToString().c_str());
+
+  input_iter->Seek(subcompaction_progress.next_internal_key_to_compact);
+
+  if (!input_iter->Valid()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "[%s] [JOB %d] Iterator is invalid after "
+                    "seeking to the key to resume. This indicates the key is "
+                    "incorrectly beyond the input data range.",
+                    cfd->GetName().c_str(), job_id_);
+    return Status::Corruption(
+        "The key to resume is beyond the input data range");
+  } else if (!input_iter->status().ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "[%s] [JOB %d] Iterator has error after seeking to "
+                    "the key to resume: %s",
+                    cfd->GetName().c_str(), job_id_,
+                    input_iter->status().ToString().c_str());
+    return Status::Corruption(
+        "Iterator has error status after seeking to the key: " +
+        input_iter->status().ToString());
+  }
+
+  sub_compact->compaction_job_stats.has_accurate_num_input_records =
+      subcompaction_progress.num_processed_input_records != 0;
+
+  sub_compact->compaction_job_stats.num_input_records =
+      subcompaction_progress.num_processed_input_records;
+
+  for (const bool& is_proximal_level : {false, true}) {
+    if (is_proximal_level &&
+        !sub_compact->compaction->SupportsPerKeyPlacement()) {
+      continue;
+    }
+
+    Status s;
+    SubcompactionProgressPerLevel& subcompaction_progress_per_level =
+        is_proximal_level
+            ? subcompaction_progress.proximal_output_level_progress
+            : subcompaction_progress.output_level_progress;
+
+    const auto& output_files =
+        subcompaction_progress_per_level.GetOutputFiles();
+
+    std::vector<std::shared_ptr<const TableProperties>>
+        output_files_table_properties;
+
+    // TODO(hx235): investigate if we can skip reading properties to save read
+    // IO
+    s = ReadOutputFilesTableProperties(output_files, read_options,
+                                       output_files_table_properties);
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(
+          db_options_.info_log,
+          "[%s] [JOB %d] Failed to read table properties for %s output level"
+          "files "
+          "during resume: %s.",
+          cfd->GetName().c_str(), job_id_, is_proximal_level ? "proximal" : "",
+          s.ToString().c_str());
+      return Status::Corruption(
+          "Not able to resume due to table property reading error " +
+          s.ToString());
+    }
+
+    RestoreCompactionOutputs(cfd, output_files_table_properties,
+                             subcompaction_progress_per_level,
+                             sub_compact->Outputs(is_proximal_level));
+
+    // Skip past all the used file numbers to avoid creating new output files
+    // after resumption that conflict with the existing output files
+    for (const auto& file_meta : output_files) {
+      uint64_t file_number = file_meta.fd.GetNumber();
+      while (versions_->NewFileNumber() <= file_number) {
+        versions_->FetchAddFileNumber(1);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+void CompactionJob::UpdateSubcompactionProgress(
+    const CompactionIterator* c_iter, const Slice next_table_min_key,
+    SubcompactionState* sub_compact) {
+  assert(c_iter);
+  SubcompactionProgress& subcompaction_progress =
+      sub_compact->GetSubcompactionProgressRef();
+
+  IterKey next_ikey_to_compact;
+  next_ikey_to_compact.SetInternalKey(ExtractUserKey(next_table_min_key),
+                                      kMaxSequenceNumber, kValueTypeForSeek);
+  subcompaction_progress.next_internal_key_to_compact =
+      next_ikey_to_compact.GetInternalKey().ToString();
+
+  // Track total processed input records for progress reporting by combining:
+  // - Resumed count: records already processed before compaction was
+  // interrupted
+  // - Current count: records scanned in the current compaction session
+  // Only update when both tracking mechanisms provide accurate counts to ensure
+  // reliability.
+  subcompaction_progress.num_processed_input_records =
+      c_iter->HasNumInputEntryScanned() &&
+              sub_compact->compaction_job_stats.has_accurate_num_input_records
+          ? c_iter->NumInputEntryScanned() +
+                sub_compact->compaction_job_stats.num_input_records
+          : 0;
+
+  UpdateSubcompactionProgressPerLevel(
+      sub_compact, false /* is_proximal_level */, subcompaction_progress);
+
+  if (sub_compact->compaction->SupportsPerKeyPlacement()) {
+    UpdateSubcompactionProgressPerLevel(
+        sub_compact, true /* is_proximal_level */, subcompaction_progress);
+  }
+}
+
+void CompactionJob::UpdateSubcompactionProgressPerLevel(
+    SubcompactionState* sub_compact, bool is_proximal_level,
+    SubcompactionProgress& subcompaction_progress) {
+  SubcompactionProgressPerLevel& subcompaction_progress_per_level =
+      is_proximal_level ? subcompaction_progress.proximal_output_level_progress
+                        : subcompaction_progress.output_level_progress;
+
+  subcompaction_progress_per_level.SetNumProcessedOutputRecords(
+      sub_compact->OutputStats(is_proximal_level)->num_output_records);
+
+  const auto& prev_output_files =
+      subcompaction_progress_per_level.GetOutputFiles();
+
+  const auto& current_output_files =
+      sub_compact->Outputs(is_proximal_level)->GetOutputs();
+
+  for (size_t i = prev_output_files.size(); i < current_output_files.size();
+       i++) {
+    subcompaction_progress_per_level.AddToOutputFiles(
+        current_output_files[i].meta);
+  }
+}
+
+Status CompactionJob::PersistSubcompactionProgress(
+    SubcompactionState* sub_compact) {
+  SubcompactionProgress& subcompaction_progress =
+      sub_compact->GetSubcompactionProgressRef();
+
+  assert(compaction_progress_writer_);
+
+  VersionEdit edit;
+  edit.SetSubcompactionProgress(subcompaction_progress);
+
+  std::string record;
+  if (!edit.EncodeTo(&record)) {
+    ROCKS_LOG_ERROR(
+        db_options_.info_log,
+        "[%s] [JOB %d] Failed to encode subcompaction "
+        "progress",
+        compact_->compaction->column_family_data()->GetName().c_str(), job_id_);
+    return Status::Corruption("Failed to encode subcompaction progress");
+  }
+
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  Status s = compaction_progress_writer_->AddRecord(write_options, record);
+  IOOptions opts;
+  if (s.ok()) {
+    s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  }
+  if (s.ok()) {
+    s = compaction_progress_writer_->file()->Sync(opts, db_options_.use_fsync);
+  }
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(
+        db_options_.info_log,
+        "[%s] [JOB %d] Failed to persist subcompaction "
+        "progress: %s",
+        compact_->compaction->column_family_data()->GetName().c_str(), job_id_,
+        s.ToString().c_str());
+    return s;
+  }
+
+  subcompaction_progress.output_level_progress
+      .UpdateLastPersistedOutputFilesCount();
+
+  subcompaction_progress.proximal_output_level_progress
+      .UpdateLastPersistedOutputFilesCount();
+
+  return Status::OK();
+}
+
+Status CompactionJob::VerifyInputRecordCount(
+    uint64_t num_input_range_del) const {
+  size_t ts_sz = compact_->compaction->column_family_data()
+                     ->user_comparator()
+                     ->timestamp_size();
+  // When trim_ts_ is non-empty, CompactionIterator takes
+  // HistoryTrimmingIterator as input iterator and sees a trimmed view of
+  // input keys. So the number of keys it processed is not suitable for
+  // verification here.
+  // TODO: support verification when trim_ts_ is non-empty.
+  if (!(ts_sz > 0 && !trim_ts_.empty())) {
+    assert(internal_stats_.output_level_stats.num_input_records > 0);
+    // TODO: verify the number of range deletion entries.
+    uint64_t expected = internal_stats_.output_level_stats.num_input_records -
+                        num_input_range_del;
+    uint64_t actual = job_stats_->num_input_records;
+    if (expected != actual) {
+      char scratch[2345];
+      compact_->compaction->Summary(scratch, sizeof(scratch));
+      std::string msg =
+          "Compaction number of input keys does not match "
+          "number of keys processed. Expected " +
+          std::to_string(expected) + " but processed " +
+          std::to_string(actual) + ". Compaction summary: " + scratch;
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "[%s] [JOB %d] VerifyInputRecordCount() Status: %s",
+          compact_->compaction->column_family_data()->GetName().c_str(),
+          job_context_->job_id, msg.c_str());
+      if (db_options_.compaction_verify_record_count) {
+        return Status::Corruption(msg);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status CompactionJob::VerifyOutputRecordCount() const {
+  uint64_t total_output_num = 0;
+  for (const auto& state : compact_->sub_compact_states) {
+    for (const auto& output : state.GetOutputs()) {
+      total_output_num += output.table_properties->num_entries -
+                          output.table_properties->num_range_deletions;
+    }
+  }
+
+  uint64_t expected = internal_stats_.output_level_stats.num_output_records;
+  if (internal_stats_.has_proximal_level_output) {
+    expected += internal_stats_.proximal_level_stats.num_output_records;
+  }
+  if (expected != total_output_num) {
+    char scratch[2345];
+    compact_->compaction->Summary(scratch, sizeof(scratch));
+    std::string msg =
+        "Number of keys in compaction output SST files does not match "
+        "number of keys added. Expected " +
+        std::to_string(expected) + " but there are " +
+        std::to_string(total_output_num) +
+        " in output SST files. Compaction summary: " + scratch;
+    ROCKS_LOG_WARN(
+        db_options_.info_log,
+        "[%s] [JOB %d] VerifyOutputRecordCount() status: %s",
+        compact_->compaction->column_family_data()->GetName().c_str(),
+        job_context_->job_id, msg.c_str());
+    if (db_options_.compaction_verify_record_count) {
+      return Status::Corruption(msg);
+    }
+  }
+  return Status::OK();
+}
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index 730b5ddac945..21486f89538e 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -67,7 +67,7 @@ class SubcompactionState;
 // if needed.
 //
 // CompactionJob has 2 main stats:
-// 1. CompactionJobStats compaction_job_stats_
+// 1. CompactionJobStats job_stats_
 //    CompactionJobStats is a public data structure which is part of Compaction
 //    event listener that rocksdb share the job stats with the user.
 //    Internally it's an aggregation of all the compaction_job_stats from each
@@ -81,7 +81,7 @@ class SubcompactionState;
 // +------------------------+     |
 // | CompactionJob          |     |          +------------------------+
 // |                        |     |          | SubcompactionState     |
-// |   compaction_job_stats +-----+          |                        |
+// |   job_stats            +-----+          |                        |
 // |                        |     +--------->|   compaction_job_stats |
 // |                        |     |          |                        |
 // +------------------------+     |          +------------------------+
@@ -98,16 +98,13 @@ class SubcompactionState;
 //                                +--------->+                        |
 //                                           +------------------------+
 //
-// 2. CompactionStatsFull compaction_stats_
+// 2. CompactionStatsFull internal_stats_
 //    `CompactionStatsFull` is an internal stats about the compaction, which
 //    is eventually sent to `ColumnFamilyData::internal_stats_` and used for
 //    logging and public metrics.
 //    Internally, it's an aggregation of stats_ from each `SubcompactionState`.
-//    It has 2 parts, normal stats about the main compaction information and
-//    the penultimate level output stats.
-//    `SubcompactionState` maintains the CompactionOutputs for normal output and
-//    the penultimate level output if exists, the per_level stats is
-//    stored with the outputs.
+//    It has 2 parts, ordinary output level stats and the proximal level output
+//    stats.
 //                                                +---------------------------+
 //                                                | SubcompactionState        |
 //                                                |                           |
@@ -119,15 +116,15 @@ class SubcompactionState;
 //                                            |   |                           |
 //                                            |   | +----------------------+  |
 // +--------------------------------+         |   | | CompactionOutputs    |  |
-// | CompactionJob                  |         |   | | (penultimate_level)  |  |
+// | CompactionJob                  |         |   | | (proximal_level)     |  |
 // |                                |    +--------->|   stats_             |  |
-// |   compaction_stats_            |    |    |   | +----------------------+  |
+// |   internal_stats_              |    |    |   | +----------------------+  |
 // |    +-------------------------+ |    |    |   |                           |
-// |    |stats (normal)           |------|----+   +---------------------------+
+// |    |output_level_stats       |------|----+   +---------------------------+
 // |    +-------------------------+ |    |    |
 // |                                |    |    |
 // |    +-------------------------+ |    |    |   +---------------------------+
-// |    |penultimate_level_stats  +------+    |   | SubcompactionState        |
+// |    |proximal_level_stats     |------+    |   | SubcompactionState        |
 // |    +-------------------------+ |    |    |   |                           |
 // |                                |    |    |   | +----------------------+  |
 // |                                |    |    |   | | CompactionOutputs    |  |
@@ -137,7 +134,7 @@ class SubcompactionState;
 //                                       |        |                           |
 //                                       |        | +----------------------+  |
 //                                       |        | | CompactionOutputs    |  |
-//                                       |        | | (penultimate_level)  |  |
+//                                       |        | | (proximal_level)     |  |
 //                                       +--------->|   stats_             |  |
 //                                                | +----------------------+  |
 //                                                |                           |
@@ -145,27 +142,31 @@ class SubcompactionState;
 
 class CompactionJob {
  public:
-  CompactionJob(
-      int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
-      const MutableDBOptions& mutable_db_options,
-      const FileOptions& file_options, VersionSet* versions,
-      const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
-      FSDirectory* db_directory, FSDirectory* output_directory,
-      FSDirectory* blob_output_directory, Statistics* stats,
-      InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
-      std::vector<SequenceNumber> existing_snapshots,
-      SequenceNumber earliest_write_conflict_snapshot,
-      const SnapshotChecker* snapshot_checker, JobContext* job_context,
-      std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
-      bool paranoid_file_checks, bool measure_io_stats,
-      const std::string& dbname, CompactionJobStats* compaction_job_stats,
-      Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
-      const std::atomic<bool>& manual_compaction_canceled,
-      const std::string& db_id = "", const std::string& db_session_id = "",
-      std::string full_history_ts_low = "", std::string trim_ts = "",
-      BlobFileCompletionCallback* blob_callback = nullptr,
-      int* bg_compaction_scheduled = nullptr,
-      int* bg_bottom_compaction_scheduled = nullptr);
+  // Constant false aborted flag, used for compaction service jobs
+  static const std::atomic<int> kCompactionAbortedFalse;
+
+  CompactionJob(int job_id, Compaction* compaction,
+                const ImmutableDBOptions& db_options,
+                const MutableDBOptions& mutable_db_options,
+                const FileOptions& file_options, VersionSet* versions,
+                const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+                FSDirectory* db_directory, FSDirectory* output_directory,
+                FSDirectory* blob_output_directory, Statistics* stats,
+                InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+                JobContext* job_context, std::shared_ptr<Cache> table_cache,
+                EventLogger* event_logger, bool paranoid_file_checks,
+                bool measure_io_stats, const std::string& dbname,
+                CompactionJobStats* compaction_job_stats,
+                Env::Priority thread_pri,
+                const std::shared_ptr<IOTracer>& io_tracer,
+                const std::atomic<bool>& manual_compaction_canceled,
+                const std::atomic<int>& compaction_aborted,
+                const std::string& db_id = "",
+                const std::string& db_session_id = "",
+                std::string full_history_ts_low = "", std::string trim_ts = "",
+                BlobFileCompletionCallback* blob_callback = nullptr,
+                int* bg_compaction_scheduled = nullptr,
+                int* bg_bottom_compaction_scheduled = nullptr);
 
   virtual ~CompactionJob();
 
@@ -179,9 +180,20 @@ class CompactionJob {
   // and organizing seqno <-> time info. `known_single_subcompact` is non-null
   // if we already have a known single subcompaction, with optional key bounds
   // (currently for executing a remote compaction).
+  //
+  // @param compaction_progress Previously saved compaction progress
+  //   to resume from. If empty, compaction starts fresh from the
+  //   beginning.
+  //
+  // @param compaction_progress_writer Writer for persisting
+  //   subcompaction progress periodically during compaction
+  //   execution. If nullptr, progress tracking is disabled and compaction
+  //   cannot be resumed later.
   void Prepare(
       std::optional<std::pair<std::optional<Slice>, std::optional<Slice>>>
-          known_single_subcompact);
+          known_single_subcompact,
+      const CompactionProgress& compaction_progress = CompactionProgress{},
+      log::Writer* compaction_progress_writer = nullptr);
 
   // REQUIRED mutex not held
   // Launch threads for each subcompaction and wait for them to finish. After
@@ -199,23 +211,10 @@ class CompactionJob {
   IOStatus io_status() const { return io_status_; }
 
  protected:
-  // Update the following stats in compaction_stats_.stats
-  // - num_input_files_in_non_output_levels
-  // - num_input_files_in_output_level
-  // - bytes_read_non_output_levels
-  // - bytes_read_output_level
-  // - num_input_records
-  // - bytes_read_blob
-  // - num_dropped_records
-  //
-  // @param num_input_range_del if non-null, will be set to the number of range
-  // deletion entries in this compaction input.
-  //
-  // Returns true iff compaction_stats_.stats.num_input_records and
-  // num_input_range_del are calculated successfully.
-  bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr);
-  virtual void UpdateCompactionJobStats(
-      const InternalStats::CompactionStats& stats) const;
+  void UpdateCompactionJobOutputStatsFromInternalStats(
+      const Status& status,
+      const InternalStats::CompactionStatsFull& internal_stats) const;
+
   void LogCompaction();
   virtual void RecordCompactionIOStats();
   void CleanupCompaction();
@@ -224,7 +223,7 @@ class CompactionJob {
   void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
 
   CompactionState* compact_;
-  InternalStats::CompactionStatsFull compaction_stats_;
+  InternalStats::CompactionStatsFull internal_stats_;
   const ImmutableDBOptions& db_options_;
   const MutableDBOptions mutable_db_options_copy_;
   LogBuffer* log_buffer_;
@@ -237,11 +236,42 @@ class CompactionJob {
 
   IOStatus io_status_;
 
-  CompactionJobStats* compaction_job_stats_;
+  CompactionJobStats* job_stats_;
 
  private:
   friend class CompactionJobTestBase;
 
+  // Collect the following stats from input files and table properties
+  // - num_input_files_in_non_output_levels
+  // - num_input_files_in_output_level
+  // - bytes_read_non_output_levels
+  // - bytes_read_output_level
+  // - num_input_records
+  // - bytes_read_blob
+  // - num_dropped_records
+  // and set them in internal_stats_.output_level_stats
+  //
+  // @param num_input_range_del if non-null, will be set to the number of range
+  // deletion entries in this compaction input.
+  //
+  // If any input file has potentially unreliable num_entries count (old SST
+  // files - details in implementation),
+  // job_stats_->has_accurate_num_input_records is set to false.
+  //
+  // Returns true iff internal_stats_.output_level_stats.num_input_records and
+  // num_input_range_del are calculated successfully.
+  //
+  // This should be called only once for compactions (not per subcompaction)
+  bool UpdateInternalStatsFromInputFiles(
+      uint64_t* num_input_range_del = nullptr);
+
+  void UpdateCompactionJobInputStatsFromInternalStats(
+      const InternalStats::CompactionStatsFull& internal_stats,
+      uint64_t num_input_range_del) const;
+
+  Status VerifyInputRecordCount(uint64_t num_input_range_del) const;
+  Status VerifyOutputRecordCount() const;
+
   // Generates a histogram representing potential divisions of key ranges from
   // the input. It adds the starting and/or ending keys of certain input files
   // to the working set and then finds the approximate size of data in between
@@ -249,6 +279,10 @@ class CompactionJob {
   // consecutive groups such that each group has a similar size.
   void GenSubcompactionBoundaries();
 
+  void MaybeAssignCompactionProgressAndWriter(
+      const CompactionProgress& compaction_progress,
+      log::Writer* compaction_progress_writer);
+
   // Get the number of planned subcompactions based on max_subcompactions and
   // extra reserved resources
   uint64_t GetSubcompactionsLimit();
@@ -269,18 +303,141 @@ class CompactionJob {
   // Release all reserved threads and update the compaction limits.
   void ReleaseSubcompactionResources();
 
+  void InitializeCompactionRun();
+  void RunSubcompactions();
+  void UpdateTimingStats(uint64_t start_micros);
+  void RemoveEmptyOutputs();
+  void CleanupAbortedSubcompactions();
+  bool HasNewBlobFiles() const;
+  Status CollectSubcompactionErrors();
+  Status SyncOutputDirectories();
+  Status VerifyOutputFiles();
+  void SetOutputTableProperties();
+  // Aggregates subcompaction output stats to internal stat, and aggregates
+  // subcompaction's compaction job stats to the whole entire surrounding
+  // compaction job stats.
+  void AggregateSubcompactionOutputAndJobStats();
+  Status VerifyCompactionRecordCounts(bool stats_built_from_input_table_prop,
+                                      uint64_t num_input_range_del);
+  void FinalizeCompactionRun(const Status& status,
+                             bool stats_built_from_input_table_prop,
+                             uint64_t num_input_range_del);
+
   CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService(
       SubcompactionState* sub_compact);
 
+  struct CompactionIOStatsSnapshot {
+    PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+    uint64_t prev_write_nanos = 0;
+    uint64_t prev_fsync_nanos = 0;
+    uint64_t prev_range_sync_nanos = 0;
+    uint64_t prev_prepare_write_nanos = 0;
+    uint64_t prev_cpu_write_nanos = 0;
+    uint64_t prev_cpu_read_nanos = 0;
+  };
+
+  struct SubcompactionKeyBoundaries {
+    const std::optional<const Slice> start;
+    const std::optional<const Slice> end;
+
+    // Boundaries without timestamps for read options
+    std::optional<Slice> start_without_ts;
+    std::optional<Slice> end_without_ts;
+
+    // Timestamp management
+    static constexpr char kMaxTs[] =
+        "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+    std::string max_ts;
+    Slice ts_slice;
+
+    // Internal key boundaries
+    IterKey start_ikey;
+    IterKey end_ikey;
+    Slice start_internal_key;
+    Slice end_internal_key;
+
+    // User key boundaries
+    Slice start_user_key;
+    Slice end_user_key;
+
+    SubcompactionKeyBoundaries(std::optional<const Slice> start_boundary,
+                               std::optional<const Slice> end_boundary)
+        : start(start_boundary), end(end_boundary) {}
+  };
+
+  struct SubcompactionInternalIterators {
+    std::unique_ptr<InternalIterator> raw_input;
+    std::unique_ptr<InternalIterator> clip;
+    std::unique_ptr<InternalIterator> blob_counter;
+    std::unique_ptr<InternalIterator> trim_history_iter;
+  };
+
+  bool ShouldUseLocalCompaction(SubcompactionState* sub_compact);
+  CompactionIOStatsSnapshot InitializeIOStats();
+  Status SetupAndValidateCompactionFilter(
+      SubcompactionState* sub_compact,
+      const CompactionFilter* configured_compaction_filter,
+      const CompactionFilter*& compaction_filter,
+      std::unique_ptr<CompactionFilter>& compaction_filter_from_factory);
+  void InitializeReadOptionsAndBoundaries(
+      size_t ts_sz, ReadOptions& read_options,
+      SubcompactionKeyBoundaries& boundaries);
+  InternalIterator* CreateInputIterator(
+      SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+      SubcompactionInternalIterators& iterators,
+      SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options);
+  void CreateBlobFileBuilder(
+      SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+      std::unique_ptr<BlobFileBuilder>& blob_file_builder,
+      const WriteOptions& write_options);
+  std::unique_ptr<CompactionIterator> CreateCompactionIterator(
+      SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+      InternalIterator* input_iter, const CompactionFilter* compaction_filter,
+      MergeHelper& merge, std::unique_ptr<BlobFileBuilder>& blob_file_builder,
+      const WriteOptions& write_options);
+  std::pair<CompactionFileOpenFunc, CompactionFileCloseFunc> CreateFileHandlers(
+      SubcompactionState* sub_compact, SubcompactionKeyBoundaries& boundaries);
+  Status ProcessKeyValue(SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+                         CompactionIterator* c_iter,
+                         const CompactionFileOpenFunc& open_file_func,
+                         const CompactionFileCloseFunc& close_file_func,
+                         uint64_t& prev_cpu_micros);
+  void UpdateSubcompactionJobStatsIncrementally(
+      CompactionIterator* c_iter, CompactionJobStats* compaction_job_stats,
+      uint64_t cur_cpu_micros, uint64_t& prev_cpu_micros);
+  void FinalizeSubcompactionJobStats(SubcompactionState* sub_compact,
+                                     CompactionIterator* c_iter,
+                                     uint64_t start_cpu_micros,
+                                     uint64_t prev_cpu_micros,
+                                     const CompactionIOStatsSnapshot& io_stats);
+  Status FinalizeProcessKeyValueStatus(ColumnFamilyData* cfd,
+                                       InternalIterator* input_iter,
+                                       CompactionIterator* c_iter,
+                                       Status status);
+  Status CleanupCompactionFiles(SubcompactionState* sub_compact, Status status,
+                                const CompactionFileOpenFunc& open_file_func,
+                                const CompactionFileCloseFunc& close_file_func);
+  Status FinalizeBlobFiles(SubcompactionState* sub_compact,
+                           BlobFileBuilder* blob_file_builder, Status status);
+  void FinalizeSubcompaction(SubcompactionState* sub_compact, Status status,
+                             const CompactionFileOpenFunc& open_file_func,
+                             const CompactionFileCloseFunc& close_file_func,
+                             BlobFileBuilder* blob_file_builder,
+                             CompactionIterator* c_iter,
+                             InternalIterator* input_iter,
+                             uint64_t start_cpu_micros,
+                             uint64_t prev_cpu_micros,
+                             const CompactionIOStatsSnapshot& io_stats);
+
   // update the thread status for starting a compaction.
   void ReportStartedCompaction(Compaction* compaction);
 
-  Status FinishCompactionOutputFile(const Status& input_status,
-                                    SubcompactionState* sub_compact,
-                                    CompactionOutputs& outputs,
-                                    const Slice& next_table_min_key,
-                                    const Slice* comp_start_user_key,
-                                    const Slice* comp_end_user_key);
+  Status FinishCompactionOutputFile(
+      const Status& input_status,
+      const ParsedInternalKey& prev_iter_output_internal_key,
+      const Slice& next_table_min_key, const Slice* comp_start_user_key,
+      const Slice* comp_end_user_key, const CompactionIterator* c_iter,
+      SubcompactionState* sub_compact, CompactionOutputs& outputs);
   Status InstallCompactionResults(bool* compaction_released);
   Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
                                   CompactionOutputs& outputs);
@@ -308,25 +465,13 @@ class CompactionJob {
   VersionSet* versions_;
   const std::atomic<bool>* shutting_down_;
   const std::atomic<bool>& manual_compaction_canceled_;
+  const std::atomic<int>& compaction_aborted_;
   FSDirectory* db_directory_;
   FSDirectory* blob_output_directory_;
   InstrumentedMutex* db_mutex_;
   ErrorHandler* db_error_handler_;
-  // If there were two snapshots with seq numbers s1 and
-  // s2 and s1 < s2, and if we find two instances of a key k1 then lies
-  // entirely within s1 and s2, then the earlier version of k1 can be safely
-  // deleted because that version is not visible in any snapshot.
-  std::vector<SequenceNumber> existing_snapshots_;
 
   SequenceNumber earliest_snapshot_;
-
-  // This is the earliest snapshot that could be used for write-conflict
-  // checking by a transaction.  For any user-key newer than this snapshot, we
-  // should make sure not to remove evidence that a write occurred.
-  SequenceNumber earliest_write_conflict_snapshot_;
-
-  const SnapshotChecker* const snapshot_checker_;
-
   JobContext* job_context_;
 
   std::shared_ptr<Cache> table_cache_;
@@ -363,13 +508,16 @@ class CompactionJob {
 
   // Minimal sequence number to preclude the data from the last level. If the
   // key has bigger (newer) sequence number than this, it will be precluded from
-  // the last level (output to penultimate level).
-  SequenceNumber penultimate_after_seqno_ = kMaxSequenceNumber;
+  // the last level (output to proximal level).
+  SequenceNumber proximal_after_seqno_ = kMaxSequenceNumber;
 
   // Options File Number used for Remote Compaction
   // Setting this requires DBMutex.
   uint64_t options_file_number_ = 0;
 
+  // Writer for persisting compaction progress during compaction
+  log::Writer* compaction_progress_writer_ = nullptr;
+
   // Get table file name in where it's outputting to, which should also be in
   // `output_directory_`.
   virtual std::string GetTableFileName(uint64_t file_number);
@@ -377,6 +525,43 @@ class CompactionJob {
   // The Compaction Read and Write priorities are the same for different
   // scenarios, such as write stalled.
   Env::IOPriority GetRateLimiterPriority();
+
+  Status MaybeResumeSubcompactionProgressOnInputIterator(
+      SubcompactionState* sub_compact, InternalIterator* input_iter);
+
+  Status ReadOutputFilesTableProperties(
+      const autovector<FileMetaData>& temporary_output_file_allocation,
+      const ReadOptions& read_options,
+      std::vector<std::shared_ptr<const TableProperties>>&
+          output_files_table_properties,
+      bool is_proximal_level = false);
+
+  Status ReadTablePropertiesDirectly(
+      const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
+      const FileMetaData* file_meta, const ReadOptions& read_options,
+      std::shared_ptr<const TableProperties>* tp);
+
+  void RestoreCompactionOutputs(
+      const ColumnFamilyData* cfd,
+      const std::vector<std::shared_ptr<const TableProperties>>&
+          output_files_table_properties,
+      SubcompactionProgressPerLevel& subcompaction_progress_per_level,
+      CompactionOutputs* outputs_to_restore);
+
+  bool ShouldUpdateSubcompactionProgress(
+      const SubcompactionState* sub_compact, const CompactionIterator* c_iter,
+      const ParsedInternalKey& prev_iter_output_internal_key,
+      const Slice& next_table_min_internal_key, const FileMetaData* meta) const;
+
+  void UpdateSubcompactionProgress(const CompactionIterator* c_iter,
+                                   const Slice next_table_min_key,
+                                   SubcompactionState* sub_compact);
+
+  Status PersistSubcompactionProgress(SubcompactionState* sub_compact);
+
+  void UpdateSubcompactionProgressPerLevel(
+      SubcompactionState* sub_compact, bool is_proximal_level,
+      SubcompactionProgress& subcompaction_progress);
 };
 
 // CompactionServiceInput is used the pass compaction information between two
@@ -418,8 +603,9 @@ struct CompactionServiceInput {
 // CompactionServiceOutputFile is the metadata for the output SST file
 struct CompactionServiceOutputFile {
   std::string file_name;
-  SequenceNumber smallest_seqno;
-  SequenceNumber largest_seqno;
+  uint64_t file_size{};
+  SequenceNumber smallest_seqno{};
+  SequenceNumber largest_seqno{};
   std::string smallest_internal_key;
   std::string largest_internal_key;
   uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
@@ -427,21 +613,26 @@ struct CompactionServiceOutputFile {
   uint64_t epoch_number = kUnknownEpochNumber;
   std::string file_checksum = kUnknownFileChecksum;
   std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
-  uint64_t paranoid_hash;
+  uint64_t paranoid_hash{};
   bool marked_for_compaction;
   UniqueId64x2 unique_id{};
   TableProperties table_properties;
+  bool is_proximal_level_output;
+  Temperature file_temperature = Temperature::kUnknown;
 
   CompactionServiceOutputFile() = default;
   CompactionServiceOutputFile(
-      const std::string& name, SequenceNumber smallest, SequenceNumber largest,
-      std::string _smallest_internal_key, std::string _largest_internal_key,
-      uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
-      uint64_t _epoch_number, const std::string& _file_checksum,
+      const std::string& name, uint64_t size, SequenceNumber smallest,
+      SequenceNumber largest, std::string _smallest_internal_key,
+      std::string _largest_internal_key, uint64_t _oldest_ancester_time,
+      uint64_t _file_creation_time, uint64_t _epoch_number,
+      const std::string& _file_checksum,
       const std::string& _file_checksum_func_name, uint64_t _paranoid_hash,
       bool _marked_for_compaction, UniqueId64x2 _unique_id,
-      const TableProperties& _table_properties)
+      const TableProperties& _table_properties, bool _is_proximal_level_output,
+      Temperature _file_temperature)
       : file_name(name),
+        file_size(size),
         smallest_seqno(smallest),
         largest_seqno(largest),
         smallest_internal_key(std::move(_smallest_internal_key)),
@@ -454,7 +645,9 @@ struct CompactionServiceOutputFile {
         paranoid_hash(_paranoid_hash),
         marked_for_compaction(_marked_for_compaction),
         unique_id(std::move(_unique_id)),
-        table_properties(_table_properties) {}
+        table_properties(_table_properties),
+        is_proximal_level_output(_is_proximal_level_output),
+        file_temperature(_file_temperature) {}
 };
 
 // CompactionServiceResult contains the compaction result from a different db
@@ -470,8 +663,21 @@ struct CompactionServiceResult {
 
   uint64_t bytes_read = 0;
   uint64_t bytes_written = 0;
+
+  // Job-level Compaction Stats.
+  //
+  // NOTE: Job level stats cannot be rebuilt from scratch by simply aggregating
+  // per-level stats due to some fields populated directly during compaction
+  // (e.g. RecordDroppedKeys()). This is why we need both job-level stats and
+  // per-level in the serialized result. If rebuilding job-level stats from
+  // per-level stats become possible in the future, consider deprecating this
+  // field.
   CompactionJobStats stats;
 
+  // Per-level Compaction Stats for both output_level_stats and
+  // proximal_level_stats
+  InternalStats::CompactionStatsFull internal_stats;
+
   // serialization interface to read and write the object
   static Status Read(const std::string& data_str, CompactionServiceResult* obj);
   Status Write(std::string* output);
@@ -494,9 +700,9 @@ class CompactionServiceCompactionJob : private CompactionJob {
       const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
       FSDirectory* output_directory, Statistics* stats,
       InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
-      std::vector<SequenceNumber> existing_snapshots,
-      std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
-      const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+      JobContext* job_context, std::shared_ptr<Cache> table_cache,
+      EventLogger* event_logger, const std::string& dbname,
+      const std::shared_ptr<IOTracer>& io_tracer,
       const std::atomic<bool>& manual_compaction_canceled,
       const std::string& db_id, const std::string& db_session_id,
       std::string output_path,
@@ -505,7 +711,9 @@ class CompactionServiceCompactionJob : private CompactionJob {
 
   // REQUIRED: mutex held
   // Like CompactionJob::Prepare()
-  void Prepare();
+  void Prepare(
+      const CompactionProgress& compaction_progress = CompactionProgress{},
+      log::Writer* compaction_progress_writer = nullptr);
 
   // Run the compaction in current thread and return the result
   Status Run();
@@ -517,9 +725,6 @@ class CompactionServiceCompactionJob : private CompactionJob {
  protected:
   void RecordCompactionIOStats() override;
 
-  void UpdateCompactionJobStats(
-      const InternalStats::CompactionStats& stats) const override;
-
  private:
   // Get table file name in output_path
   std::string GetTableFileName(uint64_t file_number) override;
diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc
index c4a05c951dfc..6a91271520d0 100644
--- a/db/compaction/compaction_job_stats_test.cc
+++ b/db/compaction/compaction_job_stats_test.cc
@@ -82,7 +82,7 @@ class CompactionJobStatsTest : public testing::Test,
   std::string dbname_;
   std::string alternative_wal_dir_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
   std::vector<ColumnFamilyHandle*> handles_;
   uint32_t max_subcompactions_;
 
@@ -123,7 +123,7 @@ class CompactionJobStatsTest : public testing::Test,
   static void SetUpTestCase() {}
   static void TearDownTestCase() {}
 
-  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   void CreateColumnFamilies(const std::vector<std::string>& cfs,
                             const Options& options) {
@@ -162,7 +162,8 @@ class CompactionJobStatsTest : public testing::Test,
       column_families.emplace_back(cfs[i], options[i]);
     }
     DBOptions db_opts = DBOptions(options[0]);
-    return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+    auto s = DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+    return s;
   }
 
   Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
@@ -179,8 +180,7 @@ class CompactionJobStatsTest : public testing::Test,
       delete h;
     }
     handles_.clear();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
   }
 
   void DestroyAndReopen(const Options& options) {
@@ -743,7 +743,7 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
     }
 
     ASSERT_OK(Flush(1));
-    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
     stats_checker->set_verify_next_comp_io_stats(true);
     std::atomic<bool> first_prepare_write(true);
@@ -944,7 +944,7 @@ TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
        start_key += key_base) {
     MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
                            kValueSize, key_interval, compression_ratio, 1);
-    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
   }
   ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
 }
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 1108223a6f29..7a6f77ee222a 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -17,6 +17,7 @@
 #include "db/db_impl/db_impl.h"
 #include "db/error_handler.h"
 #include "db/version_set.h"
+#include "file/filename.h"
 #include "file/random_access_file_reader.h"
 #include "file/writable_file_writer.h"
 #include "options/options_helper.h"
@@ -43,7 +44,6 @@ void VerifyInitializationOfCompactionJobStats(
   ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U);
 
   ASSERT_EQ(compaction_job_stats.num_input_records, 0U);
-  ASSERT_EQ(compaction_job_stats.num_input_files, 0U);
   ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U);
 
   ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
@@ -52,7 +52,6 @@ void VerifyInitializationOfCompactionJobStats(
   ASSERT_TRUE(compaction_job_stats.is_manual_compaction);
   ASSERT_FALSE(compaction_job_stats.is_remote_compaction);
 
-  ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
   ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
 
   ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U);
@@ -212,12 +211,12 @@ class CompactionJobTestBase : public testing::Test {
         table_cache_(NewLRUCache(50000, 16)),
         write_buffer_manager_(db_options_.db_write_buffer_size),
         versions_(new VersionSet(
-            dbname_, &db_options_, env_options_, table_cache_.get(),
-            &write_buffer_manager_, &write_controller_,
+            dbname_, &db_options_, mutable_db_options_, env_options_,
+            table_cache_.get(), &write_buffer_manager_, &write_controller_,
             /*block_cache_tracer=*/nullptr,
             /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"",
             /*daily_offpeak_time_utc=*/"",
-            /*error_handler=*/nullptr, /*read_only=*/false)),
+            /*error_handler=*/nullptr, /*unchanging=*/false)),
         shutting_down_(false),
         mock_table_factory_(new mock::MockTableFactory()),
         error_handler_(nullptr, db_options_, &mutex_),
@@ -460,9 +459,10 @@ class CompactionJobTestBase : public testing::Test {
       ReadOptions read_opts;
       Status s = cf_options_.table_factory->NewTableReader(
           read_opts,
-          TableReaderOptions(cfd->ioptions(), nullptr, FileOptions(),
+          TableReaderOptions(cfd->ioptions(), /*prefix_extractor=*/nullptr,
+                             /*compression_manager=*/nullptr, FileOptions(),
                              cfd_->internal_comparator(),
-                             0 /* block_protection_bytes_per_key */),
+                             /*block_protection_bytes_per_key=*/0),
           std::move(freader), file_size, &table_reader, false);
       ASSERT_OK(s);
       assert(table_reader);
@@ -546,13 +546,13 @@ class CompactionJobTestBase : public testing::Test {
     ASSERT_OK(s);
     db_options_.info_log = info_log;
 
-    versions_.reset(
-        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
-                       &write_buffer_manager_, &write_controller_,
-                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
-                       test::kUnitTestDbId, /*db_session_id=*/"",
-                       /*daily_offpeak_time_utc=*/"",
-                       /*error_handler=*/nullptr, /*read_only=*/false));
+    versions_.reset(new VersionSet(
+        dbname_, &db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
+        /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+        test::kUnitTestDbId, /*db_session_id=*/"",
+        /*daily_offpeak_time_utc=*/"",
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     compaction_job_stats_.Reset();
 
     VersionEdit new_db;
@@ -595,11 +595,11 @@ class CompactionJobTestBase : public testing::Test {
       const std::vector<std::vector<FileMetaData*>>& input_files,
       const std::vector<int> input_levels,
       std::function<void(Compaction& comp)>&& verify_func,
-      const std::vector<SequenceNumber>& snapshots = {}) {
+      std::vector<SequenceNumber>&& snapshots = {}) {
     const int kLastLevel = cf_options_.num_levels - 1;
     verify_per_key_placement_ = std::move(verify_func);
     mock::KVVector empty_map;
-    RunCompaction(input_files, input_levels, {empty_map}, snapshots,
+    RunCompaction(input_files, input_levels, {empty_map}, std::move(snapshots),
                   kMaxSequenceNumber, kLastLevel, false);
   }
 
@@ -608,7 +608,7 @@ class CompactionJobTestBase : public testing::Test {
       const std::vector<std::vector<FileMetaData*>>& input_files,
       const std::vector<int>& input_levels,
       const std::vector<mock::KVVector>& expected_results,
-      const std::vector<SequenceNumber>& snapshots = {},
+      std::vector<SequenceNumber>&& snapshots = {},
       SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
       int output_level = 1, bool verify = true,
       std::vector<uint64_t> expected_oldest_blob_file_numbers = {},
@@ -652,7 +652,8 @@ class CompactionJobTestBase : public testing::Test {
         mutable_cf_options_.max_compaction_bytes, 0, kNoCompression,
         cfd->GetLatestMutableCFOptions().compression_opts,
         Temperature::kUnknown, max_subcompactions, grandparents,
-        /*earliest_snapshot*/ std::nullopt, /*snapshot_checker*/ nullptr, true);
+        /*earliest_snapshot*/ std::nullopt, /*snapshot_checker*/ nullptr,
+        CompactionReason::kManualCompaction);
     compaction.FinalizeInputInfo(cfd->current());
 
     assert(db_options_.info_log);
@@ -665,16 +666,18 @@ class CompactionJobTestBase : public testing::Test {
                 ucmp_->timestamp_size() == full_history_ts_low_.size());
     const std::atomic<bool> kManualCompactionCanceledFalse{false};
     JobContext job_context(1, false /* create_superversion */);
+    job_context.InitSnapshotContext(snapshot_checker, nullptr,
+                                    earliest_write_conflict_snapshot,
+                                    std::move(snapshots));
     CompactionJob compaction_job(
         0, &compaction, db_options_, mutable_db_options_, env_options_,
         versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
-        nullptr, nullptr, &mutex_, &error_handler_, snapshots,
-        earliest_write_conflict_snapshot, snapshot_checker, &job_context,
-        table_cache_, &event_logger, false, false, dbname_,
-        &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */,
+        nullptr, nullptr, &mutex_, &error_handler_, &job_context, table_cache_,
+        &event_logger, false, false, dbname_, &compaction_job_stats_,
+        Env::Priority::USER, nullptr /* IOTracer */,
         /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
-        env_->GenerateUniqueId(), DBImpl::GenerateDbSessionId(nullptr),
-        full_history_ts_low_);
+        CompactionJob::kCompactionAbortedFalse, env_->GenerateUniqueId(),
+        DBImpl::GenerateDbSessionId(nullptr), full_history_ts_low_);
     VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
 
     compaction_job.Prepare(std::nullopt /*subcompact to be computed*/);
@@ -1474,7 +1477,7 @@ TEST_F(CompactionJobTest, OldestBlobFileNumber) {
                 /* expected_oldest_blob_file_numbers */ {19});
 }
 
-TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
+TEST_F(CompactionJobTest, VerifyProximalLevelOutput) {
   cf_options_.last_level_temperature = Temperature::kCold;
   SyncPoint::GetInstance()->SetCallBack(
       "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
@@ -1487,8 +1490,7 @@ TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
   SyncPoint::GetInstance()->SetCallBack(
       "CompactionIterator::PrepareOutput.context", [&](void* arg) {
         auto context = static_cast<PerKeyPlacementContext*>(arg);
-        context->output_to_penultimate_level =
-            context->seq_num > latest_cold_seq;
+        context->output_to_proximal_level = context->seq_num > latest_cold_seq;
       });
   SyncPoint::GetInstance()->EnableProcessing();
 
@@ -1534,11 +1536,11 @@ TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
       /*verify_func=*/[&](Compaction& comp) {
         for (char c = 'a'; c <= 'z'; c++) {
           if (c == 'a') {
-            comp.TEST_AssertWithinPenultimateLevelOutputRange(
+            comp.TEST_AssertWithinProximalLevelOutputRange(
                 "a", true /*expect_failure*/);
           } else {
             std::string c_str{c};
-            comp.TEST_AssertWithinPenultimateLevelOutputRange(c_str);
+            comp.TEST_AssertWithinProximalLevelOutputRange(c_str);
           }
         }
       });
@@ -1670,6 +1672,7 @@ TEST_F(CompactionJobTest, ResultSerialization) {
     UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)};
     result.output_files.emplace_back(
         rnd.RandomString(rnd.Uniform(kStrMaxLen)) /* file_name */,
+        rnd64.Uniform(UINT64_MAX) /* file_size */,
         rnd64.Uniform(UINT64_MAX) /* smallest_seqno */,
         rnd64.Uniform(UINT64_MAX) /* largest_seqno */,
         rnd.RandomBinaryString(
@@ -1682,7 +1685,8 @@ TEST_F(CompactionJobTest, ResultSerialization) {
         file_checksum /* file_checksum */,
         file_checksum_func_name /* file_checksum_func_name */,
         rnd64.Uniform(UINT64_MAX) /* paranoid_hash */,
-        rnd.OneIn(2) /* marked_for_compaction */, id /* unique_id */, tp);
+        rnd.OneIn(2) /* marked_for_compaction */, id /* unique_id */, tp,
+        false /* is_proximal_level_output */, Temperature::kHot);
   }
   result.output_level = rnd.Uniform(10);
   result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
@@ -1736,6 +1740,8 @@ TEST_F(CompactionJobTest, ResultSerialization) {
     ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum, file_checksum);
     ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum_func_name,
               file_checksum_func_name);
+    ASSERT_EQ(deserialized_tmp.output_files[0].file_temperature,
+              Temperature::kHot);
   }
 
   // Test unknown field
@@ -2033,7 +2039,7 @@ TEST_F(CompactionJobTest, CutToAlignGrandparentBoundarySameKey) {
     snapshots.emplace_back(i);
   }
   RunCompaction({lvl0_files, lvl1_files}, input_levels,
-                {expected_file1, expected_file2}, snapshots);
+                {expected_file1, expected_file2}, std::move(snapshots));
 }
 
 TEST_F(CompactionJobTest, CutForMaxCompactionBytesSameKey) {
@@ -2092,7 +2098,8 @@ TEST_F(CompactionJobTest, CutForMaxCompactionBytesSameKey) {
     snapshots.emplace_back(i);
   }
   RunCompaction({lvl0_files, lvl1_files}, input_levels,
-                {expected_file1, expected_file2, expected_file3}, snapshots);
+                {expected_file1, expected_file2, expected_file3},
+                std::move(snapshots));
 }
 
 class CompactionJobTimestampTest : public CompactionJobTestBase {
@@ -2402,7 +2409,6 @@ TEST_F(CompactionJobIOPriorityTest, GetRateLimiterPriority) {
                 kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, true,
                 Env::IO_LOW, Env::IO_LOW);
 }
-
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index 3e1c4402cea3..8c86df870dee 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -49,12 +49,16 @@ Status CompactionOutputs::Finish(
     meta->fd.file_size = current_bytes;
     meta->tail_size = builder_->GetTailSize();
     meta->marked_for_compaction = builder_->NeedCompact();
-    meta->user_defined_timestamps_persisted = static_cast<bool>(
-        builder_->GetTableProperties().user_defined_timestamps_persisted);
+    const TableProperties& tp = builder_->GetTableProperties();
+    meta->user_defined_timestamps_persisted =
+        static_cast<bool>(tp.user_defined_timestamps_persisted);
+    ExtractTimestampFromTableProperties(tp, meta);
   }
   current_output().finished = true;
   stats_.bytes_written += current_bytes;
-  stats_.num_output_files = outputs_.size();
+  stats_.bytes_written_pre_comp += builder_->PreCompressionSize();
+  stats_.num_output_files = static_cast<int>(outputs_.size());
+  worker_cpu_micros_ += builder_->GetWorkerCPUMicros();
 
   return s;
 }
@@ -276,7 +280,11 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
   }
 
   // reach the max file size
-  if (current_output_file_size_ >= compaction_->max_output_file_size()) {
+  uint64_t estimated_file_size = current_output_file_size_;
+  if (compaction_->mutable_cf_options().target_file_size_is_upper_bound) {
+    estimated_file_size += builder_->EstimatedTailSize();
+  }
+  if (estimated_file_size >= compaction_->max_output_file_size()) {
     return true;
   }
 
@@ -357,7 +365,8 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
 Status CompactionOutputs::AddToOutput(
     const CompactionIterator& c_iter,
     const CompactionFileOpenFunc& open_file_func,
-    const CompactionFileCloseFunc& close_file_func) {
+    const CompactionFileCloseFunc& close_file_func,
+    const ParsedInternalKey& prev_iter_output_internal_key) {
   Status s;
   bool is_range_del = c_iter.IsDeleteRangeSentinelKey();
   if (is_range_del && compaction_->bottommost_level()) {
@@ -368,7 +377,8 @@ Status CompactionOutputs::AddToOutput(
   }
   const Slice& key = c_iter.key();
   if (ShouldStopBefore(c_iter) && HasBuilder()) {
-    s = close_file_func(*this, c_iter.InputStatus(), key);
+    s = close_file_func(c_iter.InputStatus(), prev_iter_output_internal_key,
+                        key, &c_iter, *this);
     if (!s.ok()) {
       return s;
     }
@@ -792,8 +802,8 @@ void CompactionOutputs::FillFilesToCutForTtl() {
 }
 
 CompactionOutputs::CompactionOutputs(const Compaction* compaction,
-                                     const bool is_penultimate_level)
-    : compaction_(compaction), is_penultimate_level_(is_penultimate_level) {
+                                     const bool is_proximal_level)
+    : compaction_(compaction), is_proximal_level_(is_proximal_level) {
   partitioner_ = compaction->output_level() == 0
                      ? nullptr
                      : compaction->CreateSstPartitioner();
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index 33259be4670a..757e1b6b85ed 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -21,7 +21,8 @@ namespace ROCKSDB_NAMESPACE {
 class CompactionOutputs;
 using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>;
 using CompactionFileCloseFunc =
-    std::function<Status(CompactionOutputs&, const Status&, const Slice&)>;
+    std::function<Status(const Status&, const ParsedInternalKey&, const Slice&,
+                         const CompactionIterator*, CompactionOutputs&)>;
 
 // Files produced by subcompaction, most of the functions are used by
 // compaction_job Open/Close compaction file functions.
@@ -30,31 +31,36 @@ class CompactionOutputs {
   // compaction output file
   struct Output {
     Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
-           bool _enable_hash, bool _finished, uint64_t precalculated_hash)
+           bool _enable_hash, bool _finished, uint64_t precalculated_hash,
+           bool _is_proximal_level)
         : meta(std::move(_meta)),
           validator(_icmp, _enable_hash, precalculated_hash),
-          finished(_finished) {}
+          finished(_finished),
+          is_proximal_level(_is_proximal_level) {}
     FileMetaData meta;
     OutputValidator validator;
     bool finished;
+    bool is_proximal_level;
     std::shared_ptr<const TableProperties> table_properties;
   };
 
   CompactionOutputs() = delete;
 
   explicit CompactionOutputs(const Compaction* compaction,
-                             const bool is_penultimate_level);
+                             const bool is_proximal_level);
 
-  bool IsPenultimateLevel() const { return is_penultimate_level_; }
+  bool IsProximalLevel() const { return is_proximal_level_; }
 
   // Add generated output to the list
   void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp,
                  bool enable_hash, bool finished = false,
                  uint64_t precalculated_hash = 0) {
     outputs_.emplace_back(std::move(meta), icmp, enable_hash, finished,
-                          precalculated_hash);
+                          precalculated_hash, is_proximal_level_);
   }
 
+  const std::vector<Output>& GetOutputs() const { return outputs_; }
+
   // Set new table builder for the current output
   void NewBuilder(const TableBuilderOptions& tboptions);
 
@@ -63,34 +69,42 @@ class CompactionOutputs {
     file_writer_.reset(writer);
   }
 
-  // TODO: Remove it when remote compaction support tiered compaction
-  void AddBytesWritten(uint64_t bytes) { stats_.bytes_written += bytes; }
-  void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
-  void SetNumOutputFiles(uint64_t num) { stats_.num_output_files = num; }
-
   // TODO: Move the BlobDB builder into CompactionOutputs
   const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
-    if (is_penultimate_level_) {
+    if (is_proximal_level_) {
       assert(blob_file_additions_.empty());
     }
     return blob_file_additions_;
   }
 
   std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() {
-    assert(!is_penultimate_level_);
+    assert(!is_proximal_level_);
     return &blob_file_additions_;
   }
 
   bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
 
+  // Get all file paths (SST and blob) created during compaction.
+  const std::vector<std::string>& GetOutputFilePaths() const {
+    return output_file_paths_;
+  }
+
+  std::vector<std::string>* GetOutputFilePathsPtr() {
+    return &output_file_paths_;
+  }
+
+  void AddOutputFilePath(const std::string& path) {
+    output_file_paths_.push_back(path);
+  }
+
   BlobGarbageMeter* CreateBlobGarbageMeter() {
-    assert(!is_penultimate_level_);
+    assert(!is_proximal_level_);
     blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
     return blob_garbage_meter_.get();
   }
 
   BlobGarbageMeter* GetBlobGarbageMeter() const {
-    if (is_penultimate_level_) {
+    if (is_proximal_level_) {
       // blobdb doesn't support per_key_placement yet
       assert(blob_garbage_meter_ == nullptr);
       return nullptr;
@@ -99,8 +113,9 @@ class CompactionOutputs {
   }
 
   void UpdateBlobStats() {
-    assert(!is_penultimate_level_);
-    stats_.num_output_files_blob = blob_file_additions_.size();
+    assert(!is_proximal_level_);
+    stats_.num_output_files_blob =
+        static_cast<int>(blob_file_additions_.size());
     for (const auto& blob : blob_file_additions_) {
       stats_.bytes_written_blob += blob.GetTotalBlobBytes();
     }
@@ -169,6 +184,10 @@ class CompactionOutputs {
 
   uint64_t NumEntries() const { return builder_->NumEntries(); }
 
+  uint64_t GetWorkerCPUMicros() const {
+    return worker_cpu_micros_ + (builder_ ? builder_->GetWorkerCPUMicros() : 0);
+  }
+
   void ResetBuilder() {
     builder_.reset();
     current_output_file_size_ = 0;
@@ -192,6 +211,10 @@ class CompactionOutputs {
       std::pair<SequenceNumber, SequenceNumber> keep_seqno_range,
       const Slice& next_table_min_key, const std::string& full_history_ts_low);
 
+  void SetNumOutputRecords(uint64_t num_output_records) {
+    stats_.num_output_records = num_output_records;
+  }
+
  private:
   friend class SubcompactionState;
 
@@ -251,7 +274,8 @@ class CompactionOutputs {
   // close and open new compaction output with the functions provided.
   Status AddToOutput(const CompactionIterator& c_iter,
                      const CompactionFileOpenFunc& open_file_func,
-                     const CompactionFileCloseFunc& close_file_func);
+                     const CompactionFileCloseFunc& close_file_func,
+                     const ParsedInternalKey& prev_iter_output_internal_key);
 
   // Close the current output. `open_file_func` is needed for creating new file
   // for range-dels only output file.
@@ -267,9 +291,12 @@ class CompactionOutputs {
         !range_del_agg->IsEmpty()) {
       status = open_file_func(*this);
     }
+
     if (HasBuilder()) {
+      const ParsedInternalKey empty_internal_key{};
       const Slice empty_key{};
-      Status s = close_file_func(*this, status, empty_key);
+      Status s = close_file_func(status, empty_internal_key, empty_key,
+                                 nullptr /* c_iter */, *this);
       if (!s.ok() && status.ok()) {
         status = s;
       }
@@ -297,6 +324,9 @@ class CompactionOutputs {
   uint64_t current_output_file_size_ = 0;
   SequenceNumber smallest_preferred_seqno_ = kMaxSequenceNumber;
 
+  // Sum of all the GetWorkerCPUMicros() for all the closed builders so far.
+  uint64_t worker_cpu_micros_ = 0;
+
   // all the compaction outputs so far
   std::vector<Output> outputs_;
 
@@ -304,12 +334,18 @@ class CompactionOutputs {
   std::vector<BlobFileAddition> blob_file_additions_;
   std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
 
-  // Basic compaction output stats for this level's outputs
-  InternalStats::CompactionOutputsStats stats_;
+  // All file paths (SST and blob) created during compaction.
+  // Used for cleanup on abort - ensures orphan files are deleted even if
+  // they were removed from outputs_ or blob_file_additions_ (e.g., by
+  // RemoveLastEmptyOutput when file_size is 0 because builder was abandoned).
+  std::vector<std::string> output_file_paths_;
+
+  // Per level's output stat
+  InternalStats::CompactionStats stats_;
 
-  // indicate if this CompactionOutputs obj for penultimate_level, should always
+  // indicate if this CompactionOutputs obj for proximal_level, should always
   // be false if per_key_placement feature is not enabled.
-  const bool is_penultimate_level_;
+  const bool is_proximal_level_;
 
   // partitioner information
   std::string last_key_for_partitioner_;
@@ -363,7 +399,7 @@ class CompactionOutputs {
   std::vector<size_t> level_ptrs_;
 };
 
-// helper struct to concatenate the last level and penultimate level outputs
+// helper struct to concatenate the last level and proximal level outputs
 // which could be replaced by std::ranges::join_view() in c++20
 struct OutputIterator {
  public:
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index 946dab5ddefe..14c25677c0b9 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -27,12 +27,68 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
-                           size_t min_files_to_compact,
-                           uint64_t max_compact_bytes_per_del_file,
-                           uint64_t max_compaction_bytes,
-                           CompactionInputFiles* comp_inputs) {
-  TEST_SYNC_POINT("FindIntraL0Compaction");
+#ifndef NDEBUG
+static void AssertCleanCut(const InternalKeyComparator* icmp,
+                           VersionStorageInfo* vstorage,
+                           CompactionInputFiles* inputs, int level,
+                           Logger* logger) {
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(level);
+  if (inputs->files.empty() || level_files.empty()) {
+    return;
+  }
+
+  const Comparator* ucmp = icmp->user_comparator();
+
+  // Find first and last input file indices in level
+  int first_input_idx = -1;
+  int last_input_idx = -1;
+  for (size_t i = 0; i < level_files.size(); i++) {
+    if (level_files[i] == inputs->files.front()) {
+      first_input_idx = static_cast<int>(i);
+    }
+    if (level_files[i] == inputs->files.back()) {
+      last_input_idx = static_cast<int>(i);
+    }
+  }
+
+  // Check file before first input
+  if (first_input_idx > 0) {
+    const FileMetaData* prev_file = level_files[first_input_idx - 1];
+    const FileMetaData* first_file = inputs->files.front();
+    int cmp = sstableKeyCompare(ucmp, prev_file->largest, first_file->smallest);
+    if (cmp == 0) {
+      ROCKS_LOG_ERROR(logger,
+                      "Clean cut violated: L%d unselected file %" PRIu64
+                      " adjacent to first selected file %" PRIu64,
+                      level, prev_file->fd.GetNumber(),
+                      first_file->fd.GetNumber());
+      assert(false);
+    }
+  }
+
+  // Check file after last input
+  if (last_input_idx >= 0 &&
+      static_cast<size_t>(last_input_idx) < level_files.size() - 1) {
+    const FileMetaData* last_file = inputs->files.back();
+    const FileMetaData* next_file = level_files[last_input_idx + 1];
+    int cmp = sstableKeyCompare(ucmp, last_file->largest, next_file->smallest);
+    if (cmp == 0) {
+      ROCKS_LOG_ERROR(logger,
+                      "Clean cut violated: L%d unselected file %" PRIu64
+                      " adjacent to last selected file %" PRIu64,
+                      level, next_file->fd.GetNumber(),
+                      last_file->fd.GetNumber());
+      assert(false);
+    }
+  }
+}
+#endif  // NDEBUG
+
+bool PickCostBasedIntraL0Compaction(
+    const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+    uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+    CompactionInputFiles* comp_inputs) {
+  TEST_SYNC_POINT("PickCostBasedIntraL0Compaction");
 
   size_t start = 0;
 
@@ -242,7 +298,7 @@ bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
     GetRange(*inputs, &smallest, &largest);
     inputs->clear();
     vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
-                                   hint_index, &hint_index, true,
+                                   hint_index, &hint_index, true, nullptr,
                                    next_smallest);
   } while (inputs->size() > old_size);
 
@@ -250,6 +306,10 @@ bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
   // inputs. thus, inputs should be non-empty here
   assert(!inputs->empty());
 
+#ifndef NDEBUG
+  AssertCleanCut(icmp_, vstorage, inputs, level, ioptions_.logger);
+#endif  // NDEBUG
+
   // If, after the expansion, there are files that are already under
   // compaction, then we must drop/cancel this compaction.
   if (AreFilesInCompaction(inputs->files)) {
@@ -272,8 +332,8 @@ bool CompactionPicker::RangeOverlapWithCompaction(
       return true;
     }
     if (c->SupportsPerKeyPlacement()) {
-      if (c->OverlapPenultimateLevelOutputRange(smallest_user_key,
-                                                largest_user_key)) {
+      if (c->OverlapProximalLevelOutputRange(smallest_user_key,
+                                             largest_user_key)) {
         return true;
       }
     }
@@ -284,7 +344,7 @@ bool CompactionPicker::RangeOverlapWithCompaction(
 
 bool CompactionPicker::FilesRangeOverlapWithCompaction(
     const std::vector<CompactionInputFiles>& inputs, int level,
-    int penultimate_level) const {
+    int proximal_level) const {
   bool is_empty = true;
   for (auto& in : inputs) {
     if (!in.empty()) {
@@ -301,18 +361,18 @@ bool CompactionPicker::FilesRangeOverlapWithCompaction(
   //  files cannot be overlapped in the order of L0 files.
   InternalKey smallest, largest;
   GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel);
-  if (penultimate_level != Compaction::kInvalidLevel) {
+  if (proximal_level != Compaction::kInvalidLevel) {
     if (ioptions_.compaction_style == kCompactionStyleUniversal) {
       if (RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
-                                     penultimate_level)) {
+                                     proximal_level)) {
         return true;
       }
     } else {
-      InternalKey penultimate_smallest, penultimate_largest;
-      GetRange(inputs, &penultimate_smallest, &penultimate_largest, level);
-      if (RangeOverlapWithCompaction(penultimate_smallest.user_key(),
-                                     penultimate_largest.user_key(),
-                                     penultimate_level)) {
+      InternalKey proximal_smallest, proximal_largest;
+      GetRange(inputs, &proximal_smallest, &proximal_largest, level);
+      if (RangeOverlapWithCompaction(proximal_smallest.user_key(),
+                                     proximal_largest.user_key(),
+                                     proximal_level)) {
         return true;
       }
     }
@@ -333,11 +393,13 @@ bool CompactionPicker::AreFilesInCompaction(
   return false;
 }
 
-Compaction* CompactionPicker::CompactFiles(
+Compaction* CompactionPicker::PickCompactionForCompactFiles(
     const CompactionOptions& compact_options,
     const std::vector<CompactionInputFiles>& input_files, int output_level,
     VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
-    const MutableDBOptions& mutable_db_options, uint32_t output_path_id) {
+    const MutableDBOptions& mutable_db_options, uint32_t output_path_id,
+    std::optional<SequenceNumber> earliest_snapshot,
+    const SnapshotChecker* snapshot_checker) {
 #ifndef NDEBUG
   assert(input_files.size());
   // This compaction output should not overlap with a running compaction as
@@ -353,7 +415,7 @@ Compaction* CompactionPicker::CompactFiles(
   }
   assert(output_level == 0 || !FilesRangeOverlapWithCompaction(
                                   input_files, output_level,
-                                  Compaction::EvaluatePenultimateLevel(
+                                  Compaction::EvaluateProximalLevel(
                                       vstorage, mutable_cf_options, ioptions_,
                                       start_level, output_level)));
 #endif /* !NDEBUG */
@@ -373,15 +435,16 @@ Compaction* CompactionPicker::CompactFiles(
     // without configurable `CompressionOptions`, which is inconsistent.
     compression_type = compact_options.compression;
   }
+
   auto c = new Compaction(
       vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files,
       output_level, compact_options.output_file_size_limit,
       mutable_cf_options.max_compaction_bytes, output_path_id, compression_type,
       GetCompressionOptions(mutable_cf_options, vstorage, output_level),
-      mutable_cf_options.default_write_temperature,
+      compact_options.output_temperature_override,
       compact_options.max_subcompactions,
-      /* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
-      /* snapshot_checker */ nullptr, true);
+      /* grandparents */ {}, earliest_snapshot, snapshot_checker,
+      CompactionReason::kManualCompaction);
   RegisterCompaction(c);
   return c;
 }
@@ -462,7 +525,8 @@ bool CompactionPicker::SetupOtherInputs(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
     CompactionInputFiles* output_level_inputs, int* parent_index,
-    int base_index, bool only_expand_towards_right) {
+    int base_index, bool only_expand_towards_right,
+    const FileMetaData* starting_l0_file) {
   assert(!inputs->empty());
   assert(output_level_inputs->empty());
   const int input_level = inputs->level;
@@ -518,11 +582,11 @@ bool CompactionPicker::SetupOtherInputs(
       // Round-robin compaction only allows expansion towards the larger side.
       vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit,
                                      &expanded_inputs.files, base_index,
-                                     nullptr);
+                                     nullptr, true, starting_l0_file);
     } else {
       vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
                                      &expanded_inputs.files, base_index,
-                                     nullptr);
+                                     nullptr, true, starting_l0_file);
     }
     uint64_t expanded_inputs_size = TotalFileSize(expanded_inputs.files);
     if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
@@ -601,13 +665,14 @@ void CompactionPicker::GetGrandparents(
   }
 }
 
-Compaction* CompactionPicker::CompactRange(
+Compaction* CompactionPicker::PickCompactionForCompactRange(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
     int input_level, int output_level,
     const CompactRangeOptions& compact_range_options, const InternalKey* begin,
     const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict,
-    uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+    uint64_t max_file_num_to_ignore, const std::string& trim_ts,
+    const std::string& full_history_ts_low) {
   // CompactionPickerFIFO has its own implementation of compact range
   assert(ioptions_.compaction_style != kCompactionStyleFIFO);
 
@@ -617,8 +682,8 @@ Compaction* CompactionPicker::CompactRange(
     // Universal compaction with more than one level always compacts all the
     // files together to the last level.
     assert(vstorage->num_levels() > 1);
-    int max_output_level =
-        vstorage->MaxOutputLevel(ioptions_.allow_ingest_behind);
+    int max_output_level = vstorage->MaxOutputLevel(
+        ioptions_.cf_allow_ingest_behind || ioptions_.allow_ingest_behind);
     // DBImpl::CompactRange() set output level to be the last level
     assert(output_level == max_output_level);
     // DBImpl::RunManualCompaction will make full range for universal compaction
@@ -659,9 +724,9 @@ Compaction* CompactionPicker::CompactRange(
     // overlaping outputs in the same level.
     if (FilesRangeOverlapWithCompaction(
             inputs, output_level,
-            Compaction::EvaluatePenultimateLevel(vstorage, mutable_cf_options,
-                                                 ioptions_, start_level,
-                                                 output_level))) {
+            Compaction::EvaluateProximalLevel(vstorage, mutable_cf_options,
+                                              ioptions_, start_level,
+                                              output_level))) {
       // This compaction output could potentially conflict with the output
       // of a currently running compaction, we cannot run it.
       *manual_conflict = true;
@@ -677,18 +742,17 @@ Compaction* CompactionPicker::CompactRange(
         compact_range_options.target_path_id,
         GetCompressionType(vstorage, mutable_cf_options, output_level, 1),
         GetCompressionOptions(mutable_cf_options, vstorage, output_level),
-        mutable_cf_options.default_write_temperature,
-        compact_range_options.max_subcompactions,
+        Temperature::kUnknown, compact_range_options.max_subcompactions,
         /* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
-        /* snapshot_checker */ nullptr,
-        /* is manual */ true, trim_ts, /* score */ -1,
-        /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
-        CompactionReason::kUnknown,
+        /* snapshot_checker */ nullptr, CompactionReason::kManualCompaction,
+        trim_ts, /* score */ -1,
+        /* l0_files_might_overlap */ true,
         compact_range_options.blob_garbage_collection_policy,
         compact_range_options.blob_garbage_collection_age_cutoff);
 
     RegisterCompaction(c);
-    vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+    vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options,
+                                     full_history_ts_low);
     return c;
   }
 
@@ -848,9 +912,9 @@ Compaction* CompactionPicker::CompactRange(
   // overlaping outputs in the same level.
   if (FilesRangeOverlapWithCompaction(
           compaction_inputs, output_level,
-          Compaction::EvaluatePenultimateLevel(vstorage, mutable_cf_options,
-                                               ioptions_, input_level,
-                                               output_level))) {
+          Compaction::EvaluateProximalLevel(vstorage, mutable_cf_options,
+                                            ioptions_, input_level,
+                                            output_level))) {
     // This compaction output could potentially conflict with the output
     // of a currently running compaction, we cannot run it.
     *manual_conflict = true;
@@ -870,12 +934,11 @@ Compaction* CompactionPicker::CompactRange(
       GetCompressionType(vstorage, mutable_cf_options, output_level,
                          vstorage->base_level()),
       GetCompressionOptions(mutable_cf_options, vstorage, output_level),
-      mutable_cf_options.default_write_temperature,
-      compact_range_options.max_subcompactions, std::move(grandparents),
+      Temperature::kUnknown, compact_range_options.max_subcompactions,
+      std::move(grandparents),
       /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
-      /* is manual */ true, trim_ts, /* score */ -1,
-      /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
-      CompactionReason::kUnknown,
+      CompactionReason::kManualCompaction, trim_ts, /* score */ -1,
+      /* l0_files_might_overlap */ true,
       compact_range_options.blob_garbage_collection_policy,
       compact_range_options.blob_garbage_collection_age_cutoff);
 
@@ -886,7 +949,8 @@ Compaction* CompactionPicker::CompactRange(
   // takes running compactions into account (by skipping files that are already
   // being compacted). Since we just changed compaction score, we recalculate it
   // here
-  vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+  vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options,
+                                   full_history_ts_low);
 
   return compaction;
 }
@@ -1137,7 +1201,7 @@ Status CompactionPicker::SanitizeAndConvertCompactionInputFiles(
   if (output_level != 0 &&
       FilesRangeOverlapWithCompaction(
           *converted_input_files, output_level,
-          Compaction::EvaluatePenultimateLevel(
+          Compaction::EvaluateProximalLevel(
               version->storage_info(), version->GetMutableCFOptions(),
               ioptions_, (*converted_input_files)[0].level, output_level))) {
     return Status::Aborted(
@@ -1154,7 +1218,7 @@ void CompactionPicker::RegisterCompaction(Compaction* c) {
   assert(ioptions_.compaction_style != kCompactionStyleLevel ||
          c->output_level() == 0 ||
          !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level(),
-                                          c->GetPenultimateLevel()));
+                                          c->GetProximalLevel()));
   // CompactionReason::kExternalSstIngestion's start level is just a placeholder
   // number without actual meaning as file ingestion technically does not have
   // an input level like other compactions
@@ -1231,7 +1295,7 @@ void CompactionPicker::PickFilesMarkedForCompaction(
 
 bool CompactionPicker::GetOverlappingL0Files(
     VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
-    int output_level, int* parent_index) {
+    int output_level, int* parent_index, const FileMetaData* starting_l0_file) {
   // Two level 0 compaction won't run at the same time, so don't need to worry
   // about files on level 0 being compacted.
   assert(level0_compactions_in_progress()->empty());
@@ -1242,7 +1306,11 @@ bool CompactionPicker::GetOverlappingL0Files(
   // which will include the picked file.
   start_level_inputs->files.clear();
   vstorage->GetOverlappingInputs(0, &smallest, &largest,
-                                 &(start_level_inputs->files));
+                                 &(start_level_inputs->files),
+                                 /*hint_index=*/-1,
+                                 /*file_index=*/nullptr,
+                                 /*expand_range=*/true,
+                                 /*starting_l0_file=*/starting_l0_file);
 
   // If we include more L0 files in the same compaction run it can
   // cause the 'smallest' and 'largest' key to get extended to a
diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h
index 6285e054301e..bb9b22456e50 100644
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@@ -65,7 +65,8 @@ class CompactionPicker {
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& existing_snapshots,
       const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
-      LogBuffer* log_buffer) = 0;
+      LogBuffer* log_buffer, const std::string& full_history_ts_low,
+      bool require_max_output_level = false) = 0;
 
   // The returned Compaction might not include the whole requested range.
   // In that case, compaction_end will be set to the next key that needs
@@ -75,14 +76,15 @@ class CompactionPicker {
   // *compaction_end should point to valid InternalKey!
   // REQUIRES: If not compacting all levels (input_level == kCompactAllLevels),
   // then levels between input_level and output_level should be empty.
-  virtual Compaction* CompactRange(
+  virtual Compaction* PickCompactionForCompactRange(
       const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
       const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
       int input_level, int output_level,
       const CompactRangeOptions& compact_range_options,
       const InternalKey* begin, const InternalKey* end,
       InternalKey** compaction_end, bool* manual_conflict,
-      uint64_t max_file_num_to_ignore, const std::string& trim_ts);
+      uint64_t max_file_num_to_ignore, const std::string& trim_ts,
+      const std::string& full_history_ts_low);
 
   // The maximum allowed output level.  Default value is NumberLevels() - 1.
   virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
@@ -117,12 +119,17 @@ class CompactionPicker {
   // Caller must provide a set of input files that has been passed through
   // `SanitizeAndConvertCompactionInputFiles` earlier. The lock should not be
   // released between that call and this one.
-  Compaction* CompactFiles(const CompactionOptions& compact_options,
-                           const std::vector<CompactionInputFiles>& input_files,
-                           int output_level, VersionStorageInfo* vstorage,
-                           const MutableCFOptions& mutable_cf_options,
-                           const MutableDBOptions& mutable_db_options,
-                           uint32_t output_path_id);
+  //
+  //  TODO - Remove default values for earliest_snapshot and snapshot_checker
+  //  and require all callers to pass them in so that DB::CompactFiles() can
+  //  also benefit from Standalone Range Tombstone Optimization
+  Compaction* PickCompactionForCompactFiles(
+      const CompactionOptions& compact_options,
+      const std::vector<CompactionInputFiles>& input_files, int output_level,
+      VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, uint32_t output_path_id,
+      std::optional<SequenceNumber> earliest_snapshot = std::nullopt,
+      const SnapshotChecker* snapshot_checker = nullptr);
 
   // Converts a set of compaction input file numbers into
   // a list of CompactionInputFiles.
@@ -138,6 +145,12 @@ class CompactionPicker {
     return !level0_compactions_in_progress_.empty();
   }
 
+  // Is any compaction in progress
+  bool IsCompactionInProgress() const {
+    return !(level0_compactions_in_progress_.empty() &&
+             compactions_in_progress_.empty());
+  }
+
   // Return true if the passed key range overlap with a compaction output
   // that is currently running.
   bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
@@ -190,15 +203,18 @@ class CompactionPicker {
   // key range of a currently running compaction.
   bool FilesRangeOverlapWithCompaction(
       const std::vector<CompactionInputFiles>& inputs, int level,
-      int penultimate_level) const;
+      int proximal_level) const;
 
+  // @param starting_l0_file If not null, restricts L0 file selection to only
+  //                         include files at or older than starting_l0_file.
   bool SetupOtherInputs(const std::string& cf_name,
                         const MutableCFOptions& mutable_cf_options,
                         VersionStorageInfo* vstorage,
                         CompactionInputFiles* inputs,
                         CompactionInputFiles* output_level_inputs,
                         int* parent_index, int base_index,
-                        bool only_expand_towards_right = false);
+                        bool only_expand_towards_right = false,
+                        const FileMetaData* starting_l0_file = nullptr);
 
   void GetGrandparents(VersionStorageInfo* vstorage,
                        const CompactionInputFiles& inputs,
@@ -211,9 +227,12 @@ class CompactionPicker {
       CompactionInputFiles* start_level_inputs,
       std::function<bool(const FileMetaData*)> skip_marked_file);
 
+  // @param starting_l0_file If not null, restricts L0 file selection to only
+  //                         include files at or older than starting_l0_file.
   bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
                              CompactionInputFiles* start_level_inputs,
-                             int output_level, int* parent_index);
+                             int output_level, int* parent_index,
+                             const FileMetaData* starting_l0_file = nullptr);
 
   // Register this compaction in the set of running compactions
   void RegisterCompaction(Compaction* c);
@@ -266,23 +285,24 @@ class NullCompactionPicker : public CompactionPicker {
       const MutableDBOptions& /*mutable_db_options*/,
       const std::vector<SequenceNumber>& /*existing_snapshots*/,
       const SnapshotChecker* /*snapshot_checker*/,
-      VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */) override {
+      VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
+      const std::string& /*full_history_ts_low*/,
+      bool /*require_max_output_level*/) override {
     return nullptr;
   }
 
   // Always return "nullptr"
-  Compaction* CompactRange(const std::string& /*cf_name*/,
-                           const MutableCFOptions& /*mutable_cf_options*/,
-                           const MutableDBOptions& /*mutable_db_options*/,
-                           VersionStorageInfo* /*vstorage*/,
-                           int /*input_level*/, int /*output_level*/,
-                           const CompactRangeOptions& /*compact_range_options*/,
-                           const InternalKey* /*begin*/,
-                           const InternalKey* /*end*/,
-                           InternalKey** /*compaction_end*/,
-                           bool* /*manual_conflict*/,
-                           uint64_t /*max_file_num_to_ignore*/,
-                           const std::string& /*trim_ts*/) override {
+  Compaction* PickCompactionForCompactRange(
+      const std::string& /*cf_name*/,
+      const MutableCFOptions& /*mutable_cf_options*/,
+      const MutableDBOptions& /*mutable_db_options*/,
+      VersionStorageInfo* /*vstorage*/, int /*input_level*/,
+      int /*output_level*/,
+      const CompactRangeOptions& /*compact_range_options*/,
+      const InternalKey* /*begin*/, const InternalKey* /*end*/,
+      InternalKey** /*compaction_end*/, bool* /*manual_conflict*/,
+      uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/,
+      const std::string& /*full_history_ts_low*/) override {
     return nullptr;
   }
 
@@ -308,11 +328,10 @@ class NullCompactionPicker : public CompactionPicker {
 //                                        files. Cannot be nullptr.
 //
 // @return                                true iff compaction was found.
-bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
-                           size_t min_files_to_compact,
-                           uint64_t max_compact_bytes_per_del_file,
-                           uint64_t max_compaction_bytes,
-                           CompactionInputFiles* comp_inputs);
+bool PickCostBasedIntraL0Compaction(
+    const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+    uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+    CompactionInputFiles* comp_inputs);
 
 CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
                                    const MutableCFOptions& mutable_cf_options,
diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc
index d5c735194004..e13c333856d2 100644
--- a/db/compaction/compaction_picker_fifo.cc
+++ b/db/compaction/compaction_picker_fifo.cc
@@ -9,6 +9,7 @@
 
 #include "db/compaction/compaction_picker_fifo.h"
 
+#include <algorithm>
 #include <cinttypes>
 #include <string>
 #include <vector>
@@ -31,6 +32,29 @@ uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
   }
   return total_size;
 }
+
+// Compute effective data size and capacity limit for FIFO compaction.
+// When max_data_files_size > 0 (blob-aware mode), the effective size includes
+// both SST and blob file sizes, and the limit is max_data_files_size.
+// Otherwise, only SST sizes are used with max_table_files_size as the limit.
+void GetEffectiveSizeAndLimit(const CompactionOptionsFIFO& fifo_opts,
+                              uint64_t total_sst_size, uint64_t total_blob_size,
+                              uint64_t* effective_size,
+                              uint64_t* effective_max) {
+  *effective_size = total_sst_size;
+  *effective_max = fifo_opts.max_table_files_size;
+  if (fifo_opts.max_data_files_size > 0) {
+    *effective_size += total_blob_size;
+    *effective_max = fifo_opts.max_data_files_size;
+  }
+}
+
+// Return the effective capacity limit for FIFO compaction.
+// Convenience wrapper when only the limit is needed (e.g., PickTTLCompaction).
+uint64_t GetEffectiveMax(const CompactionOptionsFIFO& fifo_opts) {
+  return fifo_opts.max_data_files_size > 0 ? fifo_opts.max_data_files_size
+                                           : fifo_opts.max_table_files_size;
+}
 }  // anonymous namespace
 
 bool FIFOCompactionPicker::NeedsCompaction(
@@ -98,10 +122,43 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
   // Return a nullptr and proceed to size-based FIFO compaction if:
   // 1. there are no files older than ttl OR
   // 2. there are a few files older than ttl, but deleting them will not bring
-  //    the total size to be less than max_table_files_size threshold.
-  if (inputs[0].files.empty() ||
-      total_size >
-          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+  //    the total size to be less than the size threshold.
+  uint64_t effective_max =
+      GetEffectiveMax(mutable_cf_options.compaction_options_fifo);
+  // Estimate the effective remaining data after dropping TTL-expired SSTs.
+  // Each dropped SST also frees a proportional share of blob data.
+  //
+  // In multi-level FIFO (migration), we must use total SST across ALL levels
+  // as the reference, because total_blob covers all levels. Using only L0
+  // SST would inflate the blob estimate.
+  uint64_t effective_remaining = total_size;
+  if (mutable_cf_options.compaction_options_fifo.max_data_files_size > 0) {
+    uint64_t total_blob = vstorage->GetBlobStats().total_file_size;
+    // Compute total SST across all levels so the reference scope matches
+    // total_blob's scope (all levels).
+    uint64_t total_sst_all_levels = GetTotalFilesSize(level_files);
+    for (int level = 1; level < vstorage->num_levels(); ++level) {
+      total_sst_all_levels += GetTotalFilesSize(vstorage->LevelFiles(level));
+    }
+    // remaining_sst_all = total_sst_all - dropped_l0_sst
+    // total_size is the remaining L0 SST after removing expired files;
+    // original L0 SST minus remaining L0 SST = dropped.
+    uint64_t original_l0_sst = GetTotalFilesSize(level_files);
+    uint64_t dropped_sst = original_l0_sst - total_size;
+    uint64_t remaining_sst_all = total_sst_all_levels - dropped_sst;
+    // Proportional blob estimate: each SST byte "owns" a proportional
+    // share of blob bytes. Both reference sizes must come from the same
+    // scope (all levels) to avoid inflated estimates.
+    if (total_sst_all_levels > 0 && total_blob > 0) {
+      effective_remaining =
+          remaining_sst_all +
+          static_cast<uint64_t>(static_cast<double>(remaining_sst_all) /
+                                total_sst_all_levels * total_blob);
+    } else {
+      effective_remaining = remaining_sst_all;
+    }
+  }
+  if (inputs[0].files.empty() || effective_remaining > effective_max) {
     return nullptr;
   }
 
@@ -124,14 +181,11 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
   Compaction* c = new Compaction(
       vstorage, ioptions_, mutable_cf_options, mutable_db_options,
       std::move(inputs), 0, 0, 0, 0, kNoCompression,
-      mutable_cf_options.compression_opts,
-      mutable_cf_options.default_write_temperature,
+      mutable_cf_options.compression_opts, Temperature::kUnknown,
       /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
-      /* snapshot_checker */ nullptr,
-      /* is manual */ false,
+      /* snapshot_checker */ nullptr, CompactionReason::kFIFOTtl,
       /* trim_ts */ "", vstorage->CompactionScore(0),
-      /* is deletion compaction */ true, /* l0_files_might_overlap */ true,
-      CompactionReason::kFIFOTtl);
+      /* l0_files_might_overlap */ true);
   return c;
 }
 
@@ -154,7 +208,9 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
     LogBuffer* log_buffer) {
-  // compute the total size and identify the last non-empty level
+  const auto& fifo_opts = mutable_cf_options.compaction_options_fifo;
+
+  // compute the total SST size and identify the last non-empty level
   int last_level = 0;
   uint64_t total_size = 0;
   for (int level = 0; level < vstorage->num_levels(); ++level) {
@@ -167,54 +223,13 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
   const std::vector<FileMetaData*>& last_level_files =
       vstorage->LevelFiles(last_level);
 
-  if (last_level == 0 &&
-      total_size <=
-          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
-    // total size not exceeded, try to find intra level 0 compaction if enabled
-    const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0);
-    if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
-        level0_files.size() > 0) {
-      CompactionInputFiles comp_inputs;
-      // try to prevent same files from being compacted multiple times, which
-      // could produce large files that may never TTL-expire. Achieve this by
-      // disallowing compactions with files larger than memtable (inflate its
-      // size by 10% to account for uncompressed L0 files that may have size
-      // slightly greater than memtable size limit).
-      size_t max_compact_bytes_per_del_file =
-          static_cast<size_t>(MultiplyCheckOverflow(
-              static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
-              1.1));
-      if (FindIntraL0Compaction(
-              level0_files,
-              mutable_cf_options
-                  .level0_file_num_compaction_trigger /* min_files_to_compact */
-              ,
-              max_compact_bytes_per_del_file,
-              mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
-        Compaction* c = new Compaction(
-            vstorage, ioptions_, mutable_cf_options, mutable_db_options,
-            {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */,
-            0 /* max compaction bytes, not applicable */,
-            0 /* output path ID */, mutable_cf_options.compression,
-            mutable_cf_options.compression_opts,
-            mutable_cf_options.default_write_temperature,
-            0 /* max_subcompactions */, {},
-            /* earliest_snapshot */ std::nullopt,
-            /* snapshot_checker */ nullptr, /* is manual */ false,
-            /* trim_ts */ "", vstorage->CompactionScore(0),
-            /* is deletion compaction */ false,
-            /* l0_files_might_overlap */ true,
-            CompactionReason::kFIFOReduceNumFiles);
-        return c;
-      }
-    }
+  // Compute effective size and limit for comparison.
+  uint64_t effective_size, effective_max;
+  GetEffectiveSizeAndLimit(fifo_opts, total_size,
+                           vstorage->GetBlobStats().total_file_size,
+                           &effective_size, &effective_max);
 
-    ROCKS_LOG_BUFFER(
-        log_buffer,
-        "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
-        ", max size %" PRIu64 "\n",
-        cf_name.c_str(), total_size,
-        mutable_cf_options.compaction_options_fifo.max_table_files_size);
+  if (last_level == 0 && effective_size <= effective_max) {
     return nullptr;
   }
 
@@ -232,11 +247,29 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
   inputs[0].level = last_level;
 
   if (last_level == 0) {
+    // When using blob-aware sizing, use proportional estimation (same
+    // principle as EstimateTotalDataForSST): each SST "owns"
+    // effective_size / num_files of total data. This is an approximation
+    // — individual SSTs may reference different amounts of blob data,
+    // but uniform distribution is a reasonable estimate for FIFO dropping.
+    uint64_t remaining_size = effective_size;
+    const uint64_t num_files = last_level_files.size();
+    // Proportional estimate of data per file (SST + blob).
+    // Use max(1) to prevent stalling when effective_size < num_files.
+    const uint64_t data_per_file =
+        (fifo_opts.max_data_files_size > 0 && num_files > 0)
+            ? std::max(effective_size / num_files, uint64_t{1})
+            : 0;
+
     // In L0, right-most files are the oldest files.
     for (auto ritr = last_level_files.rbegin(); ritr != last_level_files.rend();
          ++ritr) {
       auto f = *ritr;
-      total_size -= f->fd.file_size;
+      if (fifo_opts.max_data_files_size > 0) {
+        remaining_size -= std::min(remaining_size, data_per_file);
+      } else {
+        remaining_size -= std::min(remaining_size, f->fd.file_size);
+      }
       inputs[0].files.push_back(f);
       char tmp_fsize[16];
       AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
@@ -244,13 +277,11 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
                        "[%s] FIFO compaction: picking file %" PRIu64
                        " with size %s for deletion",
                        cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
-      if (total_size <=
-          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+      if (remaining_size <= effective_max) {
         break;
       }
     }
-  } else if (total_size >
-             mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+  } else if (effective_size > effective_max) {
     // If the last level is non-L0, we actually don't know which file is
     // logically the oldest since the file creation time only represents
     // when this file was compacted to this level, which is independent
@@ -260,31 +291,36 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
     // file with the smallest key will be deleted first.  This design decision
     // better serves a major type of FIFO use cases where smaller keys are
     // associated with older data.
+    const uint64_t num_files = last_level_files.size();
+    // Proportional estimate of data per file (SST + blob), same as L0 path.
+    const uint64_t data_per_file =
+        (fifo_opts.max_data_files_size > 0 && num_files > 0)
+            ? std::max(effective_size / num_files, uint64_t{1})
+            : 0;
     for (const auto& f : last_level_files) {
-      total_size -= f->fd.file_size;
+      if (f->being_compacted) {
+        continue;
+      }
+      if (fifo_opts.max_data_files_size > 0) {
+        effective_size -= std::min(effective_size, data_per_file);
+      } else {
+        effective_size -= std::min(effective_size, f->fd.file_size);
+      }
       inputs[0].files.push_back(f);
       char tmp_fsize[16];
       AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
-      ROCKS_LOG_BUFFER(
-          log_buffer,
-          "[%s] FIFO compaction: picking file %" PRIu64
-          " with size %s for deletion under total size %" PRIu64
-          " vs max table files size %" PRIu64,
-          cf_name.c_str(), f->fd.GetNumber(), tmp_fsize, total_size,
-          mutable_cf_options.compaction_options_fifo.max_table_files_size);
-
-      if (total_size <=
-          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] FIFO compaction: picking file %" PRIu64
+                       " with size %s for deletion under total size %" PRIu64
+                       " vs max size %" PRIu64,
+                       cf_name.c_str(), f->fd.GetNumber(), tmp_fsize,
+                       effective_size, effective_max);
+
+      if (effective_size <= effective_max) {
         break;
       }
     }
   } else {
-    ROCKS_LOG_BUFFER(
-        log_buffer,
-        "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
-        ", max size %" PRIu64 "\n",
-        cf_name.c_str(), total_size,
-        mutable_cf_options.compaction_options_fifo.max_table_files_size);
     return nullptr;
   }
 
@@ -294,14 +330,11 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
       /* target_file_size */ 0,
       /* max_compaction_bytes */ 0,
       /* output_path_id */ 0, kNoCompression,
-      mutable_cf_options.compression_opts,
-      mutable_cf_options.default_write_temperature,
+      mutable_cf_options.compression_opts, Temperature::kUnknown,
       /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
-      /* snapshot_checker */ nullptr,
-      /* is manual */ false,
+      /* snapshot_checker */ nullptr, CompactionReason::kFIFOMaxSize,
       /* trim_ts */ "", vstorage->CompactionScore(0),
-      /* is deletion compaction */ true,
-      /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize);
+      /* l0_files_might_overlap */ true);
   return c;
 }
 
@@ -392,12 +425,14 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
       assert(compaction_target_temp == Temperature::kLastTemperature);
       compaction_target_temp = cur_target_temp;
       inputs[0].files.push_back(cur_file);
-      ROCKS_LOG_BUFFER(
-          log_buffer,
-          "[%s] FIFO compaction: picking file %" PRIu64
-          " with estimated newest key time %" PRIu64 " for temperature %s.",
-          cf_name.c_str(), cur_file->fd.GetNumber(), est_newest_key_time,
-          temperature_to_string[cur_target_temp].c_str());
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] FIFO compaction: picking file %" PRIu64
+                       " with estimated newest key time %" PRIu64
+                       " and temperature %s for temperature %s.",
+                       cf_name.c_str(), cur_file->fd.GetNumber(),
+                       est_newest_key_time,
+                       temperature_to_string[cur_file->temperature].c_str(),
+                       temperature_to_string[cur_target_temp].c_str());
       break;
     }
   }
@@ -416,19 +451,268 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
       mutable_cf_options.compression, mutable_cf_options.compression_opts,
       compaction_target_temp,
       /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
-      /* snapshot_checker */ nullptr,
-      /* is manual */ false, /* trim_ts */ "", vstorage->CompactionScore(0),
-      /* is deletion compaction */ false, /* l0_files_might_overlap */ true,
-      CompactionReason::kChangeTemperature);
+      /* snapshot_checker */ nullptr, CompactionReason::kChangeTemperature,
+      /* trim_ts */ "", vstorage->CompactionScore(0),
+      /* l0_files_might_overlap */ true);
   return c;
 }
 
+Compaction* FIFOCompactionPicker::PickIntraL0Compaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  const auto& fifo_opts = mutable_cf_options.compaction_options_fifo;
+
+  if (!fifo_opts.allow_compaction) {
+    return nullptr;
+  }
+
+  const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0);
+  if (level0_files.empty()) {
+    return nullptr;
+  }
+
+  if (fifo_opts.use_kv_ratio_compaction) {
+    return PickRatioBasedIntraL0Compaction(
+        cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer);
+  }
+
+  // Old intra-L0 path: merge small files using PickCostBasedIntraL0Compaction.
+  // Minimum files to compact follows level0_file_num_compaction_trigger.
+  // Try to prevent same files from being compacted multiple times, which
+  // could produce large files that may never TTL-expire. Achieve this by
+  // disallowing compactions with files larger than memtable (inflate its
+  // size by 10% to account for uncompressed L0 files that may have size
+  // slightly greater than memtable size limit).
+
+  CompactionInputFiles comp_inputs;
+  size_t max_compact_bytes_per_del_file =
+      static_cast<size_t>(MultiplyCheckOverflow(
+          static_cast<uint64_t>(mutable_cf_options.write_buffer_size), 1.1));
+  if (PickCostBasedIntraL0Compaction(
+          level0_files,
+          mutable_cf_options
+              .level0_file_num_compaction_trigger /* min_files_to_compact */,
+          max_compact_bytes_per_del_file,
+          mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
+    Compaction* c = new Compaction(
+        vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+        {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */,
+        0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
+        mutable_cf_options.compression, mutable_cf_options.compression_opts,
+        Temperature::kUnknown, 0 /* max_subcompactions */, {},
+        /* earliest_snapshot */ std::nullopt,
+        /* snapshot_checker */ nullptr, CompactionReason::kFIFOReduceNumFiles,
+        /* trim_ts */ "", vstorage->CompactionScore(0),
+        /* l0_files_might_overlap */ true);
+    return c;
+  }
+
+  return nullptr;
+}
+
+Compaction* FIFOCompactionPicker::PickRatioBasedIntraL0Compaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  const auto& fifo_opts = mutable_cf_options.compaction_options_fifo;
+  assert(fifo_opts.use_kv_ratio_compaction);
+  assert(fifo_opts.max_data_files_size > 0);
+
+  // During migration from level/universal compaction to FIFO, non-L0 levels
+  // may still contain files. The ratio-based algorithm only operates on L0,
+  // so skip it until PickSizeCompaction has drained all non-L0 levels.
+  // Once levels collapse to L0-only, this algorithm will kick in.
+  for (int level = 1; level < vstorage->num_levels(); ++level) {
+    if (!vstorage->LevelFiles(level).empty()) {
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] FIFO kv-ratio compaction: skipping — non-L0 "
+                       "level %d still has %" ROCKSDB_PRIszt
+                       " files (migration in progress)",
+                       cf_name.c_str(), level,
+                       vstorage->LevelFiles(level).size());
+      return nullptr;
+    }
+  }
+
+  if (!level0_compactions_in_progress_.empty()) {
+    return nullptr;
+  }
+
+  const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0);
+  if (mutable_cf_options.level0_file_num_compaction_trigger <= 1) {
+    // trigger <= 0 is invalid; trigger == 1 means compact after every flush,
+    // which doesn't make sense for tiered merging (the tier boundary loop
+    // divides by trigger, so trigger == 1 would cause an infinite loop).
+    return nullptr;
+  }
+  const size_t trigger = static_cast<size_t>(
+      mutable_cf_options.level0_file_num_compaction_trigger);
+  if (level0_files.size() < trigger) {
+    return nullptr;
+  }
+
+  // Determine the target compacted file size.
+  //
+  // When max_compaction_bytes > 0 (explicitly set by user), use it directly
+  // as the target. This allows users to override the auto-calculated value.
+  //
+  // When max_compaction_bytes == 0 (default), auto-calculate from the data
+  // capacity and observed SST/blob ratio:
+  //   target = max_data_files_size * sst_ratio / trigger
+  //
+  // This is recomputed on every PickCompaction call. The computation is
+  // trivial (sum file sizes + arithmetic) and PickCompaction is only called
+  // once per flush or compaction completion, so no caching is needed.
+  uint64_t target = 0;
+  if (mutable_cf_options.max_compaction_bytes > 0) {
+    // User explicitly set max_compaction_bytes — use it as target
+    target = mutable_cf_options.max_compaction_bytes;
+  } else {
+    // Auto-calculate from capacity and observed SST/blob ratio
+    uint64_t total_sst = GetTotalFilesSize(level0_files);
+    uint64_t total_blob = vstorage->GetBlobStats().total_file_size;
+    uint64_t total_data = total_sst + total_blob;
+
+    if (total_data == 0 || total_sst == 0) {
+      return nullptr;
+    }
+
+    // Compute sst_ratio (inverse of EstimateTotalDataForSST's proportion):
+    // when no blob files exist, sst_ratio is 1.0 and the target becomes
+    // max_data_files_size / trigger, which is large. The algorithm will
+    // naturally not find small enough files to compact.
+    double sst_ratio =
+        (total_blob > 0) ? static_cast<double>(total_sst) / total_data : 1.0;
+
+    uint64_t total_sst_at_cap =
+        static_cast<uint64_t>(fifo_opts.max_data_files_size * sst_ratio);
+    target = total_sst_at_cap / trigger;
+
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO ratio-based compaction: sst_ratio=%.4f, "
+                     "target_file_size=%" PRIu64,
+                     cf_name.c_str(), sst_ratio, target);
+  }
+  if (target == 0) {
+    return nullptr;
+  }
+
+  // Tiered size-based file selection.
+  //
+  // Tier boundaries form a geometric sequence descending from target:
+  //   ..., target/trigger^2, target/trigger, target
+  // For each boundary (smallest first), find contiguous L0 files with
+  // size < boundary. If their accumulated bytes >= boundary, merge them.
+  // The output (~boundary bytes) advances to the next tier. Files that
+  // reach target are "graduated" and never compacted again.
+  //
+  // Trade-off: write amplification vs L0 file count.
+  //
+  // Write amp: O(log(target/flush) / log(trigger)) per byte, instead of
+  //   O(target / (trigger * flush)) from flat merging. Each byte is
+  //   rewritten once per tier crossing.
+  //
+  // L0 file count: trigger + k * (trigger - 1) at steady state, where
+  //   k = ceil(log(target/flush) / log(trigger)). This is higher than
+  //   the original trigger target because intermediate tier files
+  //   accumulate while waiting for the next tier merge. The trade-off
+  //   is explicit: more L0 files in exchange for logarithmic (instead
+  //   of linear) write amplification.
+
+  // Build tier boundaries from smallest to largest.
+  // Stop at 10KB minimum — SST files of most workloads are larger than
+  // this, so lower boundaries would only waste CPU scanning L0 files.
+  // Files smaller than the lowest boundary simply merge at that boundary.
+  static constexpr uint64_t kMinTierBoundary = 10 * 1024;  // 10KB
+  std::vector<uint64_t> boundaries;
+  for (uint64_t b = target; b >= kMinTierBoundary; b /= trigger) {
+    boundaries.push_back(b);
+  }
+  if (boundaries.empty()) {
+    // target itself is below kMinTierBoundary — use target as the
+    // sole boundary so we can still compact at the target size.
+    boundaries.push_back(target);
+  }
+  std::reverse(boundaries.begin(), boundaries.end());
+
+  // For each tier boundary (smallest first), scan L0 for mergeable batches.
+  // L0 files are stored newest-first; oldest is at the end.
+  for (const uint64_t boundary : boundaries) {
+    for (size_t scan = level0_files.size(); scan > 0;) {
+      // Skip files >= boundary (they belong to higher tiers) or in-progress
+      if (level0_files[scan - 1]->fd.file_size >= boundary ||
+          level0_files[scan - 1]->being_compacted) {
+        --scan;
+        continue;
+      }
+
+      // Found a file < boundary — collect contiguous batch
+      std::vector<FileMetaData*> batch;
+      uint64_t accumulated = 0;
+      size_t pos = scan;
+      while (pos > 0 && level0_files[pos - 1]->fd.file_size < boundary &&
+             !level0_files[pos - 1]->being_compacted) {
+        // Don't let output exceed 2x boundary (prevent tier-skipping)
+        if (accumulated >= boundary &&
+            accumulated + level0_files[pos - 1]->fd.file_size > boundary * 2) {
+          break;
+        }
+        batch.push_back(level0_files[pos - 1]);
+        accumulated += level0_files[pos - 1]->fd.file_size;
+        --pos;
+      }
+
+      // Viable: >= 2 files and accumulated >= boundary
+      if (batch.size() >= 2 && accumulated >= boundary) {
+        CompactionInputFiles comp_inputs;
+        comp_inputs.level = 0;
+        comp_inputs.files = std::move(batch);
+
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] FIFO kv-ratio compaction: picking %" ROCKSDB_PRIszt
+            " files (%" PRIu64 " bytes) at tier boundary %" PRIu64
+            " for intra-L0 compaction, target=%" PRIu64,
+            cf_name.c_str(), comp_inputs.files.size(), accumulated, boundary,
+            target);
+
+        Compaction* c = new Compaction(
+            vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+            {comp_inputs}, 0, boundary /* output file size limit */,
+            0 /* max compaction bytes, not applicable */,
+            0 /* output path ID */, mutable_cf_options.compression,
+            mutable_cf_options.compression_opts, Temperature::kUnknown,
+            0 /* max_subcompactions */, {},
+            /* earliest_snapshot */ std::nullopt,
+            /* snapshot_checker */ nullptr,
+            CompactionReason::kFIFOReduceNumFiles,
+            /* trim_ts */ "", vstorage->CompactionScore(0),
+            /* l0_files_might_overlap */ true);
+        return c;
+      }
+
+      // This batch wasn't enough — advance past it
+      scan = pos;
+    }
+  }
+
+  return nullptr;
+}
+
+// The full_history_ts_low parameter is used to control bottommost file marking
+// for compaction when user-defined timestamps (UDT) are enabled.
+
+// TODO leverage full_history_ts_low for FIFO compaction, by trigggerring
+// compaction early for data that has already expired to achieve the goal of TTL
+// enforced compliance.
 Compaction* FIFOCompactionPicker::PickCompaction(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     const MutableDBOptions& mutable_db_options,
     const std::vector<SequenceNumber>& /* existing_snapshots */,
     const SnapshotChecker* /* snapshot_checker */, VersionStorageInfo* vstorage,
-    LogBuffer* log_buffer) {
+    LogBuffer* log_buffer, const std::string& /* full_history_ts_low */,
+    bool /* require_max_output_level*/) {
   Compaction* c = nullptr;
   if (mutable_cf_options.ttl > 0) {
     c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options,
@@ -438,22 +722,35 @@ Compaction* FIFOCompactionPicker::PickCompaction(
     c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options,
                            vstorage, log_buffer);
   }
+  // Intra-L0 compaction merges small files to reduce file count.
+  // It runs after size-based dropping: if PickSizeCompaction dropped files,
+  // it returned non-null and we skip this. Otherwise, we try to reduce
+  // L0 file count by merging small files together.
+  if (c == nullptr) {
+    c = PickIntraL0Compaction(cf_name, mutable_cf_options, mutable_db_options,
+                              vstorage, log_buffer);
+  }
   if (c == nullptr) {
     c = PickTemperatureChangeCompaction(
         cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer);
   }
+  if (c == nullptr) {
+    ROCKS_LOG_BUFFER(log_buffer, "[%s] FIFO compaction: no compaction picked",
+                     cf_name.c_str());
+  }
   RegisterCompaction(c);
   return c;
 }
 
-Compaction* FIFOCompactionPicker::CompactRange(
+Compaction* FIFOCompactionPicker::PickCompactionForCompactRange(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
     int input_level, int output_level,
     const CompactRangeOptions& /*compact_range_options*/,
     const InternalKey* /*begin*/, const InternalKey* /*end*/,
     InternalKey** compaction_end, bool* /*manual_conflict*/,
-    uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/) {
+    uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/,
+    const std::string& full_history_ts_low) {
 #ifdef NDEBUG
   (void)input_level;
   (void)output_level;
@@ -462,10 +759,10 @@ Compaction* FIFOCompactionPicker::CompactRange(
   assert(output_level == 0);
   *compaction_end = nullptr;
   LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger);
-  Compaction* c =
-      PickCompaction(cf_name, mutable_cf_options, mutable_db_options,
-                     /*existing_snapshots*/ {}, /*snapshot_checker*/ nullptr,
-                     vstorage, &log_buffer);
+  Compaction* c = PickCompaction(
+      cf_name, mutable_cf_options, mutable_db_options,
+      /*existing_snapshots*/ {}, /*snapshot_checker*/ nullptr, vstorage,
+      &log_buffer, full_history_ts_low, /* require_max_output_level */ false);
   log_buffer.FlushBufferToLog();
   return c;
 }
diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h
index 4dd1053e127b..2c1cd21321b9 100644
--- a/db/compaction/compaction_picker_fifo.h
+++ b/db/compaction/compaction_picker_fifo.h
@@ -23,18 +23,19 @@ class FIFOCompactionPicker : public CompactionPicker {
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& /* existing_snapshots */,
       const SnapshotChecker* /* snapshot_checker */,
-      VersionStorageInfo* version, LogBuffer* log_buffer) override;
-
-  Compaction* CompactRange(const std::string& cf_name,
-                           const MutableCFOptions& mutable_cf_options,
-                           const MutableDBOptions& mutable_db_options,
-                           VersionStorageInfo* vstorage, int input_level,
-                           int output_level,
-                           const CompactRangeOptions& compact_range_options,
-                           const InternalKey* begin, const InternalKey* end,
-                           InternalKey** compaction_end, bool* manual_conflict,
-                           uint64_t max_file_num_to_ignore,
-                           const std::string& trim_ts) override;
+      VersionStorageInfo* version, LogBuffer* log_buffer,
+      const std::string& /* full_history_ts_low */,
+      bool /* require_max_output_level*/ = false) override;
+
+  Compaction* PickCompactionForCompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      int input_level, int output_level,
+      const CompactRangeOptions& compact_range_options,
+      const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end, bool* manual_conflict,
+      uint64_t max_file_num_to_ignore, const std::string& trim_ts,
+      const std::string& full_history_ts_low) override;
 
   // The maximum allowed output level.  Always returns 0.
   int MaxOutputLevel() const override { return 0; }
@@ -54,6 +55,28 @@ class FIFOCompactionPicker : public CompactionPicker {
                                  VersionStorageInfo* version,
                                  LogBuffer* log_buffer);
 
+  // Intra-L0 compaction: merges small L0 files to reduce file count.
+  // Dispatches between two strategies based on configuration:
+  //   - use_kv_ratio_compaction = true: PickRatioBasedIntraL0Compaction
+  //   (BlobDB-optimized)
+  //   - use_kv_ratio_compaction = false: PickCostBasedIntraL0Compaction
+  //   (original)
+  // Only active when allow_compaction = true.
+  Compaction* PickIntraL0Compaction(const std::string& cf_name,
+                                    const MutableCFOptions& mutable_cf_options,
+                                    const MutableDBOptions& mutable_db_options,
+                                    VersionStorageInfo* vstorage,
+                                    LogBuffer* log_buffer);
+
+  // Capacity-derived intra-L0 compaction for BlobDB workloads.
+  // Uses the observed SST/blob ratio to compute a target file size,
+  // producing uniform files for predictable FIFO trimming.
+  // Called from PickIntraL0Compaction when use_kv_ratio_compaction = true.
+  Compaction* PickRatioBasedIntraL0Compaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      LogBuffer* log_buffer);
+
   // Will pick one file to compact at a time, starting from the oldest file.
   Compaction* PickTemperatureChangeCompaction(
       const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc
index 612c1e5af21a..ade42ce5e3e8 100644
--- a/db/compaction/compaction_picker_level.cc
+++ b/db/compaction/compaction_picker_level.cc
@@ -61,14 +61,16 @@ class LevelCompactionBuilder {
                          LogBuffer* log_buffer,
                          const MutableCFOptions& mutable_cf_options,
                          const ImmutableOptions& ioptions,
-                         const MutableDBOptions& mutable_db_options)
+                         const MutableDBOptions& mutable_db_options,
+                         const std::string& full_history_ts_low)
       : cf_name_(cf_name),
         vstorage_(vstorage),
         compaction_picker_(compaction_picker),
         log_buffer_(log_buffer),
         mutable_cf_options_(mutable_cf_options),
         ioptions_(ioptions),
-        mutable_db_options_(mutable_db_options) {}
+        mutable_db_options_(mutable_db_options),
+        full_history_ts_low_(full_history_ts_low) {}
 
   // Pick and return a compaction.
   Compaction* PickCompaction();
@@ -145,7 +147,6 @@ class LevelCompactionBuilder {
   int parent_index_ = -1;
   int base_index_ = -1;
   double start_level_score_ = 0;
-  bool is_manual_ = false;
   bool is_l0_trivial_move_ = false;
   CompactionInputFiles start_level_inputs_;
   std::vector<CompactionInputFiles> compaction_inputs_;
@@ -156,6 +157,7 @@ class LevelCompactionBuilder {
   const MutableCFOptions& mutable_cf_options_;
   const ImmutableOptions& ioptions_;
   const MutableDBOptions& mutable_db_options_;
+  const std::string& full_history_ts_low_;
   // Pick a path ID to place a newly generated file, with its level
   static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
                             const MutableCFOptions& mutable_cf_options,
@@ -414,9 +416,9 @@ void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
                                                     &tmp_start_level_inputs) ||
         compaction_picker_->FilesRangeOverlapWithCompaction(
             {tmp_start_level_inputs}, output_level_,
-            Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
-                                                 ioptions_, start_level_,
-                                                 output_level_))) {
+            Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
+                                              ioptions_, start_level_,
+                                              output_level_))) {
       // Constraint 1a
       tmp_start_level_inputs.clear();
       return;
@@ -490,9 +492,9 @@ bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
     // We need to disallow this from happening.
     if (compaction_picker_->FilesRangeOverlapWithCompaction(
             compaction_inputs_, output_level_,
-            Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
-                                                 ioptions_, start_level_,
-                                                 output_level_))) {
+            Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
+                                              ioptions_, start_level_,
+                                              output_level_))) {
       // This compaction output could potentially conflict with the output
       // of a currently running compaction, we cannot run it.
       return false;
@@ -558,12 +560,11 @@ Compaction* LevelCompactionBuilder::GetCompaction() {
       GetCompressionType(vstorage_, mutable_cf_options_, output_level_,
                          vstorage_->base_level()),
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
-      mutable_cf_options_.default_write_temperature,
+      Temperature::kUnknown,
       /* max_subcompactions */ 0, std::move(grandparents_),
       /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
-      is_manual_,
-      /* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
-      l0_files_might_overlap, compaction_reason_);
+      compaction_reason_,
+      /* trim_ts */ "", start_level_score_, l0_files_might_overlap);
 
   // If it's level 0 compaction, make sure we don't execute any other level 0
   // compactions in parallel
@@ -573,7 +574,8 @@ Compaction* LevelCompactionBuilder::GetCompaction() {
   // takes running compactions into account (by skipping files that are already
   // being compacted). Since we just changed compaction score, we recalculate it
   // here
-  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                    full_history_ts_low_);
   return c;
 }
 
@@ -846,9 +848,9 @@ bool LevelCompactionBuilder::PickFileToCompact() {
                                                     &start_level_inputs_) ||
         compaction_picker_->FilesRangeOverlapWithCompaction(
             {start_level_inputs_}, output_level_,
-            Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
-                                                 ioptions_, start_level_,
-                                                 output_level_))) {
+            Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
+                                              ioptions_, start_level_,
+                                              output_level_))) {
       // A locked (pending compaction) input-level file was pulled in due to
       // user-key overlap.
       start_level_inputs_.clear();
@@ -912,10 +914,10 @@ bool LevelCompactionBuilder::PickIntraL0Compaction() {
     // resort to L0->L0 compaction yet.
     return false;
   }
-  return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
-                               std::numeric_limits<uint64_t>::max(),
-                               mutable_cf_options_.max_compaction_bytes,
-                               &start_level_inputs_);
+  return PickCostBasedIntraL0Compaction(
+      level_files, kMinFilesForIntraL0Compaction,
+      std::numeric_limits<uint64_t>::max(),
+      mutable_cf_options_.max_compaction_bytes, &start_level_inputs_);
 }
 
 bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
@@ -978,10 +980,11 @@ Compaction* LevelCompactionPicker::PickCompaction(
     const MutableDBOptions& mutable_db_options,
     const std::vector<SequenceNumber>& /*existing_snapshots */,
     const SnapshotChecker* /*snapshot_checker*/, VersionStorageInfo* vstorage,
-    LogBuffer* log_buffer) {
+    LogBuffer* log_buffer, const std::string& full_history_ts_low,
+    bool /* require_max_output_level*/) {
   LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer,
                                  mutable_cf_options, ioptions_,
-                                 mutable_db_options);
+                                 mutable_db_options, full_history_ts_low);
   return builder.PickCompaction();
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h
index 9cb41dfb64f8..e86c821aa309 100644
--- a/db/compaction/compaction_picker_level.h
+++ b/db/compaction/compaction_picker_level.h
@@ -25,7 +25,9 @@ class LevelCompactionPicker : public CompactionPicker {
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& /* existing_snapshots */,
       const SnapshotChecker* /* snapshot_checker */,
-      VersionStorageInfo* vstorage, LogBuffer* log_buffer) override;
+      VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+      const std::string& full_history_ts_low,
+      bool /*require_max_output_level*/ = false) override;
 
   bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;
 };
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index f48195e29a0b..4dfa327ae162 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -7,6 +7,8 @@
 #include <string>
 #include <utility>
 
+#include "db/blob/blob_file_meta.h"
+#include "db/column_family.h"
 #include "db/compaction/compaction.h"
 #include "db/compaction/compaction_picker_fifo.h"
 #include "db/compaction/compaction_picker_level.h"
@@ -17,6 +19,7 @@
 #include "table/unique_id_impl.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
+#include "util/random.h"
 #include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -160,11 +163,19 @@ class CompactionPickerTestBase : public testing::Test {
         kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
         kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum,
         kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
-        true /* user_defined_timestamps_persisted */);
+        true /* user_defined_timestamps_persisted */, "" /* min timestamp */,
+        "" /* max timestamp */);
     f->compensated_file_size =
         (compensated_file_size != 0) ? compensated_file_size : file_size;
     // oldest_ancester_time is only used if newest_key_time is not available
     f->oldest_ancester_time = oldest_ancestor_time;
+    // Set min/max timestamps for UDT support
+    if (!ts_of_smallest.empty()) {
+      f->min_timestamp = ts_of_smallest.ToString();
+    }
+    if (!ts_of_largest.empty()) {
+      f->max_timestamp = ts_of_largest.ToString();
+    }
     TableProperties tp;
     tp.newest_key_time = newest_key_time;
     f->fd.table_reader = new mock::MockTableReader(mock::KVVector{}, tp);
@@ -195,6 +206,11 @@ class CompactionPickerTestBase : public testing::Test {
   }
 
   void UpdateVersionStorageInfo() {
+    UpdateVersionStorageInfoWithTsLow(/*full_history_ts_low=*/"");
+  }
+
+  void UpdateVersionStorageInfoWithTsLow(
+      const std::string& full_history_ts_low) {
     if (temp_vstorage_) {
       VersionBuilder builder(FileOptions(), &ioptions_, nullptr,
                              vstorage_.get(), nullptr);
@@ -202,10 +218,51 @@ class CompactionPickerTestBase : public testing::Test {
       vstorage_ = std::move(temp_vstorage_);
     }
     vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
-    vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+    vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                      full_history_ts_low);
     vstorage_->SetFinalized();
   }
 
+  void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_bytes,
+                   BlobFileMetaData::LinkedSsts linked_ssts = {}) {
+    auto shared_meta = SharedBlobFileMetaData::Create(
+        blob_file_number, /*total_blob_count=*/1, total_blob_bytes,
+        /*checksum_method=*/"", /*checksum_value=*/"");
+    auto meta =
+        BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts),
+                                 /*garbage_blob_count=*/0,
+                                 /*garbage_blob_bytes=*/0);
+    vstorage_->AddBlobFile(std::move(meta));
+  }
+
+  // Helper to set up FIFO ratio-based compaction options and version storage.
+  // Call before Add()/AddBlobFile(), then create FIFOCompactionPicker after.
+  void SetupFIFORatioBased(uint64_t max_table_files_size,
+                           uint64_t max_data_files_size, int trigger,
+                           bool allow_compaction = true,
+                           bool use_kv_ratio = true, int num_levels = 1) {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    NewVersionStorage(num_levels, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+        max_table_files_size;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size =
+        max_data_files_size;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction =
+        allow_compaction;
+    mutable_cf_options_.compaction_options_fifo.use_kv_ratio_compaction =
+        use_kv_ratio;
+    mutable_cf_options_.level0_file_num_compaction_trigger = trigger;
+  }
+
+  // Helper to finalize version storage and pick a FIFO compaction.
+  std::unique_ptr<Compaction> PickFIFOCompaction(FIFOCompactionPicker& picker) {
+    UpdateVersionStorageInfo();
+    return std::unique_ptr<Compaction>(picker.PickCompaction(
+        cf_name_, mutable_cf_options_, mutable_db_options_,
+        /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr,
+        vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
+  }
+
  private:
   Options CreateOptions(const Comparator* ucmp) const {
     Options opts;
@@ -242,6 +299,60 @@ class CompactionPickerU64TsTest : public CompactionPickerTestBase {
       : CompactionPickerTestBase(test::BytewiseComparatorWithU64TsWrapper()) {}
 
   ~CompactionPickerU64TsTest() override = default;
+
+ protected:
+  // Helper to create a U64 timestamp string from a uint64_t value
+  static std::string MakeU64Timestamp(uint64_t ts) {
+    std::string result;
+    PutFixed64(&result, ts);
+    return result;
+  }
+
+  // Helper to add a bottommost file with timestamps and setup version storage
+  // for testing bottommost file marking behavior
+  void SetupBottommostFileWithTimestamps(uint64_t min_ts, uint64_t max_ts,
+                                         uint64_t full_history_ts_low_val,
+                                         SequenceNumber oldest_snapshot_seqnum,
+                                         std::string* out_full_history_ts_low) {
+    std::string ts_small = MakeU64Timestamp(min_ts);
+    std::string ts_large = MakeU64Timestamp(max_ts);
+
+    Add(5, 1U, "100", "200", /*file_size=*/1000, /*path_id=*/0,
+        /*smallest_seq=*/10, /*largest_seq=*/40,
+        /*compensated_file_size=*/1000,
+        /*marked_for_compact=*/false, Temperature::kUnknown,
+        kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts_small, ts_large);
+
+    std::string full_history_ts_low = MakeU64Timestamp(full_history_ts_low_val);
+
+    UpdateVersionStorageInfoWithTsLow(full_history_ts_low);
+
+    vstorage_->UpdateOldestSnapshot(oldest_snapshot_seqnum,
+                                    /*allow_ingest_behind=*/false,
+                                    /*ucmp=*/ucmp_, full_history_ts_low);
+
+    if (out_full_history_ts_low) {
+      *out_full_history_ts_low = full_history_ts_low;
+    }
+  }
+
+  // Helper to add L0 files with timestamps for compaction trigger tests
+  void AddL0FilesWithTimestamps(uint64_t ts1_val, uint64_t ts2_val,
+                                uint64_t file_size = 1U) {
+    std::string ts1 = MakeU64Timestamp(ts1_val);
+    std::string ts2 = MakeU64Timestamp(ts2_val);
+
+    Add(0, 1U, "100", "200", file_size, /*path_id=*/0,
+        /*smallest_seq=*/100, /*largest_seq=*/100,
+        /*compensated_file_size=*/file_size,
+        /*marked_for_compact=*/false, Temperature::kUnknown,
+        kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts1, ts2);
+    Add(0, 2U, "150", "250", file_size, /*path_id=*/0,
+        /*smallest_seq=*/200, /*largest_seq=*/200,
+        /*compensated_file_size=*/file_size,
+        /*marked_for_compact=*/false, Temperature::kUnknown,
+        kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts1, ts2);
+  }
 };
 
 TEST_F(CompactionPickerTest, Empty) {
@@ -250,7 +361,7 @@ TEST_F(CompactionPickerTest, Empty) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() == nullptr);
 }
 
@@ -263,7 +374,7 @@ TEST_F(CompactionPickerTest, Single) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() == nullptr);
 }
 
@@ -278,7 +389,7 @@ TEST_F(CompactionPickerTest, Level0Trigger) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -293,7 +404,7 @@ TEST_F(CompactionPickerTest, Level1Trigger) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
@@ -313,7 +424,7 @@ TEST_F(CompactionPickerTest, Level1Trigger2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(2U, compaction->num_input_files(1));
@@ -346,7 +457,7 @@ TEST_F(CompactionPickerTest, LevelMaxScore) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
@@ -395,7 +506,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -421,7 +532,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -448,7 +559,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -479,7 +590,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -513,7 +624,7 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
@@ -544,41 +655,48 @@ TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
 }
 
 TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
-  const uint64_t kFileSize = 100000;
-  NewVersionStorage(3 /* num_levels */, kCompactionStyleUniversal);
-  ioptions_.allow_ingest_behind = true;
-  ioptions_.num_levels = 3;
-  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
-  UpdateVersionStorageInfo();
-  // must return false when there's no files.
-  ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
-            false);
+  for (bool cf_option : {false, true}) {
+    SCOPED_TRACE("cf_option = " + std::to_string(cf_option));
+    const uint64_t kFileSize = 100000;
+    NewVersionStorage(3 /* num_levels */, kCompactionStyleUniversal);
+    if (cf_option) {
+      ioptions_.cf_allow_ingest_behind = true;
+    } else {
+      ioptions_.allow_ingest_behind = true;
+    }
+    ioptions_.num_levels = 3;
+    UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+    UpdateVersionStorageInfo();
+    // must return false when there's no files.
+    ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+              false);
 
-  NewVersionStorage(3, kCompactionStyleUniversal);
+    NewVersionStorage(3, kCompactionStyleUniversal);
 
-  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
-  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
-  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
-  Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
-  Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
-  Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+    Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+    Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+    Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+    Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+    Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+    Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
 
-  UpdateVersionStorageInfo();
+    UpdateVersionStorageInfo();
 
-  std::unique_ptr<Compaction> compaction(
-      universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_,
-          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+    std::unique_ptr<Compaction> compaction(
+        universal_compaction_picker.PickCompaction(
+            cf_name_, mutable_cf_options_, mutable_db_options_,
+            /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
-  // output level should be the one above the bottom-most
-  ASSERT_EQ(1, compaction->output_level());
+    // output level should be the one above the bottom-most
+    ASSERT_EQ(1, compaction->output_level());
 
-  // input should not include the reserved level
-  const std::vector<CompactionInputFiles>* inputs = compaction->inputs();
-  for (const auto& compaction_input : *inputs) {
-    if (!compaction_input.empty()) {
-      ASSERT_LT(compaction_input.level, 2);
+    // input should not include the reserved level
+    const std::vector<CompactionInputFiles>* inputs = compaction->inputs();
+    for (const auto& compaction_input : *inputs) {
+      if (!compaction_input.empty()) {
+        ASSERT_LT(compaction_input.level, 2);
+      }
     }
   }
 }
@@ -613,7 +731,7 @@ TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(!compaction->is_trivial_move());
 }
@@ -641,7 +759,7 @@ TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction->is_trivial_move());
 }
@@ -671,7 +789,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
@@ -703,7 +821,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_FALSE(compaction);
 }
@@ -731,7 +849,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_FALSE(compaction);
 }
@@ -763,7 +881,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(!compaction ||
               compaction->start_level() != compaction->output_level());
 }
@@ -785,7 +903,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(0, compaction->start_level());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -811,7 +929,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->start_level());
   ASSERT_EQ(2U, compaction->num_input_files(0));
@@ -850,7 +968,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(3, compaction->start_level());
@@ -893,7 +1011,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(2, compaction->start_level());
@@ -936,7 +1054,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(2, compaction->start_level());
@@ -985,7 +1103,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(3, compaction->start_level());
@@ -1030,7 +1148,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(3, compaction->start_level());
@@ -1083,7 +1201,7 @@ TEST_F(CompactionPickerTest,
         universal_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kUniversalSizeAmplification);
@@ -1134,10 +1252,15 @@ TEST_F(CompactionPickerTest, FIFOToCold1) {
     fifo_options_.max_table_files_size = kMaxSize;
     fifo_options_.file_temperature_age_thresholds = {
         {Temperature::kCold, kColdThreshold}};
+    fifo_options_.allow_trivial_copy_when_change_temperature = true;
+    fifo_options_.trivial_copy_buffer_size = 16 * 1024 * 1024;
     mutable_cf_options_.compaction_options_fifo = fifo_options_;
     mutable_cf_options_.level0_file_num_compaction_trigger = 100;
     mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
-    FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+    auto copiedIOptions = ioptions_;
+    copiedIOptions.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_);
 
     int64_t current_time = 0;
     ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
@@ -1162,11 +1285,11 @@ TEST_F(CompactionPickerTest, FIFOToCold1) {
         fifo_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
-    ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
+    ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold);
     ASSERT_EQ(1U, compaction->num_input_files(0));
     ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
   }
@@ -1186,7 +1309,10 @@ TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) {
     mutable_cf_options_.compaction_options_fifo = fifo_options_;
     mutable_cf_options_.level0_file_num_compaction_trigger = 100;
     mutable_cf_options_.max_compaction_bytes = kFileSize * 9;
-    FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+    auto copiedIOptions = ioptions_;
+    copiedIOptions.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_);
 
     int64_t current_time = 0;
     ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
@@ -1228,12 +1354,12 @@ TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) {
         fifo_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
     // Compaction picker picks older files first and picks one file at a time.
-    ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
+    ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold);
     ASSERT_EQ(1U, compaction->num_input_files(0));
     ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
   }
@@ -1253,7 +1379,10 @@ TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) {
     mutable_cf_options_.compaction_options_fifo = fifo_options_;
     mutable_cf_options_.level0_file_num_compaction_trigger = 100;
     mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
-    FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+    auto copiedIOptions = ioptions_;
+    copiedIOptions.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_);
 
     int64_t current_time = 0;
     ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
@@ -1293,12 +1422,12 @@ TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) {
         fifo_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
     // Compaction picker picks older files first and picks one file at a time.
-    ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
+    ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold);
     ASSERT_EQ(1U, compaction->num_input_files(0));
     ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
   }
@@ -1318,7 +1447,10 @@ TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) {
     mutable_cf_options_.compaction_options_fifo = fifo_options_;
     mutable_cf_options_.level0_file_num_compaction_trigger = 100;
     mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
-    FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+    auto copiedIOptions = ioptions_;
+    copiedIOptions.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_);
 
     int64_t current_time = 0;
     ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
@@ -1358,11 +1490,11 @@ TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) {
         fifo_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
-    ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
+    ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold);
     ASSERT_EQ(1U, compaction->num_input_files(0));
     ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
   }
@@ -1385,7 +1517,10 @@ TEST_F(CompactionPickerTest, FIFOToHotAndWarm) {
     mutable_cf_options_.compaction_options_fifo = fifo_options_;
     mutable_cf_options_.level0_file_num_compaction_trigger = 100;
     mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
-    FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+    auto copiedIOptions = ioptions_;
+    copiedIOptions.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_);
 
     int64_t current_time = 0;
     ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
@@ -1435,17 +1570,40 @@ TEST_F(CompactionPickerTest, FIFOToHotAndWarm) {
         fifo_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
     // Compaction picker picks older files first and picks one file at a time.
-    ASSERT_EQ(compaction->output_temperature(), Temperature::kWarm);
+    ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kWarm);
     ASSERT_EQ(1U, compaction->num_input_files(0));
     ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
   }
 }
 
+TEST_F(CompactionPickerTest, CompactFilesOutputTemperature) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  auto file_number = 66U;
+  Add(0, file_number, "150", "200", 1000000000U);
+  UpdateVersionStorageInfo();
+
+  std::unordered_set<uint64_t> input{file_number};
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input, vstorage_.get(), CompactionOptions()));
+
+  auto compaction_options = CompactionOptions();
+  compaction_options.output_temperature_override = Temperature::kCold;
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompactionForCompactFiles(
+          compaction_options, input_files, 1, vstorage_.get(),
+          mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0));
+
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold);
+}
+
 TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
   NewVersionStorage(6, kCompactionStyleLevel);
   ioptions_.compaction_pri = kMinOverlappingRatio;
@@ -1469,7 +1627,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Pick file 8 because it overlaps with 0 files on level 3.
@@ -1503,7 +1661,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 7 because overlapping ratio is the biggest.
@@ -1532,7 +1690,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 8 because overlapping ratio is the biggest.
@@ -1561,7 +1719,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 6 because overlapping ratio is the biggest.
@@ -1598,7 +1756,7 @@ TEST_F(CompactionPickerTest, CompactionPriRoundRobin) {
         local_level_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     // Since the max bytes for level 2 is 120M, picking one file to compact
     // makes the post-compaction level size less than 120M, there is exactly one
@@ -1639,7 +1797,7 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin1) {
       local_level_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
 
   // The maximum compaction bytes is very large in this case so we can igore its
@@ -1683,7 +1841,7 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin2) {
       local_level_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
 
   // The maximum compaction bytes is only 2500 bytes now. Even though we are
@@ -1728,7 +1886,7 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin3) {
       local_level_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
 
   // Cannot pick more files since we reach the last file in level 2
@@ -1788,7 +1946,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlappingManyFiles) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 8 because overlapping ratio is the biggest.
@@ -1817,7 +1975,7 @@ TEST_F(CompactionPickerTest, ParentIndexResetBug) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 }
 
 // This test checks ExpandWhileOverlapping() by having overlapping user keys
@@ -1836,7 +1994,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_levels());
   ASSERT_EQ(2U, compaction->num_input_files(0));
@@ -1857,7 +2015,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(2U, compaction->num_input_files(0));
@@ -1886,7 +2044,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(5U, compaction->num_input_files(0));
@@ -1918,7 +2076,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -1943,7 +2101,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() == nullptr);
 }
 
@@ -1966,7 +2124,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -1988,7 +2146,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_GE(1U, compaction->num_input_files(0));
@@ -2018,7 +2176,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(3U, compaction->num_input_files(0));
@@ -2052,7 +2210,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(5U, compaction->num_input_files(0));
@@ -2094,7 +2252,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -2134,7 +2292,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -2242,7 +2400,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() == nullptr);
 }
 
@@ -2274,7 +2432,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
 }
 
@@ -2309,7 +2467,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
 }
 
@@ -2611,7 +2769,7 @@ TEST_F(CompactionPickerTest, CompactionLimitWhenAddFileFromInputLevel) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(4U, compaction->num_input_files(0));
@@ -2647,7 +2805,7 @@ TEST_F(CompactionPickerTest, HitCompactionLimitWhenAddFileFromInputLevel) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -2672,13 +2830,14 @@ TEST_F(CompactionPickerTest, CompactRangeMaxCompactionBytes) {
   bool manual_conflict = false;
   InternalKey manual_end;
   InternalKey* manual_end_ptr = &manual_end;
-  std::unique_ptr<Compaction> compaction(level_compaction_picker.CompactRange(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      /*input_level=*/1, /*output_level=*/2,
-      /*compact_range_options*/ {}, /*begin=*/nullptr, /*end=*/nullptr,
-      &manual_end_ptr, &manual_conflict,
-      /*max_file_num_to_ignore=*/std::numeric_limits<uint64_t>::max(),
-      /*trim_ts=*/""));
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompactionForCompactRange(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          /*input_level=*/1, /*output_level=*/2,
+          /*compact_range_options*/ {}, /*begin=*/nullptr, /*end=*/nullptr,
+          &manual_end_ptr, &manual_conflict,
+          /*max_file_num_to_ignore=*/std::numeric_limits<uint64_t>::max(),
+          /*trim_ts=*/"", /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_levels());
   ASSERT_EQ(2, compaction->output_level());
@@ -2707,7 +2866,7 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
 }
@@ -2733,7 +2892,7 @@ TEST_F(CompactionPickerTest, L0TrivialMove1) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(2, compaction->num_input_files(0));
@@ -2763,7 +2922,7 @@ TEST_F(CompactionPickerTest, L0TrivialMoveOneFile) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(1, compaction->num_input_files(0));
@@ -2790,7 +2949,7 @@ TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(4, compaction->num_input_files(0));
@@ -2819,7 +2978,7 @@ TEST_F(CompactionPickerTest, NonL0TrivialMoveExtendBothDirection) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(3, compaction->num_input_files(0));
@@ -2850,7 +3009,7 @@ TEST_F(CompactionPickerTest, L0TrivialMoveToEmptyLevel) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(1, compaction->num_input_files(0));
@@ -2879,7 +3038,7 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   // No trivial move, because partitioning is applied
   ASSERT_TRUE(!compaction->IsTrivialMove());
@@ -2903,7 +3062,7 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_FALSE(compaction->IsTrivialMove());
 }
@@ -2933,7 +3092,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles1) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -2968,7 +3127,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -3002,7 +3161,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles3) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -3029,7 +3188,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles4) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -3060,7 +3219,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles5) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -3095,7 +3254,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles6) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -3131,7 +3290,7 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -3142,7 +3301,7 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
   compaction.reset(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -3153,7 +3312,7 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
   compaction.reset(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() == nullptr);
   ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */));
 }
@@ -3180,7 +3339,7 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_levels());
   ASSERT_EQ(5U, compaction->num_input_files(0));
@@ -3212,7 +3371,7 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_levels());
   ASSERT_EQ(4U, compaction->num_input_files(0));
@@ -3262,7 +3421,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   // Validate that its a compaction to reduce sorted runs
@@ -3286,7 +3445,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_FALSE(compaction2);
 }
 
@@ -3317,7 +3476,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   // Validate that its a delete triggered compaction
@@ -3348,7 +3507,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_FALSE(compaction2);
 }
 
@@ -3390,7 +3549,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) {
         universal_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
     ASSERT_TRUE(compaction);
     // Validate that its a delete triggered compaction
@@ -3415,14 +3574,15 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) {
       ASSERT_EQ(1U, compaction->num_input_files(1));
     }
 
-    vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+    vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                      /*full_history_ts_low=*/"");
     // After recomputing the compaction score, only one marked file will remain
     random_index = 0;
     std::unique_ptr<Compaction> compaction2(
         universal_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_FALSE(compaction2);
     DeleteVersionStorage();
   }
@@ -3449,7 +3609,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   // Validate that its a delete triggered compaction
@@ -3487,7 +3647,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   // Validate that its a delete triggered compaction
@@ -3545,7 +3705,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   // Validate that its a delete triggered compaction
@@ -3579,7 +3739,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction2);
   ASSERT_EQ(3U, compaction->num_input_files(0));
   ASSERT_TRUE(file_map_[1].first->being_compacted);
@@ -3610,11 +3770,12 @@ TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
   bool manual_conflict = false;
   InternalKey* manual_end = nullptr;
   std::unique_ptr<Compaction> compaction(
-      universal_compaction_picker.CompactRange(
+      universal_compaction_picker.PickCompactionForCompactRange(
           cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
           ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(),
           nullptr, nullptr, &manual_end, &manual_conflict,
-          std::numeric_limits<uint64_t>::max(), ""));
+          std::numeric_limits<uint64_t>::max(), "",
+          /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
 
@@ -3659,7 +3820,7 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   // Make sure it's a size amp compaction and includes all files
   ASSERT_EQ(compaction->compaction_reason(),
@@ -3677,7 +3838,7 @@ TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) {
   const uint64_t kFileSize = 100000;
   const int kNumLevels = 7;
   const int kLastLevel = kNumLevels - 1;
-  const int kPenultimateLevel = kLastLevel - 1;
+  const int kProximalLevel = kLastLevel - 1;
 
   ioptions_.compaction_style = kCompactionStyleUniversal;
   mutable_cf_options_.preclude_last_level_data_seconds = 1000;
@@ -3696,20 +3857,20 @@ TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   // Internally, size amp compaction is evaluated before size ratio compaction.
   // Here to make sure it's size ratio compaction instead of size amp
   ASSERT_EQ(compaction->compaction_reason(),
             CompactionReason::kUniversalSizeRatio);
-  ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1);
+  ASSERT_EQ(compaction->output_level(), kProximalLevel - 1);
   ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
   ASSERT_EQ(compaction->input_levels(5)->num_files, 0);
   ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
 }
 
 TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) {
-  // Tiered compaction only support level_num > 2 (otherwise the penultimate
+  // Tiered compaction only support level_num > 2 (otherwise the proximal
   // level is going to be level 0, which may make thing more complicated), so
   // when there's only 2 level, still treating level 1 as the last level for
   // size amp compaction
@@ -3737,7 +3898,7 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   // size amp compaction is still triggered even preclude_last_level is set
   ASSERT_EQ(compaction->compaction_reason(),
@@ -3753,7 +3914,7 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) {
   const uint64_t kFileSize = 100000;
   const int kNumLevels = 7;
   const int kLastLevel = kNumLevels - 1;
-  const int kPenultimateLevel = kLastLevel - 1;
+  const int kProximalLevel = kLastLevel - 1;
 
   ioptions_.compaction_style = kCompactionStyleUniversal;
   mutable_cf_options_.preclude_last_level_data_seconds = 1000;
@@ -3772,13 +3933,13 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   // It's a Size Amp compaction, but doesn't include the last level file and
-  // output to the penultimate level.
+  // output to the proximal level.
   ASSERT_EQ(compaction->compaction_reason(),
             CompactionReason::kUniversalSizeAmplification);
-  ASSERT_EQ(compaction->output_level(), kPenultimateLevel);
+  ASSERT_EQ(compaction->output_level(), kProximalLevel);
   ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
   ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
   ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
@@ -3814,9 +3975,10 @@ TEST_F(CompactionPickerU64TsTest, Overlap) {
   std::vector<CompactionInputFiles> input_files;
   ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input, vstorage_.get(), CompactionOptions()));
-  std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
-      CompactionOptions(), input_files, level, vstorage_.get(),
-      mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0));
+  std::unique_ptr<Compaction> comp1(
+      level_compaction_picker.PickCompactionForCompactFiles(
+          CompactionOptions(), input_files, level, vstorage_.get(),
+          mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0));
 
   {
     // [600, ts=50000] to [600, ts=50000] is the range to check.
@@ -3884,7 +4046,7 @@ TEST_F(CompactionPickerU64TsTest, CannotTrivialMoveUniversal) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   assert(compaction);
   ASSERT_TRUE(!compaction->is_trivial_move());
 }
@@ -3925,9 +4087,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) {
   ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
-      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      level_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   input_set.clear();
   input_files.clear();
@@ -3940,7 +4103,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) {
   ASSERT_EQ(enable_per_key_placement_,
             level_compaction_picker.FilesRangeOverlapWithCompaction(
                 input_files, 6,
-                Compaction::EvaluatePenultimateLevel(
+                Compaction::EvaluateProximalLevel(
                     vstorage_.get(), mutable_cf_options_, ioptions_, 0, 6)));
 }
 
@@ -3971,9 +4134,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlap) {
   ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      level_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   input_set.clear();
   input_files.clear();
@@ -4013,9 +4177,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   input_set.clear();
   input_files.clear();
@@ -4028,7 +4193,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_EQ(enable_per_key_placement_,
             universal_compaction_picker.FilesRangeOverlapWithCompaction(
                 input_files, 6,
-                Compaction::EvaluatePenultimateLevel(
+                Compaction::EvaluateProximalLevel(
                     vstorage_.get(), mutable_cf_options_, ioptions_, 0, 6)));
 }
 
@@ -4060,9 +4225,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) {
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   input_set.clear();
   input_files.clear();
@@ -4076,9 +4242,9 @@ TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) {
                 input_files, 5, Compaction::kInvalidLevel));
 }
 
-TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) {
+TEST_P(PerKeyPlacementCompactionPickerTest, ProximalOverlapUniversal) {
   // This test is make sure the Tiered compaction would lock whole range of
-  // both output level and penultimate level
+  // both output level and proximal level
   if (enable_per_key_placement_) {
     mutable_cf_options_.preclude_last_level_data_seconds = 10000;
   }
@@ -4098,7 +4264,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) {
   UpdateVersionStorageInfo();
 
   // the existing compaction is the 1st L4 file + L6 file
-  // then compaction of the 2nd L4 file to L5 (penultimate level) is overlapped
+  // then compaction of the 2nd L4 file to L5 (proximal level) is overlapped
   // when the tiered compaction feature is on.
   CompactionOptions comp_options;
   std::unordered_set<uint64_t> input_set;
@@ -4108,9 +4274,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) {
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   input_set.clear();
   input_files.clear();
@@ -4159,9 +4326,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) {
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   // cannot compact file 41 if the preclude_last_level feature is on, otherwise
   // compact file 41 is okay.
@@ -4187,9 +4355,9 @@ TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) {
 }
 
 TEST_P(PerKeyPlacementCompactionPickerTest,
-       LastLevelOnlyFailPenultimateUniversal) {
+       LastLevelOnlyFailProximalUniversal) {
   // This is to test last_level only compaction still unable to do the
-  // penultimate level compaction if there's already a file in the penultimate
+  // proximal level compaction if there's already a file in the proximal
   // level.
   // This should rarely happen in universal compaction, as the non-empty L5
   // should be included in the compaction.
@@ -4217,14 +4385,15 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   ASSERT_TRUE(comp1);
-  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+  ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel);
 
-  // As comp1 cannot be output to the penultimate level, compacting file 40 to
+  // As comp1 cannot be output to the proximal level, compacting file 40 to
   // L5 is always safe.
   input_set.clear();
   input_files.clear();
@@ -4235,18 +4404,19 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
       input_files, 5, Compaction::kInvalidLevel));
 
-  std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp2(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
   ASSERT_TRUE(comp2);
-  ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+  ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetProximalLevel());
 }
 
 TEST_P(PerKeyPlacementCompactionPickerTest,
        LastLevelOnlyConflictWithOngoingUniversal) {
   // This is to test last_level only compaction still unable to do the
-  // penultimate level compaction if there's already an ongoing compaction to
-  // the penultimate level
+  // proximal level compaction if there's already an ongoing compaction to
+  // the proximal level
   if (enable_per_key_placement_) {
     mutable_cf_options_.preclude_last_level_data_seconds = 10000;
   }
@@ -4265,7 +4435,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   Add(6, 60U, "101", "351", 60000000U);
   UpdateVersionStorageInfo();
 
-  // create an ongoing compaction to L5 (penultimate level)
+  // create an ongoing compaction to L5 (proximal level)
   CompactionOptions comp_options;
   std::unordered_set<uint64_t> input_set;
   input_set.insert(40);
@@ -4273,12 +4443,13 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   ASSERT_TRUE(comp1);
-  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+  ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel);
 
   input_set.clear();
   input_files.clear();
@@ -4289,15 +4460,16 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_EQ(enable_per_key_placement_,
             universal_compaction_picker.FilesRangeOverlapWithCompaction(
                 input_files, 6,
-                Compaction::EvaluatePenultimateLevel(
+                Compaction::EvaluateProximalLevel(
                     vstorage_.get(), mutable_cf_options_, ioptions_, 6, 6)));
 
   if (!enable_per_key_placement_) {
-    std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
-        comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-        mutable_db_options_, 0));
+    std::unique_ptr<Compaction> comp2(
+        universal_compaction_picker.PickCompactionForCompactFiles(
+            comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+            mutable_db_options_, 0));
     ASSERT_TRUE(comp2);
-    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetProximalLevel());
   }
 }
 
@@ -4306,7 +4478,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   // This is similar to `LastLevelOnlyConflictWithOngoingUniversal`, the only
   // change is the ongoing compaction to L5 has no overlap with the last level
   // compaction, so it's safe to move data from the last level to the
-  // penultimate level.
+  // proximal level.
   if (enable_per_key_placement_) {
     mutable_cf_options_.preclude_last_level_data_seconds = 10000;
   }
@@ -4325,7 +4497,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   Add(6, 60U, "101", "351", 60000000U);
   UpdateVersionStorageInfo();
 
-  // create an ongoing compaction to L5 (penultimate level)
+  // create an ongoing compaction to L5 (proximal level)
   CompactionOptions comp_options;
   std::unordered_set<uint64_t> input_set;
   input_set.insert(42);
@@ -4333,12 +4505,13 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   ASSERT_TRUE(comp1);
-  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+  ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel);
 
   input_set.clear();
   input_files.clear();
@@ -4349,18 +4522,19 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   // always safe to move data up
   ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
       input_files, 6,
-      Compaction::EvaluatePenultimateLevel(vstorage_.get(), mutable_cf_options_,
-                                           ioptions_, 6, 6)));
+      Compaction::EvaluateProximalLevel(vstorage_.get(), mutable_cf_options_,
+                                        ioptions_, 6, 6)));
 
   // 2 compactions can be run in parallel
-  std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp2(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
   ASSERT_TRUE(comp2);
   if (enable_per_key_placement_) {
-    ASSERT_NE(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+    ASSERT_NE(Compaction::kInvalidLevel, comp2->GetProximalLevel());
   } else {
-    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetProximalLevel());
   }
 }
 
@@ -4417,7 +4591,7 @@ TEST_F(CompactionPickerTest,
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(num_levels - 2, compaction->start_level());
   ASSERT_EQ(num_levels - 1, compaction->output_level());
@@ -4428,7 +4602,7 @@ TEST_F(CompactionPickerTest,
       level_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(second_compaction);
   ASSERT_EQ(num_levels - 1, compaction->output_level());
   ASSERT_EQ(num_levels - 2, compaction->start_level());
@@ -4475,7 +4649,7 @@ TEST_F(CompactionPickerTest,
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(num_levels - 3, compaction->start_level());
   ASSERT_EQ(num_levels - 2, compaction->output_level());
@@ -4525,7 +4699,7 @@ TEST_F(CompactionPickerTest, IntraL0WhenL0IsSmall) {
     std::unique_ptr<Compaction> compaction(compaction_picker.PickCompaction(
         cf_name_, mutable_cf_options_, mutable_db_options_,
         /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-        vstorage_.get(), &log_buffer_));
+        vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
               compaction->compaction_reason());
@@ -4602,7 +4776,7 @@ TEST_F(CompactionPickerTest, UniversalMaxReadAmpLargeDB) {
           universal_compaction_picker.PickCompaction(
               cf_name_, mutable_cf_options_, mutable_db_options_,
               /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-              vstorage_.get(), &log_buffer_));
+              vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
       if (i == kMaxRuns) {
         // There are in total i + 1 > kMaxRuns sorted runs.
         // This triggers compaction ignoring size_ratio.
@@ -4650,11 +4824,1203 @@ TEST_F(CompactionPickerTest, UniversalMaxReadAmpSmallDB) {
         universal_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_EQ(nullptr, compaction);
   }
 }
 
+TEST_F(CompactionPickerTest, StandaloneRangeDeletionOnlyPicksOlderFiles) {
+  NewVersionStorage(6, kCompactionStyleUniversal);
+
+  // Create L0 files with overlapping ranges
+  // File 1: newest regular file (epoch 5), keys [100, 200]
+  Add(0, 1U, "100", "200", 1U, 0, 100, 100, 0, false, Temperature::kUnknown,
+      kUnknownOldestAncesterTime, kUnknownNewestKeyTime, Slice(), Slice(), 5);
+
+  // File 2: standalone range deletion (epoch 4), keys [150, 250]
+  // This file should be marked as having only range deletions
+  Add(0, 2U, "150", "250", 1U, 0, 200, 200, 0, true, Temperature::kUnknown,
+      kUnknownOldestAncesterTime, kUnknownNewestKeyTime, Slice(), Slice(), 4);
+
+  // Manually set file 2 as standalone range deletion
+  FileMetaData* range_del_file = file_map_[2U].first;
+  range_del_file->num_entries = 1;
+  range_del_file->num_range_deletions = 1;
+  ASSERT_TRUE(range_del_file->FileIsStandAloneRangeTombstone());
+
+  Add(4, 10U, "000", "400", 1U);
+  Add(5, 20U, "000", "400", 100);
+
+  UpdateVersionStorageInfo();
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  ASSERT_TRUE(universal_compaction_picker.NeedsCompaction(vstorage_.get()));
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
+
+  ASSERT_NE(nullptr, compaction);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  // First input level should be L0 with only the standalone range del file
+  // (file 2)
+  ASSERT_EQ(0, compaction->level(0));
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_TRUE(compaction->input(0, 0)->FileIsStandAloneRangeTombstone());
+
+  // Second input level should be L4 with file 10
+  ASSERT_EQ(4, compaction->level(1));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(10U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+// Tests for full_history_ts_low parameter in compaction picker.
+// The full_history_ts_low parameter is used to control bottommost file marking
+// for compaction when user-defined timestamps (UDT) are enabled.
+
+// Level compaction tests for full_history_ts_low:
+// These tests verify that bottommost files are correctly marked/unmarked
+// for compaction based on their max timestamp relative to full_history_ts_low.
+
+TEST_F(CompactionPickerU64TsTest,
+       BottommostNotMarkedWhenTimestampAboveFullHistoryTsLow) {
+  // Test that bottommost files are NOT marked for compaction when their
+  // max timestamp is >= full_history_ts_low. This prevents infinite
+  // compaction loops where timestamp could not be collapsed.
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  // File has max_ts = 1000, full_history_ts_low = 500
+  // Since 1000 >= 500, the file should NOT be marked for compaction.
+  SetupBottommostFileWithTimestamps(
+      /*min_ts=*/500, /*max_ts=*/1000, /*full_history_ts_low_val=*/500,
+      /*oldest_snapshot_seqnum=*/50, /*out_full_history_ts_low=*/nullptr);
+
+  // File's max_ts (1000) >= full_history_ts_low (500), so it should NOT
+  // be marked for bottommost compaction
+  ASSERT_TRUE(vstorage_->BottommostFilesMarkedForCompaction().empty());
+}
+
+TEST_F(CompactionPickerU64TsTest,
+       BottommostMarkedWhenTimestampBelowFullHistoryTsLow) {
+  // Test that bottommost files ARE marked for compaction when their
+  // max timestamp is < full_history_ts_low.
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  // File has max_ts = 100, full_history_ts_low = 500
+  // Since 100 < 500, the file SHOULD be marked for compaction.
+  SetupBottommostFileWithTimestamps(
+      /*min_ts=*/50, /*max_ts=*/100, /*full_history_ts_low_val=*/500,
+      /*oldest_snapshot_seqnum=*/50, /*out_full_history_ts_low=*/nullptr);
+
+  // File's max_ts (100) < full_history_ts_low (500), so it SHOULD be
+  // marked for bottommost compaction
+  ASSERT_EQ(1U, vstorage_->BottommostFilesMarkedForCompaction().size());
+  ASSERT_EQ(5, vstorage_->BottommostFilesMarkedForCompaction()[0].first);
+  ASSERT_EQ(1U, vstorage_->BottommostFilesMarkedForCompaction()[0]
+                    .second->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerU64TsTest,
+       BottommostNotMarkedWithEmptyFullHistoryTsLow) {
+  // Test that when full_history_ts_low is empty, files are still marked
+  // based on seqno condition (backward compatibility behavior).
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  std::string ts_small = MakeU64Timestamp(500);
+  std::string ts_large = MakeU64Timestamp(1000);
+
+  // Add a file at bottommost level with seqno < oldest_snapshot
+  Add(5, 1U, "100", "200", /*file_size=*/1000, /*path_id=*/0,
+      /*smallest_seq=*/10, /*largest_seq=*/40,
+      /*compensated_file_size=*/1000,
+      /*marked_for_compact=*/false, Temperature::kUnknown,
+      kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts_small, ts_large);
+
+  // Update version storage with empty full_history_ts_low
+  UpdateVersionStorageInfo();
+
+  // Update oldest snapshot with empty full_history_ts_low
+  vstorage_->UpdateOldestSnapshot(
+      /*oldest_snapshot_seqnum=*/50,
+      /*allow_ingest_behind=*/false,
+      /*ucmp=*/ucmp_,
+      /*full_history_ts_low=*/"");
+
+  // With empty full_history_ts_low and UDT enabled, the file should NOT be
+  // marked. When full_history_ts_low is empty, it means it was never set,
+  // effectively 0, which is smaller than any valid timestamp. Since the file's
+  // max_timestamp would be >= full_history_ts_low, it won't be marked.
+  ASSERT_EQ(0U, vstorage_->BottommostFilesMarkedForCompaction().size());
+}
+
+TEST_F(CompactionPickerU64TsTest, LevelPickCompactionWithFullHistoryTsLow) {
+  // Test that level compaction correctly passes full_history_ts_low
+  // and picks compaction appropriately
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+  AddL0FilesWithTimestamps(/*ts1_val=*/100, /*ts2_val=*/200);
+
+  UpdateVersionStorageInfo();
+
+  std::string full_history_ts_low = MakeU64Timestamp(150);
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr, vstorage_.get(),
+      &log_buffer_, full_history_ts_low, /*require_max_output_level=*/false));
+
+  // Compaction should be picked for L0 files
+  ASSERT_NE(nullptr, compaction);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(0, compaction->start_level());
+}
+
+TEST_F(CompactionPickerU64TsTest, UniversalPickCompactionWithFullHistoryTsLow) {
+  // Test that universal compaction correctly accepts full_history_ts_low
+  constexpr uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  AddL0FilesWithTimestamps(/*ts1_val=*/100, /*ts2_val=*/200, kFileSize);
+
+  UpdateVersionStorageInfo();
+
+  std::string full_history_ts_low = MakeU64Timestamp(150);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr,
+          vstorage_.get(), &log_buffer_, full_history_ts_low,
+          /*require_max_output_level=*/false));
+
+  // Universal compaction should be picked
+  ASSERT_NE(nullptr, compaction);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+}
+
+// ============================================================================
+// FIFO Ratio-Based Compaction Picker Unit Tests
+// Tests the actual FIFOCompactionPicker with use_kv_ratio_compaction option
+// (PickRatioBasedIntraL0Compaction path).
+// ============================================================================
+
+TEST_F(CompactionPickerTest, FIFORatioBasedCompactionFileCountThreshold) {
+  // Test three file count scenarios relative to trigger (= 4):
+  //   - fewer than trigger: no compaction
+  //   - exactly trigger: compaction fires
+  //   - more than trigger: compaction fires, picks >= 2 files
+
+  // Sub-test 1: fewer than trigger (3 files < trigger 4) -> no compaction
+  {
+    SetupFIFORatioBased(10 * 1024 * 1024, 1ULL * 1024 * 1024 * 1024, 4);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 64 * 1024);
+    Add(0, 3U, "300", "400", 64 * 1024);
+    AddBlobFile(100, 64ULL * 1024 * 1024);
+    AddBlobFile(101, 64ULL * 1024 * 1024);
+    AddBlobFile(102, 64ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_EQ(nullptr, compaction.get())
+        << "Should not compact when file count < trigger";
+  }
+
+  // Sub-test 2: exactly trigger (4 files = trigger 4) -> compaction fires
+  {
+    SetupFIFORatioBased(10 * 1024 * 1024, 1ULL * 1024 * 1024 * 1024, 4);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 32 * 1024);
+    Add(0, 3U, "300", "400", 48 * 1024);
+    Add(0, 4U, "400", "500", 96 * 1024);
+    // sst_ratio ~ 240KB/256MB ~ 0.001, target ~ 250KB
+    AddBlobFile(100, 64ULL * 1024 * 1024);
+    AddBlobFile(101, 64ULL * 1024 * 1024);
+    AddBlobFile(102, 64ULL * 1024 * 1024);
+    AddBlobFile(103, 64ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get())
+        << "Should compact when file count == trigger";
+    ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+              compaction->compaction_reason());
+    ASSERT_EQ(0, compaction->output_level());
+  }
+
+  // Sub-test 3: more than trigger (8 files > trigger 4) -> compaction fires
+  {
+    SetupFIFORatioBased(100 * 1024 * 1024, 500ULL * 1024 * 1024, 4);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    Add(0, 1U, "100", "199", 64 * 1024);
+    Add(0, 2U, "200", "299", 32 * 1024);
+    Add(0, 3U, "300", "399", 48 * 1024);
+    Add(0, 4U, "400", "499", 96 * 1024);
+    Add(0, 5U, "500", "599", 64 * 1024);
+    Add(0, 6U, "600", "699", 48 * 1024);
+    Add(0, 7U, "700", "799", 64 * 1024);
+    Add(0, 8U, "800", "899", 64 * 1024);
+    for (uint64_t i = 0; i < 8; i++) {
+      AddBlobFile(100 + i, 50ULL * 1024 * 1024);
+    }
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get())
+        << "Should compact when file count > trigger";
+    ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+              compaction->compaction_reason());
+    ASSERT_GE(compaction->num_input_files(0), 2);
+  }
+}
+
+TEST_F(CompactionPickerTest, FIFORatioBasedCompactionNoBlobsFallback) {
+  // When total_blob == 0, sst_ratio = 1.0 and target becomes huge
+  // (max_data_files_size / trigger). With the tiered algorithm, the tier
+  // boundaries descend from target, and the lowest boundary where files
+  // can accumulate will be found. The algorithm should still work
+  // correctly (not crash) and produce a compaction at a low tier boundary.
+  SetupFIFORatioBased(10 * 1024 * 1024, 10ULL * 1024 * 1024 * 1024, 4);
+  FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+  // Small SST files, no blob files
+  Add(0, 1U, "100", "200", 64 * 1024);
+  Add(0, 2U, "200", "300", 64 * 1024);
+  Add(0, 3U, "300", "400", 64 * 1024);
+  Add(0, 4U, "400", "500", 64 * 1024);
+
+  // No blob files added -- total_blob == 0
+
+  // With sst_ratio=1.0 and 10GB cap, target = 10GB/4 = 2.5GB.
+  // Tiered boundaries descend: 2.5GB, 625MB, ..., ~152KB, ~38KB, ...
+  // At boundary ~152KB, 4 files of 64KB accumulate to 256KB >= 152KB.
+  // The tiered algorithm finds a viable batch and compacts.
+  auto compaction = PickFIFOCompaction(picker);
+  ASSERT_NE(nullptr, compaction.get());
+  ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+            compaction->compaction_reason());
+  ASSERT_GE(compaction->num_input_files(0), 2);
+}
+
+TEST_F(CompactionPickerTest, FIFORatioBasedCompactionNoRecompaction) {
+  // When all files are at or above the target size (graduated),
+  // no re-compaction should happen. Files >= target are skipped at every
+  // tier boundary.
+  SetupFIFORatioBased(100 * 1024 * 1024, 500ULL * 1024 * 1024, 4);
+  FIFOCompactionPicker picker(ioptions_, &icmp_);
+  // Use max_compaction_bytes to set an explicit target of 256KB.
+  // Make all files >= 256KB so they are "graduated" (at or above target).
+  mutable_cf_options_.max_compaction_bytes = 256 * 1024;
+
+  // All files at 300KB, which is >= target (256KB) -> graduated
+  Add(0, 1U, "100", "199", 300 * 1024);
+  Add(0, 2U, "200", "299", 300 * 1024);
+  Add(0, 3U, "300", "399", 300 * 1024);
+  Add(0, 4U, "400", "499", 300 * 1024);
+
+  // All files are at/above target -> graduated -> no compaction.
+  auto compaction = PickFIFOCompaction(picker);
+  ASSERT_EQ(nullptr, compaction.get());
+}
+
+TEST_F(CompactionPickerTest,
+       FIFORatioBasedCompactionWithExplicitMaxCompactionBytes) {
+  // When max_compaction_bytes > 0, it overrides the auto-calculated target.
+  SetupFIFORatioBased(100 * 1024 * 1024, 10ULL * 1024 * 1024 * 1024, 4);
+  FIFOCompactionPicker picker(ioptions_, &icmp_);
+  // Explicitly set target to 256KB
+  mutable_cf_options_.max_compaction_bytes = 256 * 1024;
+
+  // 6 small SST files
+  Add(0, 1U, "100", "199", 64 * 1024);
+  Add(0, 2U, "200", "299", 64 * 1024);
+  Add(0, 3U, "300", "399", 64 * 1024);
+  Add(0, 4U, "400", "499", 64 * 1024);
+  Add(0, 5U, "500", "599", 64 * 1024);
+  Add(0, 6U, "600", "699", 64 * 1024);
+
+  // No blob files needed when max_compaction_bytes is explicitly set
+
+  // target = max_compaction_bytes = 256KB.
+  // Tier boundaries descend from 256KB: [25KB, 256KB] (trigger=4, floor=10KB).
+  // At boundary 25KB: each 64KB file >= 25KB -> skipped.
+  // At boundary 256KB: all 64KB files < 256KB -> accumulated until >= 256KB.
+  auto compaction = PickFIFOCompaction(picker);
+  ASSERT_NE(nullptr, compaction.get());
+  ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+            compaction->compaction_reason());
+  ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, FIFORatioBasedCompactionFallbackToOldPath) {
+  // When use_kv_ratio_compaction is false, PickIntraL0Compaction should
+  // fall through to the old PickCostBasedIntraL0Compaction path.
+
+  // Sub-test 1: allow_compaction = false -> no intra-L0 at all
+  {
+    SetupFIFORatioBased(10 * 1024 * 1024, 0, 4,
+                        /*allow_compaction=*/false, /*use_kv_ratio=*/false);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 64 * 1024);
+    Add(0, 3U, "300", "400", 64 * 1024);
+    Add(0, 4U, "400", "500", 64 * 1024);
+
+    // Total size (256KB) < max_table_files_size (10MB), so no deletion.
+    // allow_compaction=false, so no intra-L0 either.
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_EQ(nullptr, compaction.get());
+  }
+
+  // Sub-test 2: allow_compaction = true, use_kv_ratio = false
+  // -> falls through to old PickCostBasedIntraL0Compaction path
+  {
+    SetupFIFORatioBased(10 * 1024 * 1024, 0, 4,
+                        /*allow_compaction=*/true, /*use_kv_ratio=*/false);
+    // The old path uses max_compaction_bytes to cap total input size.
+    // In production this is sanitized to target_file_size_base * 25,
+    // but tests bypass sanitization, so set it explicitly.
+    mutable_cf_options_.max_compaction_bytes = 64 * 1024 * 1024;  // 64MB
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 64 * 1024);
+    Add(0, 3U, "300", "400", 64 * 1024);
+    Add(0, 4U, "400", "500", 64 * 1024);
+
+    // Total size (256KB) < max_table_files_size (10MB), so no deletion.
+    // allow_compaction=true and use_kv_ratio=false -> old path.
+    // 4 files >= trigger(4), per_del = 256KB/3 ~ 85KB < 1.1*WBS -> passes.
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get())
+        << "Old path should compact when allow_compaction=true";
+    ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+              compaction->compaction_reason());
+  }
+}
+
+// ============================================================================
+// FIFO Option Validation Tests
+// Tests that ColumnFamilyData::ValidateOptions rejects invalid configurations
+// for use_kv_ratio_compaction.
+// ============================================================================
+
+TEST_F(CompactionPickerTest, FIFOOptionValidation) {
+  auto validate = [](std::function<void(ColumnFamilyOptions&)> configure) {
+    ColumnFamilyOptions cf_opts;
+    cf_opts.compaction_style = kCompactionStyleFIFO;
+    cf_opts.compaction_options_fifo.allow_compaction = true;
+    cf_opts.compaction_options_fifo.use_kv_ratio_compaction = true;
+    cf_opts.compaction_options_fifo.max_data_files_size =
+        1ULL * 1024 * 1024 * 1024;
+    cf_opts.num_levels = 1;
+    configure(cf_opts);
+    return ColumnFamilyData::ValidateOptions(DBOptions(), cf_opts);
+  };
+
+  // use_kv_ratio_compaction requires FIFO compaction style
+  ASSERT_TRUE(validate([](auto& o) {
+                o.compaction_style = kCompactionStyleLevel;
+              }).IsInvalidArgument());
+
+  // use_kv_ratio_compaction requires allow_compaction
+  ASSERT_TRUE(validate([](auto& o) {
+                o.compaction_options_fifo.allow_compaction = false;
+              }).IsInvalidArgument());
+
+  // use_kv_ratio_compaction requires max_data_files_size > 0
+  ASSERT_TRUE(validate([](auto& o) {
+                o.compaction_options_fifo.max_data_files_size = 0;
+              }).IsInvalidArgument());
+
+  // Accepts multi-level (for migration from level/universal to FIFO)
+  ASSERT_OK(validate([](auto& o) { o.num_levels = 4; }));
+
+  // Accepts valid single-level config
+  ASSERT_OK(validate([](auto& /*o*/) {}));
+
+  // max_data_files_size < max_table_files_size is invalid when non-zero
+  ASSERT_TRUE(validate([](auto& o) {
+                o.compaction_options_fifo.use_kv_ratio_compaction = false;
+                o.compaction_options_fifo.max_data_files_size = 0;
+                o.compaction_options_fifo.max_table_files_size =
+                    1ULL * 1024 * 1024 * 1024;
+                o.compaction_options_fifo.max_data_files_size =
+                    500ULL * 1024 * 1024;
+              }).IsInvalidArgument());
+
+  // max_data_files_size == max_table_files_size is valid
+  ASSERT_OK(validate([](auto& o) {
+    o.compaction_options_fifo.use_kv_ratio_compaction = false;
+    o.compaction_options_fifo.max_data_files_size = 0;
+    o.compaction_options_fifo.max_table_files_size = 1ULL * 1024 * 1024 * 1024;
+    o.compaction_options_fifo.max_data_files_size = 1ULL * 1024 * 1024 * 1024;
+  }));
+}
+
+// ============================================================================
+// FIFO Ratio-Based Compaction: Multi-Level Migration Graceful Skip
+// Tests that PickRatioBasedIntraL0Compaction gracefully skips when non-L0
+// levels still contain files (e.g., during migration from level/universal
+// to FIFO), and resumes once all data has been drained to L0.
+// ============================================================================
+
+TEST_F(CompactionPickerTest, FIFORatioBasedMultiLevelMigration) {
+  // Sub-case 1: During migration (non-L0 levels have files).
+  // Ratio-based intra-L0 compaction should be skipped.
+  {
+    SetupFIFORatioBased(/*max_table_files_size=*/100 * 1024 * 1024,
+                        /*max_data_files_size=*/1ULL * 1024 * 1024 * 1024,
+                        /*trigger=*/4,
+                        /*allow_compaction=*/true,
+                        /*use_kv_ratio=*/true,
+                        /*num_levels=*/4);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 64 * 1024);
+    Add(0, 3U, "300", "400", 64 * 1024);
+    Add(0, 4U, "400", "500", 64 * 1024);
+    Add(0, 5U, "500", "600", 64 * 1024);
+    Add(2, 10U, "100", "600", 50 * 1024 * 1024);
+    AddBlobFile(100, 64ULL * 1024 * 1024);
+    AddBlobFile(101, 64ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    if (compaction != nullptr) {
+      if (compaction->compaction_reason() ==
+          CompactionReason::kFIFOReduceNumFiles) {
+        // Cost-based path is fine; verify it's not ratio-based.
+        ASSERT_EQ(16 * 1024 * 1024, compaction->max_output_file_size());
+      }
+    }
+  }
+
+  // Sub-case 2: After migration (only L0 has files).
+  // Ratio-based compaction should resume normally.
+  {
+    SetupFIFORatioBased(/*max_table_files_size=*/100 * 1024 * 1024,
+                        /*max_data_files_size=*/1ULL * 1024 * 1024 * 1024,
+                        /*trigger=*/4,
+                        /*allow_compaction=*/true,
+                        /*use_kv_ratio=*/true,
+                        /*num_levels=*/4);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 32 * 1024);
+    Add(0, 3U, "300", "400", 48 * 1024);
+    Add(0, 4U, "400", "500", 96 * 1024);
+    AddBlobFile(100, 64ULL * 1024 * 1024);
+    AddBlobFile(101, 64ULL * 1024 * 1024);
+    AddBlobFile(102, 64ULL * 1024 * 1024);
+    AddBlobFile(103, 64ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get())
+        << "Should compact when non-L0 levels are empty (migration complete)";
+    ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+              compaction->compaction_reason());
+    ASSERT_EQ(0, compaction->output_level());
+  }
+}
+
+// ============================================================================
+// FIFO TTL Compaction with Blob-Aware Estimation Tests
+// Tests that PickTTLCompaction correctly estimates remaining data (SST + blob)
+// in both single-level and multi-level FIFO configurations.
+// ============================================================================
+
+TEST_F(CompactionPickerTest, FIFOTTLBlobEstimationSingleLevel) {
+  // Single-level FIFO with TTL and max_data_files_size.
+  // After dropping expired L0 SSTs, the blob estimate should be proportional
+  // to the remaining SST fraction.
+  //
+  // Common setup: L0 = 4 files x 50KB = 200KB, files 3,4 expired.
+  // Remaining SST after drop = 100KB = 50%.
+
+  auto run = [&](uint64_t blob_total, uint64_t limit, bool expect_ttl_fires) {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size = limit;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size = limit;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction = true;
+    mutable_cf_options_.ttl = 3600;
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    uint64_t recent_time = static_cast<uint64_t>(time(nullptr));
+    Add(0, 1U, "100", "200", 50 * 1024, 0, 100, 100, 0, false,
+        Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time);
+    Add(0, 2U, "200", "300", 50 * 1024, 0, 100, 100, 0, false,
+        Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time);
+    Add(0, 3U, "300", "400", 50 * 1024, 0, 100, 100, 0, false,
+        Temperature::kUnknown, kUnknownOldestAncesterTime, 1);
+    Add(0, 4U, "400", "500", 50 * 1024, 0, 100, 100, 0, false,
+        Temperature::kUnknown, kUnknownOldestAncesterTime, 1);
+    if (blob_total > 0) {
+      AddBlobFile(100, blob_total / 2);
+      AddBlobFile(101, blob_total / 2);
+    }
+
+    auto compaction = PickFIFOCompaction(picker);
+    if (expect_ttl_fires) {
+      ASSERT_NE(nullptr, compaction.get())
+          << "TTL compaction should fire when remaining data < limit";
+      ASSERT_EQ(CompactionReason::kFIFOTtl, compaction->compaction_reason());
+      ASSERT_EQ(2U, compaction->num_input_files(0));
+    } else {
+      if (compaction != nullptr) {
+        ASSERT_NE(CompactionReason::kFIFOTtl, compaction->compaction_reason())
+            << "TTL should not fire when remaining data still exceeds limit";
+      }
+    }
+  };
+
+  // Sub-case 1: Under limit after drop.
+  //   blob=400KB, limit=500KB.
+  //   effective = 100KB + (100KB/200KB)*400KB = 300KB < 500KB -> fires.
+  run(400 * 1024, 500 * 1024, /*expect_ttl_fires=*/true);
+
+  // Sub-case 2: Over limit after drop.
+  //   blob=4MB, limit=100KB.
+  //   effective = 100KB + (100KB/200KB)*4MB ~ 2MB >> 100KB -> does NOT fire.
+  run(4ULL * 1024 * 1024, 100 * 1024, /*expect_ttl_fires=*/false);
+
+  // Sub-case 3: No blob files. Falls back to SST-only estimation.
+  //   blob=0, limit=150KB. remaining SST = 100KB < 150KB -> fires.
+  run(0, 150 * 1024, /*expect_ttl_fires=*/true);
+}
+
+TEST_F(CompactionPickerTest, FIFOTTLBlobEstimationMultiLevel) {
+  // Multi-level FIFO (migration) with TTL and max_data_files_size.
+  // This is the ritical bug fix scenario:
+  //   - L0 has some SSTs, L2 has legacy SSTs from migration
+  //   - Blob files cover ALL levels
+  //   - The estimation must use total SST across ALL levels (not just L0)
+  //     to avoid inflating the blob proportion.
+  //
+  // Setup:
+  //   L0: 4 files x 50KB = 200KB SST (files 3,4 expired)
+  //   L2: 1 file x 200KB SST (legacy migration data)
+  //   Total SST = 400KB
+  //   Blob: 800KB total
+  //   max_data_files_size = 1000KB
+  //   Remaining SST after TTL drop = 400KB - 100KB = 300KB
+  //
+  //   CORRECT (fixed): effective = 300KB + (300KB/400KB)*800KB = 300+600 =
+  //   900KB < 1000KB -> fires BUG (old):        effective = 100KB +
+  //   (100KB/200KB)*800KB = 100+400 = 500KB < 1000KB -> fires
+  //                     (coincidentally fires too, but with wrong estimate)
+  //
+  // To distinguish correct vs buggy behavior, use a limit that triggers the
+  // difference: set max_data_files_size = 850KB.
+  //   CORRECT: effective = 300KB + (300KB/400KB)*800KB = 900KB > 850KB -> does
+  //   NOT fire BUG:     effective = 100KB + (100KB/200KB)*800KB = 500KB < 850KB
+  //   -> fires (wrong!)
+  ioptions_.compaction_style = kCompactionStyleFIFO;
+  NewVersionStorage(4, kCompactionStyleFIFO);
+  mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+      850 * 1024;  // match max_data_files_size
+  mutable_cf_options_.compaction_options_fifo.max_data_files_size = 850 * 1024;
+  mutable_cf_options_.compaction_options_fifo.allow_compaction = true;
+  mutable_cf_options_.ttl = 3600;
+  FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+  uint64_t recent_time = static_cast<uint64_t>(time(nullptr));
+  // L0 files: 2 recent, 2 expired
+  Add(0, 1U, "100", "200", 50 * 1024, 0, 100, 100, 0, false,
+      Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time);
+  Add(0, 2U, "200", "300", 50 * 1024, 0, 100, 100, 0, false,
+      Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time);
+  Add(0, 3U, "300", "400", 50 * 1024, 0, 100, 100, 0, false,
+      Temperature::kUnknown, kUnknownOldestAncesterTime, 1);
+  Add(0, 4U, "400", "500", 50 * 1024, 0, 100, 100, 0, false,
+      Temperature::kUnknown, kUnknownOldestAncesterTime, 1);
+  // L2 legacy migration file
+  Add(2, 10U, "100", "600", 200 * 1024);
+  // Blob files (associated with ALL levels)
+  AddBlobFile(100, 400 * 1024);
+  AddBlobFile(101, 400 * 1024);
+
+  auto compaction = PickFIFOCompaction(picker);
+  // With correct all-levels estimation:
+  //   remaining_sst_all = 400KB - 100KB(dropped) = 300KB
+  //   effective = 300KB + (300KB/400KB)*800KB = 900KB > 850KB
+  //   -> TTL should NOT fire (falls through to size-based)
+  if (compaction != nullptr) {
+    ASSERT_NE(CompactionReason::kFIFOTtl, compaction->compaction_reason())
+        << "Multi-level FIFO: TTL should not fire when correct all-levels "
+           "blob estimation shows data still exceeds limit";
+  }
+}
+
+TEST_F(CompactionPickerTest, FIFOBlobAwareSizeDropping) {
+  // PickSizeCompaction with max_data_files_size should account for blob data.
+  //
+  // Sub-case 1: Single-level. SST = 200KB, blob = 500MB, limit = 200MB.
+  //   effective_size ~ 500MB >> 200MB -> drops from L0.
+  {
+    SetupFIFORatioBased(/*max_table=*/200ULL * 1024 * 1024,
+                        /*max_data=*/200ULL * 1024 * 1024,
+                        /*trigger=*/4,
+                        /*allow_compaction=*/true,
+                        /*use_kv_ratio=*/false);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    Add(0, 1U, "100", "199", 40 * 1024);
+    Add(0, 2U, "200", "299", 40 * 1024);
+    Add(0, 3U, "300", "399", 40 * 1024);
+    Add(0, 4U, "400", "499", 40 * 1024);
+    Add(0, 5U, "500", "599", 40 * 1024);
+    AddBlobFile(100, 100ULL * 1024 * 1024);
+    AddBlobFile(101, 100ULL * 1024 * 1024);
+    AddBlobFile(102, 100ULL * 1024 * 1024);
+    AddBlobFile(103, 100ULL * 1024 * 1024);
+    AddBlobFile(104, 100ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get());
+    ASSERT_EQ(CompactionReason::kFIFOMaxSize, compaction->compaction_reason());
+    ASSERT_GE(compaction->num_input_files(0), 1);
+  }
+
+  // Sub-case 2: Multi-level (migration). L0=100KB, L2=150KB, blob=500KB.
+  //   effective_size = 250KB + 500KB = 750KB > 400KB -> drops from L2.
+  {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    NewVersionStorage(4, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+        400 * 1024;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size =
+        400 * 1024;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction = true;
+    mutable_cf_options_.ttl = 0;
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    Add(0, 1U, "100", "200", 50 * 1024);
+    Add(0, 2U, "200", "300", 50 * 1024);
+    Add(2, 10U, "100", "300", 50 * 1024);
+    Add(2, 11U, "300", "500", 50 * 1024);
+    Add(2, 12U, "500", "700", 50 * 1024);
+    AddBlobFile(100, 250 * 1024);
+    AddBlobFile(101, 250 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get());
+    ASSERT_EQ(CompactionReason::kFIFOMaxSize, compaction->compaction_reason());
+    ASSERT_EQ(2, compaction->start_level());
+    ASSERT_GE(compaction->num_input_files(0), 1U);
+  }
+
+  // Sub-case 3: Under limit. SST = 256KB, blob = 200MB, limit = 1GB.
+  //   effective_size ~ 200MB < 1GB -> no dropping.
+  {
+    SetupFIFORatioBased(/*max_table=*/1ULL * 1024 * 1024 * 1024,
+                        /*max_data=*/1ULL * 1024 * 1024 * 1024,
+                        /*trigger=*/4,
+                        /*allow_compaction=*/true,
+                        /*use_kv_ratio=*/true);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    Add(0, 1U, "100", "199", 64 * 1024);
+    Add(0, 2U, "200", "299", 64 * 1024);
+    Add(0, 3U, "300", "399", 64 * 1024);
+    Add(0, 4U, "400", "499", 64 * 1024);
+    AddBlobFile(100, 50ULL * 1024 * 1024);
+    AddBlobFile(101, 50ULL * 1024 * 1024);
+    AddBlobFile(102, 50ULL * 1024 * 1024);
+    AddBlobFile(103, 50ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    if (compaction) {
+      ASSERT_NE(CompactionReason::kFIFOMaxSize,
+                compaction->compaction_reason());
+    }
+  }
+}
+
+// ============================================================================
+// FIFO Blob-Aware Score Computation Test
+// Tests that ComputeCompactionScore includes blob sizes when
+// max_data_files_size > 0.
+// ============================================================================
+
+TEST_F(CompactionPickerTest, FIFOBlobAwareScoreComputation) {
+  // Sub-case 1: With max_data_files_size, score includes blob sizes.
+  //   SST = 100KB, blob = 500MB, max_data = 200MB -> score ~ 2.5
+  {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+        200ULL * 1024 * 1024;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size =
+        200ULL * 1024 * 1024;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction = false;
+    mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+
+    Add(0, 1U, "100", "199", 25 * 1024);
+    Add(0, 2U, "200", "299", 25 * 1024);
+    Add(0, 3U, "300", "399", 25 * 1024);
+    Add(0, 4U, "400", "499", 25 * 1024);
+    AddBlobFile(100, 500ULL * 1024 * 1024);
+    UpdateVersionStorageInfo();
+
+    double score = vstorage_->CompactionScore(0);
+    ASSERT_GT(score, 2.0) << "Score should reflect 500MB/200MB ~ 2.5";
+  }
+
+  // Sub-case 2: Without max_data_files_size, score ignores blobs.
+  //   SST = 400KB < 1MB, blob = 500MB ignored -> score ~ 0.4
+  {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+        1ULL * 1024 * 1024;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size = 0;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction = false;
+    mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+
+    Add(0, 1U, "100", "199", 100 * 1024);
+    Add(0, 2U, "200", "299", 100 * 1024);
+    Add(0, 3U, "300", "399", 100 * 1024);
+    Add(0, 4U, "400", "499", 100 * 1024);
+    AddBlobFile(100, 500ULL * 1024 * 1024);
+    UpdateVersionStorageInfo();
+
+    double score = vstorage_->CompactionScore(0);
+    ASSERT_LT(score, 1.0)
+        << "Score should be < 1 when only SST sizes are counted";
+  }
+}
+
+// ============================================================================
+// FIFO + BlobDB Intra-L0 Compaction Picking Tests
+//
+// These tests validate the tiered intra-L0 compaction picking algorithm
+// over multiple flush/compaction cycles. Each round:
+//   1. Add a flush file to the L0 file list
+//   2. Rebuild VersionStorageInfo and call FIFOCompactionPicker::PickCompaction
+//   3. If compaction is picked, update the file list accordingly
+//   4. Repeat
+//
+// The compaction PICKING uses the real FIFOCompactionPicker -- this ensures
+// the tests always match the production picking logic. The rest of the
+// system (compaction execution, file metadata updates, FIFO dropping) is
+// handled by test helpers, since wiring up the full compaction execution
+// pipeline (CompactionJob, VersionEdit, etc.) would add significant
+// complexity without testing the picking logic more thoroughly.
+//
+// ============================================================================
+
+class FIFORatioBasedCompactionPickingTest : public CompactionPickerTest {
+ protected:
+  struct L0File {
+    uint64_t size;       // SST file size in bytes
+    uint64_t blob_size;  // Associated blob data size
+    uint64_t age;        // Creation order (lower = older)
+    bool is_compacted;   // Created by compaction (vs flush)
+  };
+
+  // Pick compaction using FIFOCompactionPicker.
+  //
+  // Rebuilds VersionStorageInfo from the files vector and calls
+  // PickCompaction on the given picker. Maps the returned
+  // Compaction's input files back to vector indices.
+  //
+  // Returns the picked indices, or empty if no compaction.
+  // Also returns the compaction reason via out-parameter.
+  std::vector<size_t> PickCompactionFromFiles(
+      FIFOCompactionPicker& picker, const std::vector<L0File>& files,
+      uint64_t max_table_files_size, uint64_t max_data_files_size, int trigger,
+      CompactionReason* out_reason = nullptr) {
+    // Rebuild VersionStorageInfo from the current file list
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+        max_table_files_size;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size =
+        max_data_files_size;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction = true;
+    mutable_cf_options_.compaction_options_fifo.use_kv_ratio_compaction = true;
+    mutable_cf_options_.level0_file_num_compaction_trigger = trigger;
+
+    // Add files: newest first. Use descending file numbers so L0 sort
+    // (newest-first by epoch/seqno/file_number) matches our order.
+    uint32_t base_fn = static_cast<uint32_t>(files.size());
+    for (size_t i = 0; i < files.size(); i++) {
+      uint32_t fn = base_fn - static_cast<uint32_t>(i);
+      std::string smallest = "k" + std::to_string(10000 + fn * 10);
+      std::string largest = "k" + std::to_string(10000 + fn * 10 + 9);
+      Add(0, fn, smallest.c_str(), largest.c_str(), files[i].size);
+    }
+
+    // Add one blob file with the total blob size
+    uint64_t total_blob = 0;
+    for (const auto& f : files) {
+      total_blob += f.blob_size;
+    }
+    if (total_blob > 0) {
+      AddBlobFile(9999, total_blob);
+    }
+
+    UpdateVersionStorageInfo();
+
+    std::unique_ptr<Compaction> compaction(picker.PickCompaction(
+        cf_name_, mutable_cf_options_, mutable_db_options_,
+        /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr,
+        vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
+
+    if (!compaction) return {};
+
+    if (out_reason) {
+      *out_reason = compaction->compaction_reason();
+    }
+
+    // For size-based dropping (kFIFOMaxSize / kFIFOTtl), map input files
+    // back to sim indices, same as for intra-L0.
+    std::vector<size_t> result;
+    for (size_t j = 0; j < compaction->num_input_files(0); j++) {
+      uint32_t fn =
+          static_cast<uint32_t>(compaction->input(0, j)->fd.GetNumber());
+      size_t idx = base_fn - fn;
+      result.push_back(idx);
+    }
+
+    // Unregister so the picker allows the next compaction
+    picker.UnregisterCompaction(compaction.get());
+
+    return result;
+  }
+
+  // Execute one compaction: merge input files into 1 output
+  void ExecuteCompaction(std::vector<L0File>& files,
+                         const std::vector<size_t>& input_indices,
+                         uint64_t& global_age) {
+    uint64_t output_size = 0;
+    uint64_t output_blob = 0;
+    for (size_t idx : input_indices) {
+      output_size += files[idx].size;
+      output_blob += files[idx].blob_size;
+    }
+
+    size_t oldest_input_pos = 0;
+    for (size_t idx : input_indices) {
+      oldest_input_pos = std::max(oldest_input_pos, idx);
+    }
+
+    std::vector<size_t> sorted_indices = input_indices;
+    std::sort(sorted_indices.rbegin(), sorted_indices.rend());
+    for (size_t idx : sorted_indices) {
+      files.erase(files.begin() + idx);
+    }
+
+    size_t insert_pos = oldest_input_pos;
+    for (size_t idx : sorted_indices) {
+      if (idx < oldest_input_pos) insert_pos--;
+    }
+    insert_pos = std::min(insert_pos, files.size());
+    files.insert(files.begin() + insert_pos,
+                 {output_size, output_blob, global_age++, true});
+  }
+
+  // Compute statistics about compacted file sizes
+  struct FileStats {
+    uint64_t count;
+    uint64_t min_size;
+    uint64_t max_size;
+    double mean_size;
+    double cv;
+  };
+
+  FileStats ComputeStats(const std::vector<L0File>& files,
+                         bool compacted_only) {
+    std::vector<uint64_t> sizes;
+    for (const auto& f : files) {
+      if (!compacted_only || f.is_compacted) {
+        sizes.push_back(f.size);
+      }
+    }
+    if (sizes.empty()) return {0, 0, 0, 0.0, 0.0};
+
+    uint64_t sum = 0;
+    uint64_t min_s = UINT64_MAX, max_s = 0;
+    for (uint64_t s : sizes) {
+      sum += s;
+      min_s = std::min(min_s, s);
+      max_s = std::max(max_s, s);
+    }
+    double mean = static_cast<double>(sum) / sizes.size();
+
+    double variance = 0;
+    for (uint64_t s : sizes) {
+      double diff = static_cast<double>(s) - mean;
+      variance += diff * diff;
+    }
+    variance /= sizes.size();
+    double stddev = std::sqrt(variance);
+    double cv = mean > 0 ? stddev / mean : 0;
+
+    return {sizes.size(), min_s, max_s, mean, cv};
+  }
+
+  // Track write amplification
+  struct WriteAmpTracker {
+    uint64_t bytes_flushed = 0;
+    uint64_t bytes_compacted = 0;
+
+    double sst_write_amp() const {
+      return bytes_flushed > 0
+                 ? static_cast<double>(bytes_flushed + bytes_compacted) /
+                       bytes_flushed
+                 : 1.0;
+    }
+  };
+
+  struct TestState {
+    std::vector<L0File> files;
+    uint64_t global_age = 0;
+    WriteAmpTracker wa;
+    int compaction_count = 0;
+    uint64_t max_file_count_seen = 0;
+  };
+
+  using FlushGenerator =
+      std::function<std::pair<uint64_t, uint64_t>(int round)>;
+
+  // Core test loop: flush -> pick -> execute -> repeat.
+  void RunFlushAndCompact(TestState& s, int num_rounds, int trigger,
+                          uint64_t max_data_files_size,
+                          const FlushGenerator& gen) {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    // Use max_data_files_size for both limits. When max_data_files_size > 0,
+    // it takes precedence and max_table_files_size is ignored, but keeping
+    // them consistent avoids contradictory configurations.
+    const uint64_t max_table_files_size = max_data_files_size;
+
+    for (int round = 0; round < num_rounds; round++) {
+      auto [sst_size, blob_size] = gen(round);
+      s.files.insert(s.files.begin(),
+                     {sst_size, blob_size, s.global_age++, false});
+      s.wa.bytes_flushed += sst_size;
+
+      // Pick compaction. Handle both dropping and intra-L0 results.
+      CompactionReason reason;
+      auto inputs =
+          PickCompactionFromFiles(picker, s.files, max_table_files_size,
+                                  max_data_files_size, trigger, &reason);
+      if (!inputs.empty()) {
+        if (reason == CompactionReason::kFIFOMaxSize ||
+            reason == CompactionReason::kFIFOTtl) {
+          // Size/TTL dropping: remove the picked files
+          std::vector<size_t> sorted = inputs;
+          std::sort(sorted.rbegin(), sorted.rend());
+          for (size_t idx : sorted) {
+            s.files.erase(s.files.begin() + idx);
+          }
+        } else {
+          // Intra-L0 compaction: merge picked files
+          uint64_t compaction_input = 0;
+          for (size_t idx : inputs) {
+            compaction_input += s.files[idx].size;
+          }
+          s.wa.bytes_compacted += compaction_input;
+          ExecuteCompaction(s.files, inputs, s.global_age);
+          s.compaction_count++;
+        }
+      }
+      s.max_file_count_seen = std::max(s.max_file_count_seen,
+                                       static_cast<uint64_t>(s.files.size()));
+    }
+  }
+
+  // Assertion helpers
+  void AssertFileCountBounded(const std::vector<L0File>& files,
+                              uint64_t max_count, uint64_t multiplier = 3) {
+    ASSERT_LE(files.size(), max_count * multiplier)
+        << "File count " << files.size() << " exceeds "
+        << max_count * multiplier;
+  }
+
+  void AssertCompactedUniform(const std::vector<L0File>& files, double max_cv) {
+    auto stats = ComputeStats(files, true);
+    if (stats.count >= 2) {
+      ASSERT_LE(stats.cv, max_cv)
+          << "Compacted CV=" << stats.cv << " exceeds " << max_cv
+          << " (min=" << stats.min_size << " max=" << stats.max_size
+          << " mean=" << stats.mean_size << " count=" << stats.count << ")";
+    }
+  }
+
+  void AssertLowWriteAmp(const WriteAmpTracker& wa, double max_wa = 3.0) {
+    ASSERT_LE(wa.sst_write_amp(), max_wa)
+        << "Write amp=" << wa.sst_write_amp() << " exceeds " << max_wa;
+  }
+
+  void AssertStandardGoals(const TestState& s, uint64_t max_count,
+                           double max_cv = 0.30, double max_wa = 3.0,
+                           uint64_t file_mult = 3) {
+    AssertFileCountBounded(s.files, max_count, file_mult);
+    AssertCompactedUniform(s.files, max_cv);
+    AssertLowWriteAmp(s.wa, max_wa);
+  }
+
+  // Verify that graduated files (>= target) are never picked for compaction.
+  void AssertGraduatedNotPicked(const std::vector<L0File>& files, int trigger,
+                                uint64_t max_data_files_size) {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    const uint64_t max_table_files_size = max_data_files_size;
+
+    CompactionReason reason;
+    auto inputs =
+        PickCompactionFromFiles(picker, files, max_table_files_size,
+                                max_data_files_size, trigger, &reason);
+    if (!inputs.empty() && reason == CompactionReason::kFIFOReduceNumFiles) {
+      // Compute target from the picker's perspective: we need to estimate
+      // it the same way the picker does.
+      uint64_t total_sst = 0, total_blob = 0;
+      for (const auto& f : files) {
+        total_sst += f.size;
+        total_blob += f.blob_size;
+      }
+      double sst_ratio = total_blob > 0 ? static_cast<double>(total_sst) /
+                                              (total_sst + total_blob)
+                                        : 1.0;
+      uint64_t target =
+          static_cast<uint64_t>(max_data_files_size * sst_ratio) / trigger;
+
+      for (size_t idx : inputs) {
+        ASSERT_LT(files[idx].size, target)
+            << "Should not re-compact graduated file at index " << idx
+            << " size=" << files[idx].size << " target=" << target;
+      }
+    }
+  }
+};
+
+// Variable flush + FIFO dropping -- the full scenario.
+// Variable SST sizes (32-128KB), variable blob sizes (32-96MB), with
+// FIFO size-based dropping active. This covers constant flush, variable
+// flush, and FIFO dropping behaviors in a single test.
+TEST_F(FIFORatioBasedCompactionPickingTest, VariableFlushWithFIFODropping) {
+  const uint64_t kCap = 500ULL * 1024 * 1024;
+  Random rng(42);
+  TestState s;
+  RunFlushAndCompact(s, 200, /*trigger=*/10, kCap, [&](int) {
+    return std::make_pair((32 + rng.Next() % 97) * 1024ULL,
+                          (32 + rng.Next() % 65) * 1024ULL * 1024);
+  });
+  AssertStandardGoals(s, 10, /*max_cv=*/0.40);
+}
+
+// Verify graduated files are never re-compacted.
+// With the tiered algorithm, intermediate compacted files CAN be merged
+// at higher tier boundaries (that's the whole point of tiering). But files
+// that have reached the target size ("graduated") should never be picked.
+TEST_F(FIFORatioBasedCompactionPickingTest, NoCascadingReCompaction) {
+  const uint64_t kCap = 10ULL * 1024 * 1024 * 1024;
+  TestState s;
+  RunFlushAndCompact(s, 200, /*trigger=*/10, kCap, [](int) {
+    return std::make_pair(64ULL * 1024, 64ULL * 1024 * 1024);
+  });
+
+  AssertGraduatedNotPicked(s.files, 10, kCap);
+  // Write amp should be bounded (k=2 tiers for this config, so wa <= 3+margin)
+  AssertLowWriteAmp(s.wa, 4.0);
+}
+
+// Early memtable flush -- very small flushes
+TEST_F(FIFORatioBasedCompactionPickingTest, EarlyMemtableFlush) {
+  const uint64_t kCap = 1ULL * 1024 * 1024 * 1024;
+  Random rng(123);
+  TestState s;
+  RunFlushAndCompact(s, 100, /*trigger=*/10, kCap, [&](int) {
+    uint64_t sst = (rng.Next() % 5 == 0) ? (64 + rng.Next() % 65) * 1024ULL
+                                         : (8 + rng.Next() % 25) * 1024ULL;
+    return std::make_pair(sst, 32ULL * 1024 * 1024);
+  });
+
+  AssertStandardGoals(s, 10, /*max_cv=*/0.50, /*max_wa=*/4.0,
+                      /*file_mult=*/5);
+}
+
+// Blob compression variation -- data per flush varies, shifting
+// the SST/blob ratio. The target is recomputed on every PickCompaction call
+// (no caching), so the picker naturally adapts to ratio changes.
+TEST_F(FIFORatioBasedCompactionPickingTest, BlobCompressionVariation) {
+  const uint64_t kCap = 300ULL * 1024 * 1024;
+  Random rng(456);
+  TestState s;
+  RunFlushAndCompact(s, 150, /*trigger=*/10, kCap, [&](int) {
+    return std::make_pair(64ULL * 1024,
+                          (20 + rng.Next() % 61) * 1024ULL * 1024);
+  });
+  AssertCompactedUniform(s.files, 0.30);
+}
+
+// Large target/flush ratio -- verify logarithmic write amp with tiering
+TEST_F(FIFORatioBasedCompactionPickingTest, TieredLargeRatio) {
+  // target/flush ~ 1000x with trigger=10 -> k=3 tiers, write amp ~ 4.
+  // Without tiering (flat merge), write amp would be ~57x.
+  const uint64_t kCap = 10ULL * 1024 * 1024 * 1024;  // 10GB
+  TestState s;
+  // SST = 1KB, blob = 1MB. sst_ratio ~ 0.001.
+  // target = 10GB * 0.001 / 10 = 1MB. ratio = 1MB/1KB = 1024.
+  // k = ceil(log_10(1024)) = 4. Tier boundaries: ~10KB, ~100KB, 1MB.
+  // (10KB floor means lowest boundary is 10KB, not 1KB)
+  RunFlushAndCompact(s, 500, /*trigger=*/10, kCap, [](int) {
+    return std::make_pair(1ULL * 1024, 1ULL * 1024 * 1024);
+  });
+
+  // Write amp should be logarithmic: k+1 = 4 (with 10KB floor, 3 tiers).
+  // Allow some margin for ramp-up and boundary effects.
+  AssertLowWriteAmp(s.wa, 6.0);
+
+  // File count should be bounded: trigger * (k+1) ~ 10 * 4 = 40
+  AssertFileCountBounded(s.files, 10, /*multiplier=*/6);
+}
+
+// Tiered progression -- verify intermediate tiers form and merge up
+TEST_F(FIFORatioBasedCompactionPickingTest, TieredProgression) {
+  // SST = 10KB, blob = 1MB, cap = 100MB, trigger=4.
+  // sst_ratio ~ 10KB/1010KB ~ 0.0099.
+  // target = 100MB * 0.0099 / 4 ~ 248KB. ratio ~ 25.
+  // k = ceil(log_4(25)) = ceil(2.32) = 3. Boundaries: ~16KB, ~62KB, ~248KB.
+  const uint64_t kCap = 100ULL * 1024 * 1024;
+  TestState s;
+  RunFlushAndCompact(s, 200, /*trigger=*/4, kCap, [](int) {
+    return std::make_pair(10ULL * 1024, 1ULL * 1024 * 1024);
+  });
+
+  // Should have compacted files at multiple tier sizes
+  auto stats = ComputeStats(s.files, true);
+  ASSERT_GE(stats.count, 1u) << "Should have at least one compacted file";
+
+  // Write amp should be bounded: k+1 = 4, plus margin
+  AssertLowWriteAmp(s.wa, 5.0);
+}
+
+// Graduated files should never be re-compacted
+TEST_F(FIFORatioBasedCompactionPickingTest, GraduatedFilesNotRecompacted) {
+  // Build a state with graduated files (>= target), then verify they are
+  // never selected for compaction.
+  const uint64_t kCap = 500ULL * 1024 * 1024;  // 500MB
+  TestState s;
+  // SST = 64KB, blob = 50MB. sst_ratio ~ 0.00125.
+  // target = 500MB * 0.00125 / 4 ~ 156KB.
+  // k = ceil(log_4(156/64)) = ceil(log_4(2.44)) = 1.
+  RunFlushAndCompact(s, 60, /*trigger=*/4, kCap, [](int) {
+    return std::make_pair(64ULL * 1024, 50ULL * 1024 * 1024);
+  });
+
+  AssertGraduatedNotPicked(s.files, 4, kCap);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc
index 427abb9eabc7..173e317a1006 100644
--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@@ -38,7 +38,8 @@ class UniversalCompactionBuilder {
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& existing_snapshots,
       const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
-      UniversalCompactionPicker* picker, LogBuffer* log_buffer)
+      UniversalCompactionPicker* picker, LogBuffer* log_buffer,
+      bool require_max_output_level, const std::string& full_history_ts_low)
       : ioptions_(ioptions),
         icmp_(icmp),
         cf_name_(cf_name),
@@ -46,7 +47,11 @@ class UniversalCompactionBuilder {
         mutable_db_options_(mutable_db_options),
         vstorage_(vstorage),
         picker_(picker),
-        log_buffer_(log_buffer) {
+        log_buffer_(log_buffer),
+        require_max_output_level_(require_max_output_level),
+        allow_ingest_behind_(ioptions.cf_allow_ingest_behind ||
+                             ioptions.allow_ingest_behind),
+        full_history_ts_low_(full_history_ts_low) {
     assert(icmp_);
     const auto* ucmp = icmp_->user_comparator();
     assert(ucmp);
@@ -102,6 +107,174 @@ class UniversalCompactionBuilder {
     bool level_has_marked_standalone_rangedel;
   };
 
+  unsigned int GetMaxNumFilesToCompactBasedOnMaxReadAmp(
+      const int file_num_compaction_trigger, const unsigned int ratio,
+      int* num_sr_not_compacted_output, int* max_num_runs_output) const {
+    assert(num_sr_not_compacted_output);
+    assert(max_num_runs_output);
+    int max_num_runs =
+        mutable_cf_options_.compaction_options_universal.max_read_amp;
+    if (max_num_runs < 0) {
+      // any value < -1 is not valid
+      assert(max_num_runs == -1);
+      // By default, fall back to `level0_file_num_compaction_trigger`
+      max_num_runs = file_num_compaction_trigger;
+    } else if (max_num_runs == 0) {
+      if (mutable_cf_options_.compaction_options_universal.stop_style ==
+          kCompactionStopStyleTotalSize) {
+        // 0 means auto-tuning by RocksDB. We estimate max num run based on
+        // max_run_size, size_ratio and write buffer size:
+        // Assume the size of the lowest level size is equal to
+        // write_buffer_size. Each subsequent level is the max size without
+        // triggering size_ratio compaction. `max_num_runs` is the minimum
+        // number of levels required such that the target size of the
+        // largest level is at least `max_run_size_`.
+        max_num_runs = 1;
+        double cur_level_max_size =
+            static_cast<double>(mutable_cf_options_.write_buffer_size);
+        double total_run_size = 0;
+        while (cur_level_max_size < static_cast<double>(max_run_size_)) {
+          // This loop should not take too many iterations since
+          // cur_level_max_size at least doubles each iteration.
+          total_run_size += cur_level_max_size;
+          cur_level_max_size = (100.0 + ratio) / 100.0 * total_run_size;
+          ++max_num_runs;
+        }
+      } else {
+        // TODO: implement the auto-tune logic for this stop style
+        max_num_runs = file_num_compaction_trigger;
+      }
+    } else {
+      // max_num_runs > 0, it's the limit on the number of sorted run
+    }
+
+    // Get the total number of sorted runs that are not being compacted
+    int num_sr_not_compacted = 0;
+    for (size_t i = 0; i < sorted_runs_.size(); i++) {
+      if (sorted_runs_[i].being_compacted == false &&
+          !sorted_runs_[i].level_has_marked_standalone_rangedel) {
+        num_sr_not_compacted++;
+      }
+    }
+
+    *num_sr_not_compacted_output = num_sr_not_compacted;
+    *max_num_runs_output = max_num_runs;
+
+    if (num_sr_not_compacted > max_num_runs) {
+      return num_sr_not_compacted - max_num_runs + 1;
+    } else {
+      return 0;
+    }
+  }
+
+  Compaction* MaybePickPeriodicCompaction(Compaction* const prev_picked_c) {
+    if (prev_picked_c != nullptr ||
+        vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+      return prev_picked_c;
+    }
+    // Always need to do a full compaction for periodic compaction.
+    Compaction* c = PickPeriodicCompaction();
+    TEST_SYNC_POINT_CALLBACK("PostPickPeriodicCompaction", c);
+    if (c != nullptr) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: picked for periodic compaction\n",
+                       cf_name_.c_str());
+    }
+    return c;
+  }
+
+  Compaction* MaybePickSizeAmpCompaction(Compaction* const prev_picked_c,
+                                         int file_num_compaction_trigger) {
+    if (prev_picked_c != nullptr ||
+        sorted_runs_.size() <
+            static_cast<size_t>(file_num_compaction_trigger)) {
+      return prev_picked_c;
+    }
+    Compaction* c = PickCompactionToReduceSizeAmp();
+    if (c != nullptr) {
+      TEST_SYNC_POINT("PickCompactionToReduceSizeAmpReturnNonnullptr");
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: picked for size amp compaction \n",
+                       cf_name_.c_str());
+    }
+    return c;
+  }
+
+  Compaction* MaybePickCompactionToReduceSortedRunsBasedFileRatio(
+      Compaction* const prev_picked_c, int file_num_compaction_trigger,
+      unsigned int ratio) {
+    if (prev_picked_c != nullptr ||
+        sorted_runs_.size() <
+            static_cast<size_t>(file_num_compaction_trigger)) {
+      return prev_picked_c;
+    }
+    Compaction* c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX);
+    if (c != nullptr) {
+      TEST_SYNC_POINT("PickCompactionToReduceSortedRunsReturnNonnullptr");
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: picked for size ratio compaction to "
+                       "reduce sorted run\n",
+                       cf_name_.c_str());
+    }
+    return c;
+  }
+
+  Compaction* MaybePickCompactionToReduceSortedRuns(
+      Compaction* const prev_picked_c, int file_num_compaction_trigger,
+      unsigned int ratio) {
+    if (prev_picked_c != nullptr ||
+        sorted_runs_.size() <
+            static_cast<size_t>(file_num_compaction_trigger)) {
+      return prev_picked_c;
+    }
+
+    int num_sr_not_compacted = 0;
+    int max_num_runs = 0;
+    const unsigned int max_num_files_to_compact =
+        GetMaxNumFilesToCompactBasedOnMaxReadAmp(file_num_compaction_trigger,
+                                                 ratio, &num_sr_not_compacted,
+                                                 &max_num_runs);
+    if (max_num_files_to_compact == 0) {
+      ROCKS_LOG_BUFFER(
+          log_buffer_,
+          "[%s] Universal: skipping compaction to reduce sorted run, num "
+          "sorted runs not "
+          "being compacted -- %u, max num runs allowed -- %d, max_run_size "
+          "-- %" PRIu64 "\n",
+          cf_name_.c_str(), num_sr_not_compacted, max_num_runs, max_run_size_);
+      return nullptr;
+    }
+
+    Compaction* c =
+        PickCompactionToReduceSortedRuns(UINT_MAX, max_num_files_to_compact);
+    if (c != nullptr) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: picked for sorted run num compaction "
+                       "to reduce sorted run, to "
+                       "compact file num -- %u, max num runs allowed"
+                       "-- %d, max_run_size -- %" PRIu64 "\n",
+                       cf_name_.c_str(), max_num_files_to_compact, max_num_runs,
+                       max_run_size_);
+    }
+    return c;
+  }
+
+  Compaction* MaybePickDeleteTriggeredCompaction(
+      Compaction* const prev_picked_c) {
+    if (prev_picked_c != nullptr) {
+      return prev_picked_c;
+    }
+    Compaction* c = PickDeleteTriggeredCompaction();
+    if (c != nullptr) {
+      TEST_SYNC_POINT("PickDeleteTriggeredCompactionReturnNonnullptr");
+      ROCKS_LOG_BUFFER(
+          log_buffer_,
+          "[%s] Universal: picked for delete triggered compaction\n",
+          cf_name_.c_str());
+    }
+    return c;
+  }
+
   // Pick Universal compaction to limit read amplification
   Compaction* PickCompactionToReduceSortedRuns(
       unsigned int ratio, unsigned int max_number_of_files_to_compact);
@@ -249,6 +422,12 @@ class UniversalCompactionBuilder {
     return num_l0_to_exclude;
   }
 
+  bool MeetsOutputLevelRequirements(int output_level) const {
+    return !require_max_output_level_ ||
+           Compaction::OutputToNonZeroMaxOutputLevel(
+               output_level, vstorage_->MaxOutputLevel(allow_ingest_behind_));
+  }
+
   const ImmutableOptions& ioptions_;
   const InternalKeyComparator* icmp_;
   double score_;
@@ -270,6 +449,9 @@ class UniversalCompactionBuilder {
   // marked for compaction. This is only populated when snapshot info is
   // populated.
   std::map<uint64_t, size_t> file_marked_for_compaction_to_sorted_run_index_;
+  bool require_max_output_level_;
+  bool allow_ingest_behind_;
+  const std::string& full_history_ts_low_;
 
   std::vector<UniversalCompactionBuilder::SortedRun> CalculateSortedRuns(
       const VersionStorageInfo& vstorage, int last_level,
@@ -288,7 +470,9 @@ class UniversalCompactionBuilder {
 // and the index of the file in that level
 
 struct InputFileInfo {
-  InputFileInfo() : f(nullptr), level(0), index(0) {}
+  InputFileInfo() : InputFileInfo(nullptr, 0, 0) {}
+  InputFileInfo(FileMetaData* file_meta, size_t l, size_t i)
+      : f(file_meta), level(l), index(i) {}
 
   FileMetaData* f;
   size_t level;
@@ -321,22 +505,14 @@ SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
   SmallestKeyHeap smallest_key_priority_q =
       SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
 
-  InputFileInfo input_file;
-
   for (size_t l = 0; l < c->num_input_levels(); l++) {
     if (c->num_input_files(l) != 0) {
       if (l == 0 && c->start_level() == 0) {
         for (size_t i = 0; i < c->num_input_files(0); i++) {
-          input_file.f = c->input(0, i);
-          input_file.level = 0;
-          input_file.index = i;
-          smallest_key_priority_q.push(std::move(input_file));
+          smallest_key_priority_q.emplace(c->input(0, i), 0, i);
         }
       } else {
-        input_file.f = c->input(l, 0);
-        input_file.level = l;
-        input_file.index = 0;
-        smallest_key_priority_q.push(std::move(input_file));
+        smallest_key_priority_q.emplace(c->input(l, 0), l, 0);
       }
     }
   }
@@ -374,7 +550,7 @@ bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
   auto comparator = icmp_->user_comparator();
   int first_iter = 1;
 
-  InputFileInfo prev, curr, next;
+  InputFileInfo prev, curr;
 
   SmallestKeyHeap smallest_key_priority_q =
       create_level_heap(c, icmp_->user_comparator());
@@ -397,17 +573,10 @@ bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
       prev = curr;
     }
 
-    next.f = nullptr;
-
     if (c->level(curr.level) != 0 &&
         curr.index < c->num_input_files(curr.level) - 1) {
-      next.f = c->input(curr.level, curr.index + 1);
-      next.level = curr.level;
-      next.index = curr.index + 1;
-    }
-
-    if (next.f) {
-      smallest_key_priority_q.push(std::move(next));
+      smallest_key_priority_q.emplace(c->input(curr.level, curr.index + 1),
+                                      curr.level, curr.index + 1);
     }
   }
   return true;
@@ -428,15 +597,20 @@ bool UniversalCompactionPicker::NeedsCompaction(
   return false;
 }
 
+// TODO leverage full_history_ts_low in universal compaction picking. It could
+// help reduce the same infinite compaction loop issue found in level
+// compaction.
 Compaction* UniversalCompactionPicker::PickCompaction(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     const MutableDBOptions& mutable_db_options,
     const std::vector<SequenceNumber>& existing_snapshots,
     const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
-    LogBuffer* log_buffer) {
+    LogBuffer* log_buffer, const std::string& full_history_ts_low,
+    bool require_max_output_level) {
   UniversalCompactionBuilder builder(
       ioptions_, icmp_, cf_name, mutable_cf_options, mutable_db_options,
-      existing_snapshots, snapshot_checker, vstorage, this, log_buffer);
+      existing_snapshots, snapshot_checker, vstorage, this, log_buffer,
+      require_max_output_level, full_history_ts_low);
   return builder.PickCompaction();
 }
 
@@ -567,13 +741,20 @@ bool UniversalCompactionBuilder::ShouldSkipMarkedFile(
 Compaction* UniversalCompactionBuilder::PickCompaction() {
   const int kLevel0 = 0;
   score_ = vstorage_->CompactionScore(kLevel0);
-  int max_output_level =
-      vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+  const int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_);
+  const int file_num_compaction_trigger =
+      mutable_cf_options_.level0_file_num_compaction_trigger;
+  const unsigned int ratio =
+      mutable_cf_options_.compaction_options_universal.size_ratio;
+
+  if (max_output_level == 0 &&
+      !MeetsOutputLevelRequirements(0 /* output_level */)) {
+    return nullptr;
+  }
+
   max_run_size_ = 0;
   sorted_runs_ =
       CalculateSortedRuns(*vstorage_, max_output_level, &max_run_size_);
-  int file_num_compaction_trigger =
-      mutable_cf_options_.level0_file_num_compaction_trigger;
 
   if (sorted_runs_.size() == 0 ||
       (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
@@ -585,6 +766,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
         "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
     return nullptr;
   }
+
   VersionStorageInfo::LevelSummaryStorage tmp;
   ROCKS_LOG_BUFFER_MAX_SZ(
       log_buffer_, 3072,
@@ -592,127 +774,22 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
       cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
 
   Compaction* c = nullptr;
-  // Periodic compaction has higher priority than other type of compaction
-  // because it's a hard requirement.
-  if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
-    // Always need to do a full compaction for periodic compaction.
-    c = PickPeriodicCompaction();
-    TEST_SYNC_POINT_CALLBACK("PostPickPeriodicCompaction", c);
-  }
-
-  if (c == nullptr &&
-      sorted_runs_.size() >= static_cast<size_t>(file_num_compaction_trigger)) {
-    // Check for size amplification.
-    if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
-      TEST_SYNC_POINT("PickCompactionToReduceSizeAmpReturnNonnullptr");
-      ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
-                       cf_name_.c_str());
-    } else {
-      // Size amplification is within limits. Try reducing read
-      // amplification while maintaining file size ratios.
-      unsigned int ratio =
-          mutable_cf_options_.compaction_options_universal.size_ratio;
-
-      if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
-        TEST_SYNC_POINT("PickCompactionToReduceSortedRunsReturnNonnullptr");
-        ROCKS_LOG_BUFFER(log_buffer_,
-                         "[%s] Universal: compacting for size ratio\n",
-                         cf_name_.c_str());
-      } else {
-        // Size amplification and file size ratios are within configured limits.
-        // If max read amplification exceeds configured limits, then force
-        // compaction to reduce the number sorted runs without looking at file
-        // size ratios.
-
-        // This is guaranteed by NeedsCompaction()
-        assert(sorted_runs_.size() >=
-               static_cast<size_t>(file_num_compaction_trigger));
-        int max_num_runs =
-            mutable_cf_options_.compaction_options_universal.max_read_amp;
-        if (max_num_runs < 0) {
-          // any value < -1 is not valid
-          assert(max_num_runs == -1);
-          // By default, fall back to `level0_file_num_compaction_trigger`
-          max_num_runs = file_num_compaction_trigger;
-        } else if (max_num_runs == 0) {
-          if (mutable_cf_options_.compaction_options_universal.stop_style ==
-              kCompactionStopStyleTotalSize) {
-            // 0 means auto-tuning by RocksDB. We estimate max num run based on
-            // max_run_size, size_ratio and write buffer size:
-            // Assume the size of the lowest level size is equal to
-            // write_buffer_size. Each subsequent level is the max size without
-            // triggering size_ratio compaction. `max_num_runs` is the minimum
-            // number of levels required such that the target size of the
-            // largest level is at least `max_run_size_`.
-            max_num_runs = 1;
-            double cur_level_max_size =
-                static_cast<double>(mutable_cf_options_.write_buffer_size);
-            double total_run_size = 0;
-            while (cur_level_max_size < static_cast<double>(max_run_size_)) {
-              // This loop should not take too many iterations since
-              // cur_level_max_size at least doubles each iteration.
-              total_run_size += cur_level_max_size;
-              cur_level_max_size = (100.0 + ratio) / 100.0 * total_run_size;
-              ++max_num_runs;
-            }
-          } else {
-            // TODO: implement the auto-tune logic for this stop style
-            max_num_runs = file_num_compaction_trigger;
-          }
-        } else {
-          // max_num_runs > 0, it's the limit on the number of sorted run
-        }
-        // Get the total number of sorted runs that are not being compacted
-        int num_sr_not_compacted = 0;
-        for (size_t i = 0; i < sorted_runs_.size(); i++) {
-          if (sorted_runs_[i].being_compacted == false &&
-              !sorted_runs_[i].level_has_marked_standalone_rangedel) {
-            num_sr_not_compacted++;
-          }
-        }
 
-        // The number of sorted runs that are not being compacted is greater
-        // than the maximum allowed number of sorted runs
-        if (num_sr_not_compacted > max_num_runs) {
-          unsigned int num_files = num_sr_not_compacted - max_num_runs + 1;
-          if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
-              nullptr) {
-            ROCKS_LOG_BUFFER(log_buffer_,
-                             "[%s] Universal: compacting for file num, to "
-                             "compact file num -- %u, max num runs allowed"
-                             "-- %d, max_run_size -- %" PRIu64 "\n",
-                             cf_name_.c_str(), num_files, max_num_runs,
-                             max_run_size_);
-          }
-        } else {
-          ROCKS_LOG_BUFFER(
-              log_buffer_,
-              "[%s] Universal: skipping compaction for file num, num runs not "
-              "being compacted -- %u, max num runs allowed -- %d, max_run_size "
-              "-- %" PRIu64 "\n",
-              cf_name_.c_str(), num_sr_not_compacted, max_num_runs,
-              max_run_size_);
-        }
-      }
-    }
-  }
-
-  if (c == nullptr) {
-    if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
-      TEST_SYNC_POINT("PickDeleteTriggeredCompactionReturnNonnullptr");
-      ROCKS_LOG_BUFFER(log_buffer_,
-                       "[%s] Universal: delete triggered compaction\n",
-                       cf_name_.c_str());
-    }
-  }
+  c = MaybePickPeriodicCompaction(c);
+  c = MaybePickSizeAmpCompaction(c, file_num_compaction_trigger);
+  c = MaybePickCompactionToReduceSortedRunsBasedFileRatio(
+      c, file_num_compaction_trigger, ratio);
+  c = MaybePickCompactionToReduceSortedRuns(c, file_num_compaction_trigger,
+                                            ratio);
+  c = MaybePickDeleteTriggeredCompaction(c);
 
   if (c == nullptr) {
     TEST_SYNC_POINT_CALLBACK(
         "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
     return nullptr;
   }
-  assert(c->output_level() <=
-         vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind));
+  assert(c->output_level() <= vstorage_->MaxOutputLevel(allow_ingest_behind_));
+  assert(MeetsOutputLevelRequirements(c->output_level()));
 
   if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
           true &&
@@ -754,7 +831,8 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
   RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, num_files);
 
   picker_->RegisterCompaction(c);
-  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                    full_history_ts_low_);
 
   TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
                            c);
@@ -838,14 +916,16 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
       if (sr->being_compacted) {
         ROCKS_LOG_BUFFER(log_buffer_,
                          "[%s] Universal: %s"
-                         "[%d] being compacted, skipping",
+                         "[%d] being compacted, skipping for compaction to "
+                         "reduce sorted runs",
                          cf_name_.c_str(), file_num_buf, loop);
       } else if (sr->level_has_marked_standalone_rangedel) {
-        ROCKS_LOG_BUFFER(log_buffer_,
-                         "[%s] Universal: %s"
-                         "[%d] has standalone range tombstone files marked for "
-                         "compaction, skipping",
-                         cf_name_.c_str(), file_num_buf, loop);
+        ROCKS_LOG_BUFFER(
+            log_buffer_,
+            "[%s] Universal: %s"
+            "[%d] has standalone range tombstone files marked for "
+            "compaction, skipping for compaction to reduce sorted runs",
+            cf_name_.c_str(), file_num_buf, loop);
       }
 
       sr = nullptr;
@@ -858,7 +938,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
       char file_num_buf[kFormatFileNumberBufSize];
       sr->Dump(file_num_buf, sizeof(file_num_buf), true);
       ROCKS_LOG_BUFFER(log_buffer_,
-                       "[%s] Universal: Possible candidate %s[%d].",
+                       "[%s] Universal: Possible candidate for compaction to "
+                       "reduce sorted runs %s[%d].",
                        cf_name_.c_str(), file_num_buf, loop);
     }
 
@@ -950,8 +1031,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
   int start_level = sorted_runs_[start_index].level;
   int output_level;
   // last level is reserved for the files ingested behind
-  int max_output_level =
-      vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+  int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_);
   if (first_index_after == sorted_runs_.size()) {
     output_level = max_output_level;
   } else if (sorted_runs_[first_index_after].level == 0) {
@@ -960,6 +1040,10 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
     output_level = sorted_runs_[first_index_after].level - 1;
   }
 
+  if (!MeetsOutputLevelRequirements(output_level)) {
+    return nullptr;
+  }
+
   std::vector<CompactionInputFiles> inputs(max_output_level + 1);
   for (size_t i = 0; i < inputs.size(); ++i) {
     inputs[i].level = start_level + static_cast<int>(i);
@@ -996,7 +1080,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
 
   if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
                                inputs, output_level,
-                               Compaction::EvaluatePenultimateLevel(
+                               Compaction::EvaluateProximalLevel(
                                    vstorage_, mutable_cf_options_, ioptions_,
                                    start_level, output_level))) {
     return nullptr;
@@ -1016,13 +1100,12 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
                                            output_level, 1, enable_compression),
                         GetCompressionOptions(mutable_cf_options_, vstorage_,
                                               output_level, enable_compression),
-                        mutable_cf_options_.default_write_temperature,
+                        Temperature::kUnknown,
                         /* max_subcompactions */ 0, grandparents,
                         /* earliest_snapshot */ std::nullopt,
-                        /* snapshot_checker */ nullptr,
-                        /* is manual */ false, /* trim_ts */ "", score_,
-                        false /* deletion_compaction */,
-                        /* l0_files_might_overlap */ true, compaction_reason);
+                        /* snapshot_checker */ nullptr, compaction_reason,
+                        /* trim_ts */ "", score_,
+                        /* l0_files_might_overlap */ true);
 }
 
 // Look at overall size amplification. If size amplification
@@ -1052,18 +1135,19 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
       char file_num_buf[kFormatFileNumberBufSize];
       sr->Dump(file_num_buf, sizeof(file_num_buf), true);
       if (sr->being_compacted) {
-        ROCKS_LOG_BUFFER(
-            log_buffer_,
-            "[%s] Universal: stopping at sorted run undergoing compaction: "
-            "%s[%" ROCKSDB_PRIszt "]",
-            cf_name_.c_str(), file_num_buf, start_index - 1);
+        ROCKS_LOG_BUFFER(log_buffer_,
+                         "[%s] Universal: stopping for size amp compaction at "
+                         "sorted run undergoing compaction: "
+                         "%s[%" ROCKSDB_PRIszt "]",
+                         cf_name_.c_str(), file_num_buf, start_index - 1);
       } else if (sr->level_has_marked_standalone_rangedel) {
-        ROCKS_LOG_BUFFER(
-            log_buffer_,
-            "[%s] Universal: stopping at sorted run that has standalone range "
-            "tombstone files marked for compaction: "
-            "%s[%" ROCKSDB_PRIszt "]",
-            cf_name_.c_str(), file_num_buf, start_index - 1);
+        ROCKS_LOG_BUFFER(log_buffer_,
+                         "[%s] Universal: stopping for size amp compaction at "
+                         "sorted run that has "
+                         "standalone range "
+                         "tombstone files marked for compaction: "
+                         "%s[%" ROCKSDB_PRIszt "]",
+                         cf_name_.c_str(), file_num_buf, start_index - 1);
       }
       break;
     }
@@ -1079,11 +1163,12 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
   {
     const size_t num_l0_to_exclude = MightExcludeNewL0sToReduceWriteStop(
         num_l0_files, end_index, start_index, candidate_size);
-    ROCKS_LOG_BUFFER(log_buffer_,
-                     "[%s] Universal: Excluding %" ROCKSDB_PRIszt
-                     " latest L0 files to reduce potential write stop "
-                     "triggered by `level0_stop_writes_trigger`",
-                     cf_name_.c_str(), num_l0_to_exclude);
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: Excluding for size amp compaction %" ROCKSDB_PRIszt
+        " latest L0 files to reduce potential write stop "
+        "triggered by `level0_stop_writes_trigger`",
+        cf_name_.c_str(), num_l0_to_exclude);
   }
 
   {
@@ -1101,18 +1186,18 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
 
   // size amplification = percentage of additional size
   if (candidate_size * 100 < ratio * base_sr_size) {
-    ROCKS_LOG_BUFFER(
-        log_buffer_,
-        "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
-        " earliest-file-size %" PRIu64,
-        cf_name_.c_str(), candidate_size, base_sr_size);
+    ROCKS_LOG_BUFFER(log_buffer_,
+                     "[%s] Universal: size amp compction not needed. "
+                     "newer-files-total-size %" PRIu64
+                     " earliest-file-size %" PRIu64,
+                     cf_name_.c_str(), candidate_size, base_sr_size);
     return nullptr;
   } else {
-    ROCKS_LOG_BUFFER(
-        log_buffer_,
-        "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
-        " earliest-file-size %" PRIu64,
-        cf_name_.c_str(), candidate_size, base_sr_size);
+    ROCKS_LOG_BUFFER(log_buffer_,
+                     "[%s] Universal: size amp compaction needed. "
+                     "newer-files-total-size %" PRIu64
+                     " earliest-file-size %" PRIu64,
+                     cf_name_.c_str(), candidate_size, base_sr_size);
   }
   // Since incremental compaction can't include more than second last
   // level, it can introduce penalty, compared to full compaction. We
@@ -1345,7 +1430,7 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
   // intra L0 compactions outputs could have overlap
   if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
                                inputs, output_level,
-                               Compaction::EvaluatePenultimateLevel(
+                               Compaction::EvaluateProximalLevel(
                                    vstorage_, mutable_cf_options_, ioptions_,
                                    start_level, output_level))) {
     return nullptr;
@@ -1363,14 +1448,13 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
                          true /* enable_compression */),
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
                             true /* enable_compression */),
-      mutable_cf_options_.default_write_temperature,
+      Temperature::kUnknown,
       /* max_subcompactions */ 0, /* grandparents */ {},
       /* earliest_snapshot */ std::nullopt,
       /* snapshot_checker */ nullptr,
-      /* is manual */ false,
-      /* trim_ts */ "", score_, false /* deletion_compaction */,
-      /* l0_files_might_overlap */ true,
-      CompactionReason::kUniversalSizeAmplification);
+      CompactionReason::kUniversalSizeAmplification,
+      /* trim_ts */ "", score_,
+      /* l0_files_might_overlap */ true);
 }
 
 // Pick files marked for compaction. Typically, files are marked by
@@ -1439,8 +1523,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
       return nullptr;
     }
 
-    int max_output_level =
-        vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+    int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_);
     // Pick the first non-empty level after the start_level
     for (output_level = start_level + 1; output_level <= max_output_level;
          output_level++) {
@@ -1463,10 +1546,23 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
     }
     assert(output_level <= max_output_level);
 
+    if (!MeetsOutputLevelRequirements(output_level)) {
+      return nullptr;
+    }
+
     if (output_level != 0) {
+      // For standalone range deletion, we don't want to compact it with newer
+      // L0 files that it doesn't cover.
+      const FileMetaData* starting_l0_file =
+          (start_level == 0 && start_level_inputs.size() == 1 &&
+           start_level_inputs.files[0]->FileIsStandAloneRangeTombstone())
+              ? start_level_inputs.files[0]
+              : nullptr;
+
       if (start_level == 0) {
         if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
-                                            output_level, nullptr)) {
+                                            output_level, nullptr,
+                                            starting_l0_file)) {
           return nullptr;
         }
       }
@@ -1477,7 +1573,8 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
       output_level_inputs.level = output_level;
       if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
                                      &start_level_inputs, &output_level_inputs,
-                                     &parent_index, -1)) {
+                                     &parent_index, -1, false,
+                                     starting_l0_file)) {
         return nullptr;
       }
       inputs.push_back(start_level_inputs);
@@ -1486,9 +1583,9 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
       }
       if (picker_->FilesRangeOverlapWithCompaction(
               inputs, output_level,
-              Compaction::EvaluatePenultimateLevel(
-                  vstorage_, mutable_cf_options_, ioptions_, start_level,
-                  output_level))) {
+              Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
+                                                ioptions_, start_level,
+                                                output_level))) {
         return nullptr;
       }
 
@@ -1514,13 +1611,11 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
       /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id,
       GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1),
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level),
-      mutable_cf_options_.default_write_temperature,
+      Temperature::kUnknown,
       /* max_subcompactions */ 0, grandparents, earliest_snapshot_,
-      snapshot_checker_,
-      /* is manual */ false,
-      /* trim_ts */ "", score_, false /* deletion_compaction */,
-      /* l0_files_might_overlap */ true,
-      CompactionReason::kFilesMarkedForCompaction);
+      snapshot_checker_, CompactionReason::kFilesMarkedForCompaction,
+      /* trim_ts */ "", score_,
+      /* l0_files_might_overlap */ true);
 }
 
 Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
@@ -1541,8 +1636,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
   uint32_t path_id =
       GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
   int start_level = sorted_runs_[start_index].level;
-  int max_output_level =
-      vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+  int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_);
   std::vector<CompactionInputFiles> inputs(max_output_level + 1);
   for (size_t i = 0; i < inputs.size(); ++i) {
     inputs[i].level = start_level + static_cast<int>(i);
@@ -1587,10 +1681,14 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
     output_level = sorted_runs_[end_index + 1].level - 1;
   }
 
+  if (!MeetsOutputLevelRequirements(output_level)) {
+    return nullptr;
+  }
+
   // intra L0 compactions outputs could have overlap
   if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
                                inputs, output_level,
-                               Compaction::EvaluatePenultimateLevel(
+                               Compaction::EvaluateProximalLevel(
                                    vstorage_, mutable_cf_options_, ioptions_,
                                    start_level, output_level))) {
     return nullptr;
@@ -1609,13 +1707,12 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
                          true /* enable_compression */),
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
                             true /* enable_compression */),
-      mutable_cf_options_.default_write_temperature,
+      Temperature::kUnknown,
       /* max_subcompactions */ 0, /* grandparents */ {},
       /* earliest_snapshot */ std::nullopt,
-      /* snapshot_checker */ nullptr,
-      /* is manual */ false,
-      /* trim_ts */ "", score_, false /* deletion_compaction */,
-      /* l0_files_might_overlap */ true, compaction_reason);
+      /* snapshot_checker */ nullptr, compaction_reason,
+      /* trim_ts */ "", score_,
+      /* l0_files_might_overlap */ true);
 }
 
 Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h
index 18c0f27afbf4..175c11c9f0c3 100644
--- a/db/compaction/compaction_picker_universal.h
+++ b/db/compaction/compaction_picker_universal.h
@@ -18,12 +18,16 @@ class UniversalCompactionPicker : public CompactionPicker {
   UniversalCompactionPicker(const ImmutableOptions& ioptions,
                             const InternalKeyComparator* icmp)
       : CompactionPicker(ioptions, icmp) {}
+
+  // If `require_max_output_level` is true, only pick compaction
+  // with max output level or return nullptr if no such compaction exists.
   Compaction* PickCompaction(
       const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& existing_snapshots,
       const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
-      LogBuffer* log_buffer) override;
+      LogBuffer* log_buffer, const std::string& full_history_ts_low,
+      bool require_max_output_level = false) override;
   int MaxOutputLevel() const override { return NumberLevels() - 1; }
 
   bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index d571dbbc0c5e..cb88c53d8f8d 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -41,7 +41,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
   }
 
   compaction_input.cf_name = compaction->column_family_data()->GetName();
-  compaction_input.snapshots = existing_snapshots_;
+  compaction_input.snapshots = job_context_->snapshot_seqs;
   compaction_input.has_begin = sub_compact->start.has_value();
   compaction_input.begin =
       compaction_input.has_begin ? sub_compact->start->ToString() : "";
@@ -74,15 +74,27 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
       compaction->column_family_data()->GetName().c_str(), job_id_,
       compaction_input.output_level, input_files_oss.str().c_str());
   CompactionServiceJobInfo info(
-      dbname_, db_id_, db_session_id_, GetCompactionId(sub_compact),
+      dbname_, db_id_, db_session_id_,
+      compaction->column_family_data()->GetID(),
+      compaction->column_family_data()->GetName(), GetCompactionId(sub_compact),
       thread_pri_, compaction->compaction_reason(),
       compaction->is_full_compaction(), compaction->is_manual_compaction(),
-      compaction->bottommost_level());
+      compaction->bottommost_level(), compaction->start_level(),
+      compaction->output_level());
+
   CompactionServiceScheduleResponse response =
       db_options_.compaction_service->Schedule(info, compaction_input_binary);
   switch (response.status) {
     case CompactionServiceJobStatus::kSuccess:
       break;
+    case CompactionServiceJobStatus::kAborted:
+      sub_compact->status =
+          Status::Aborted("Scheduling a remote compaction job was aborted");
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "[%s] [JOB %d] Remote compaction was aborted at Schedule()",
+          compaction->column_family_data()->GetName().c_str(), job_id_);
+      return response.status;
     case CompactionServiceJobStatus::kFailure:
       sub_compact->status = Status::Incomplete(
           "CompactionService failed to schedule a remote compaction job.");
@@ -102,6 +114,17 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
       break;
   }
 
+  std::string debug_str_before_wait =
+      compaction->input_version()->DebugString(/*hex=*/true);
+
+  // TODO: Update CompactionService API to support abort and resume
+  // functionality. Currently, remote compaction jobs cannot be aborted via
+  // AbortAllCompactions() because the CompactionService interface lacks methods
+  // to signal abort to remote workers and to properly resume after an abort.
+  // The API needs to be extended with:
+  // - A method to signal abort to running remote compaction jobs
+  // - A method to resume/re-enable scheduling after an abort is lifted
+
   ROCKS_LOG_INFO(db_options_.info_log,
                  "[%s] [JOB %d] Waiting for remote compaction...",
                  compaction->column_family_data()->GetName().c_str(), job_id_);
@@ -110,6 +133,17 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
       db_options_.compaction_service->Wait(response.scheduled_job_id,
                                            &compaction_result_binary);
 
+  if (compaction_status != CompactionServiceJobStatus::kSuccess) {
+    ROCKS_LOG_ERROR(
+        db_options_.info_log,
+        "[%s] [JOB %d] Wait() status is not kSuccess. "
+        "\nDebugString Before Wait():\n%s"
+        "\nDebugString After Wait():\n%s",
+        compaction->column_family_data()->GetName().c_str(), job_id_,
+        debug_str_before_wait.c_str(),
+        compaction->input_version()->DebugString(/*hex=*/true).c_str());
+  }
+
   if (compaction_status == CompactionServiceJobStatus::kUseLocal) {
     ROCKS_LOG_INFO(
         db_options_.info_log,
@@ -118,6 +152,16 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     return compaction_status;
   }
 
+  if (compaction_status == CompactionServiceJobStatus::kAborted) {
+    sub_compact->status =
+        Status::Aborted("Waiting a remote compaction job was aborted");
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "[%s] [JOB %d] Remote compaction was aborted during Wait()",
+                   compaction->column_family_data()->GetName().c_str(),
+                   job_id_);
+    return compaction_status;
+  }
+
   CompactionServiceResult compaction_result;
   s = CompactionServiceResult::Read(compaction_result_binary,
                                     &compaction_result);
@@ -185,18 +229,24 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     }
 
     FileMetaData meta;
-    uint64_t file_size;
-    // FIXME: file_size should be part of CompactionServiceOutputFile so that
-    // we don't get DB corruption if the full file size has not been propagated
-    // back to the caller through the file system (which could have metadata
-    // lag or caching bugs).
-    s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
+    uint64_t file_size = file.file_size;
+
+    // TODO - Clean this up in the next release.
+    // For backward compatibility - in case the remote worker does not populate
+    // the file_size yet. If missing, continue to populate this from the file
+    // system.
+    if (file_size == 0) {
+      s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
+    }
+
     if (!s.ok()) {
       sub_compact->status = s;
       db_options_.compaction_service->OnInstallation(
           response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
       return CompactionServiceJobStatus::kFailure;
     }
+    assert(file_size > 0);
+
     meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
                              file.smallest_seqno, file.largest_seqno);
     meta.smallest.DecodeFrom(file.smallest_internal_key);
@@ -208,19 +258,35 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     meta.file_checksum_func_name = file.file_checksum_func_name;
     meta.marked_for_compaction = file.marked_for_compaction;
     meta.unique_id = file.unique_id;
-
+    meta.temperature = file.file_temperature;
+    meta.tail_size =
+        FileMetaData::CalculateTailSize(file_size, file.table_properties);
     auto cfd = compaction->column_family_data();
-    sub_compact->Current().AddOutput(std::move(meta),
-                                     cfd->internal_comparator(), false, true,
-                                     file.paranoid_hash);
-    sub_compact->Current().UpdateTableProperties(file.table_properties);
+    CompactionOutputs* compaction_outputs =
+        sub_compact->Outputs(file.is_proximal_level_output);
+    assert(compaction_outputs);
+    compaction_outputs->AddOutput(std::move(meta), cfd->internal_comparator(),
+                                  false, true, file.paranoid_hash);
+    compaction_outputs->UpdateTableProperties(file.table_properties);
+  }
+
+  // Set per-level stats
+  auto compaction_output_stats =
+      sub_compact->OutputStats(false /* is_proximal_level */);
+  assert(compaction_output_stats);
+  compaction_output_stats->Add(
+      compaction_result.internal_stats.output_level_stats);
+  if (compaction->SupportsPerKeyPlacement()) {
+    compaction_output_stats =
+        sub_compact->OutputStats(true /* is_proximal_level */);
+    assert(compaction_output_stats);
+    compaction_output_stats->Add(
+        compaction_result.internal_stats.proximal_level_stats);
   }
+
+  // Set job stats
   sub_compact->compaction_job_stats = compaction_result.stats;
-  sub_compact->Current().SetNumOutputRecords(
-      compaction_result.stats.num_output_records);
-  sub_compact->Current().SetNumOutputFiles(
-      compaction_result.stats.num_output_files);
-  sub_compact->Current().AddBytesWritten(compaction_result.bytes_written);
+
   RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
   RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
              compaction_result.bytes_written);
@@ -240,48 +306,38 @@ void CompactionServiceCompactionJob::RecordCompactionIOStats() {
   CompactionJob::RecordCompactionIOStats();
 }
 
-void CompactionServiceCompactionJob::UpdateCompactionJobStats(
-    const InternalStats::CompactionStats& stats) const {
-  compaction_job_stats_->elapsed_micros = stats.micros;
-
-  // output information only in remote compaction
-  compaction_job_stats_->total_output_bytes = stats.bytes_written;
-  compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
-  compaction_job_stats_->num_output_records = stats.num_output_records;
-  compaction_job_stats_->num_output_files = stats.num_output_files;
-  compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
-}
-
 CompactionServiceCompactionJob::CompactionServiceCompactionJob(
     int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
     const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
     VersionSet* versions, const std::atomic<bool>* shutting_down,
     LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats,
     InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
-    std::vector<SequenceNumber> existing_snapshots,
-    std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
-    const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+    JobContext* job_context, std::shared_ptr<Cache> table_cache,
+    EventLogger* event_logger, const std::string& dbname,
+    const std::shared_ptr<IOTracer>& io_tracer,
     const std::atomic<bool>& manual_compaction_canceled,
     const std::string& db_id, const std::string& db_session_id,
     std::string output_path,
     const CompactionServiceInput& compaction_service_input,
     CompactionServiceResult* compaction_service_result)
-    : CompactionJob(job_id, compaction, db_options, mutable_db_options,
-                    file_options, versions, shutting_down, log_buffer, nullptr,
-                    output_directory, nullptr, stats, db_mutex,
-                    db_error_handler, std::move(existing_snapshots),
-                    kMaxSequenceNumber, nullptr, nullptr,
-                    std::move(table_cache), event_logger,
-                    compaction->mutable_cf_options().paranoid_file_checks,
-                    compaction->mutable_cf_options().report_bg_io_stats, dbname,
-                    &(compaction_service_result->stats), Env::Priority::USER,
-                    io_tracer, manual_compaction_canceled, db_id, db_session_id,
-                    compaction->column_family_data()->GetFullHistoryTsLow()),
+    : CompactionJob(
+          job_id, compaction, db_options, mutable_db_options, file_options,
+          versions, shutting_down, log_buffer, nullptr, output_directory,
+          nullptr, stats, db_mutex, db_error_handler, job_context,
+          std::move(table_cache), event_logger,
+          compaction->mutable_cf_options().paranoid_file_checks,
+          compaction->mutable_cf_options().report_bg_io_stats, dbname,
+          &(compaction_service_result->stats), Env::Priority::USER, io_tracer,
+          manual_compaction_canceled, CompactionJob::kCompactionAbortedFalse,
+          db_id, db_session_id,
+          compaction->column_family_data()->GetFullHistoryTsLow()),
       output_path_(std::move(output_path)),
       compaction_input_(compaction_service_input),
       compaction_result_(compaction_service_result) {}
 
-void CompactionServiceCompactionJob::Prepare() {
+void CompactionServiceCompactionJob::Prepare(
+    const CompactionProgress& compaction_progress,
+    log::Writer* compaction_progress_writer) {
   std::optional<Slice> begin;
   if (compaction_input_.has_begin) {
     begin = compaction_input_.begin;
@@ -290,7 +346,8 @@ void CompactionServiceCompactionJob::Prepare() {
   if (compaction_input_.has_end) {
     end = compaction_input_.end;
   }
-  CompactionJob::Prepare(std::make_pair(begin, end));
+  CompactionJob::Prepare(std::make_pair(begin, end), compaction_progress,
+                         compaction_progress_writer);
 }
 
 Status CompactionServiceCompactionJob::Run() {
@@ -313,15 +370,14 @@ Status CompactionServiceCompactionJob::Run() {
 
   ProcessKeyValueCompaction(sub_compact);
 
-  compaction_stats_.stats.micros =
-      db_options_.clock->NowMicros() - start_micros;
-  compaction_stats_.stats.cpu_micros =
-      sub_compact->compaction_job_stats.cpu_micros;
+  uint64_t elapsed_micros = db_options_.clock->NowMicros() - start_micros;
+  internal_stats_.SetMicros(elapsed_micros);
+  internal_stats_.AddCpuMicros(elapsed_micros);
 
   RecordTimeToHistogram(stats_, COMPACTION_TIME,
-                        compaction_stats_.stats.micros);
+                        internal_stats_.output_level_stats.micros);
   RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
-                        compaction_stats_.stats.cpu_micros);
+                        internal_stats_.output_level_stats.cpu_micros);
 
   Status status = sub_compact->status;
   IOStatus io_s = sub_compact->io_status;
@@ -351,38 +407,45 @@ Status CompactionServiceCompactionJob::Run() {
 
   // Build Compaction Job Stats
 
-  // 1. Aggregate CompactionOutputStats into Internal Compaction Stats
-  // (compaction_stats_) and aggregate Compaction Job Stats
-  // (compaction_job_stats_) from the sub compactions
-  compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
-
-  // 2. Update the Output information in the Compaction Job Stats with
-  // aggregated Internal Compaction Stats.
-  UpdateCompactionJobStats(compaction_stats_.stats);
-
-  // 3. Set fields that are not propagated as part of aggregations above
+  // 1. Aggregate internal stats and job stats for all subcompactions
+  // internal stats: sub_compact.proximal_level_outputs_.stats and
+  //                 sub_compact.compaction_outputs_.stats into
+  //                 internal_stats_.output_level_stats and
+  //                 internal_stats_.proximal_level_stats
+  // job-level stats: sub_compact.compaction_job_stats into compact.job_stats_
+  //
+  // For remote compaction, there's only one subcompaction.
+  compact_->AggregateCompactionStats(internal_stats_, *job_stats_);
+
+  // 2. Update job-level output stats with the aggregated internal_stats_
+  // Please note that input stats will be updated by primary host when all
+  // subcompactions are finished
+  UpdateCompactionJobOutputStatsFromInternalStats(status, internal_stats_);
+  // and set fields that are not propagated as part of the update
   compaction_result_->stats.is_manual_compaction = c->is_manual_compaction();
   compaction_result_->stats.is_full_compaction = c->is_full_compaction();
   compaction_result_->stats.is_remote_compaction = true;
 
-  // 4. Update IO Stats that are not part of the aggregations above (bytes_read,
-  // bytes_written)
+  // 3. Update IO Stats that are not part of the the update above
+  // (bytes_read, bytes_written)
   RecordCompactionIOStats();
 
   // Build Output
+  compaction_result_->internal_stats = internal_stats_;
   compaction_result_->output_level = compact_->compaction->output_level();
   compaction_result_->output_path = output_path_;
   if (status.ok()) {
     for (const auto& output_file : sub_compact->GetOutputs()) {
       auto& meta = output_file.meta;
       compaction_result_->output_files.emplace_back(
-          MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
-          meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
-          meta.largest.Encode().ToString(), meta.oldest_ancester_time,
-          meta.file_creation_time, meta.epoch_number, meta.file_checksum,
-          meta.file_checksum_func_name, output_file.validator.GetHash(),
-          meta.marked_for_compaction, meta.unique_id,
-          *output_file.table_properties);
+          MakeTableFileName(meta.fd.GetNumber()), meta.fd.GetFileSize(),
+          meta.fd.smallest_seqno, meta.fd.largest_seqno,
+          meta.smallest.Encode().ToString(), meta.largest.Encode().ToString(),
+          meta.oldest_ancester_time, meta.file_creation_time, meta.epoch_number,
+          meta.file_checksum, meta.file_checksum_func_name,
+          output_file.validator.GetHash(), meta.marked_for_compaction,
+          meta.unique_id, *output_file.table_properties,
+          output_file.is_proximal_level, meta.temperature);
     }
   }
 
@@ -482,6 +545,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct CompactionServiceOutputFile, file_name),
           OptionType::kEncodedString, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
+        {"file_size",
+         {offsetof(struct CompactionServiceOutputFile, file_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
         {"smallest_seqno",
          {offsetof(struct CompactionServiceOutputFile, smallest_seqno),
           OptionType::kUInt64T, OptionVerificationType::kNormal,
@@ -554,7 +621,16 @@ static std::unordered_map<std::string, OptionTypeInfo>
             const auto this_one = static_cast<const TableProperties*>(addr1);
             const auto that_one = static_cast<const TableProperties*>(addr2);
             return this_one->AreEqual(opts, that_one, mismatch);
-          }}}};
+          }}},
+        {"is_proximal_level_output",
+         {offsetof(struct CompactionServiceOutputFile,
+                   is_proximal_level_output),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_temperature",
+         {offsetof(struct CompactionServiceOutputFile, file_temperature),
+          OptionType::kTemperature, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}}};
 
 static std::unordered_map<std::string, OptionTypeInfo>
     compaction_job_stats_type_info = {
@@ -679,6 +755,125 @@ static std::unordered_map<std::string, OptionTypeInfo>
           OptionTypeFlags::kNone}},
 };
 
+static std::unordered_map<std::string, OptionTypeInfo>
+    compaction_stats_type_info = {
+        {"micros",
+         {offsetof(struct InternalStats::CompactionStats, micros),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"cpu_micros",
+         {offsetof(struct InternalStats::CompactionStats, cpu_micros),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_read_non_output_levels",
+         {offsetof(struct InternalStats::CompactionStats,
+                   bytes_read_non_output_levels),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_read_output_level",
+         {offsetof(struct InternalStats::CompactionStats,
+                   bytes_read_output_level),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_skipped_non_output_levels",
+         {offsetof(struct InternalStats::CompactionStats,
+                   bytes_skipped_non_output_levels),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_skipped_output_level",
+         {offsetof(struct InternalStats::CompactionStats,
+                   bytes_skipped_output_level),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_read_blob",
+         {offsetof(struct InternalStats::CompactionStats, bytes_read_blob),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_written",
+         {offsetof(struct InternalStats::CompactionStats, bytes_written),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_written_blob",
+         {offsetof(struct InternalStats::CompactionStats, bytes_written_blob),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_moved",
+         {offsetof(struct InternalStats::CompactionStats, bytes_moved),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_files_in_non_output_levels",
+         {offsetof(struct InternalStats::CompactionStats,
+                   num_input_files_in_non_output_levels),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_files_in_output_level",
+         {offsetof(struct InternalStats::CompactionStats,
+                   num_input_files_in_output_level),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_filtered_input_files_in_non_output_levels",
+         {offsetof(struct InternalStats::CompactionStats,
+                   num_filtered_input_files_in_non_output_levels),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_filtered_input_files_in_output_level",
+         {offsetof(struct InternalStats::CompactionStats,
+                   num_filtered_input_files_in_output_level),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_files",
+         {offsetof(struct InternalStats::CompactionStats, num_output_files),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_files_blob",
+         {offsetof(struct InternalStats::CompactionStats,
+                   num_output_files_blob),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_records",
+         {offsetof(struct InternalStats::CompactionStats, num_input_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_dropped_records",
+         {offsetof(struct InternalStats::CompactionStats, num_dropped_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_records",
+         {offsetof(struct InternalStats::CompactionStats, num_output_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"count",
+         {offsetof(struct InternalStats::CompactionStats, count),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"counts", OptionTypeInfo::Array<
+                       int, static_cast<int>(CompactionReason::kNumOfReasons)>(
+                       offsetof(struct InternalStats::CompactionStats, counts),
+                       OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+                       {0, OptionType::kInt})},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    compaction_internal_stats_type_info = {
+        {"output_level_stats",
+         OptionTypeInfo::Struct(
+             "output_level_stats", &compaction_stats_type_info,
+             offsetof(struct InternalStats::CompactionStatsFull,
+                      output_level_stats),
+             OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+        {"has_proximal_level_output",
+         {offsetof(struct InternalStats::CompactionStatsFull,
+                   has_proximal_level_output),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"proximal_level_stats",
+         OptionTypeInfo::Struct(
+             "proximal_level_stats", &compaction_stats_type_info,
+             offsetof(struct InternalStats::CompactionStatsFull,
+                      proximal_level_stats),
+             OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+};
+
 namespace {
 // this is a helper struct to serialize and deserialize class Status, because
 // Status's members are not public.
@@ -785,6 +980,11 @@ static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
                   "stats", &compaction_job_stats_type_info,
                   offsetof(struct CompactionServiceResult, stats),
                   OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+    {"internal_stats",
+     OptionTypeInfo::Struct(
+         "internal_stats", &compaction_internal_stats_type_info,
+         offsetof(struct CompactionServiceResult, internal_stats),
+         OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
 };
 
 Status CompactionServiceInput::Read(const std::string& data_str,
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 694466ce0c70..f76a25092974 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -4,9 +4,11 @@
 //  (found in the LICENSE.Apache file in the root directory).
 
 #include "db/db_test_util.h"
+#include "file/file_util.h"
 #include "port/stack_trace.h"
 #include "rocksdb/utilities/options_util.h"
 #include "table/unique_id_impl.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -15,17 +17,17 @@ class MyTestCompactionService : public CompactionService {
   MyTestCompactionService(
       std::string db_path, Options& options,
       std::shared_ptr<Statistics>& statistics,
-      std::vector<std::shared_ptr<EventListener>>& listeners,
+      std::vector<std::shared_ptr<EventListener>> listeners,
       std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
           table_properties_collector_factories)
       : db_path_(std::move(db_path)),
-        options_(options),
         statistics_(statistics),
-        start_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown,
-                    false, false, false),
-        wait_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown,
-                   false, false, false),
-        listeners_(listeners),
+        options_(options),
+        start_info_("na", "na", "na", 0, "na", 0, Env::TOTAL,
+                    CompactionReason::kUnknown, false, false, false, -1, -1),
+        wait_info_("na", "na", "na", 0, "na", 0, Env::TOTAL,
+                   CompactionReason::kUnknown, false, false, false, -1, -1),
+        listeners_(std::move(listeners)),
         table_properties_collector_factories_(
             std::move(table_properties_collector_factories)) {}
 
@@ -71,6 +73,31 @@ class MyTestCompactionService : public CompactionService {
     if (is_override_wait_status_) {
       return override_wait_status_;
     }
+
+    CompactionServiceOptionsOverride options_override = GetOptionsOverride();
+
+    OpenAndCompactOptions options;
+    options.canceled = &canceled_;
+
+    Status s =
+        DB::OpenAndCompact(options, db_path_, GetOutputPath(scheduled_job_id),
+                           compaction_input, result, options_override);
+    {
+      InstrumentedMutexLock l(&mutex_);
+      if (is_override_wait_result_) {
+        *result = override_wait_result_;
+      }
+      result_ = *result;
+    }
+    compaction_num_.fetch_add(1);
+    if (s.ok()) {
+      return CompactionServiceJobStatus::kSuccess;
+    } else {
+      return CompactionServiceJobStatus::kFailure;
+    }
+  }
+
+  CompactionServiceOptionsOverride GetOptionsOverride() {
     CompactionServiceOptionsOverride options_override;
     options_override.env = options_.env;
     options_override.file_checksum_gen_factory =
@@ -84,6 +111,7 @@ class MyTestCompactionService : public CompactionService {
     options_override.table_factory = options_.table_factory;
     options_override.sst_partitioner_factory = options_.sst_partitioner_factory;
     options_override.statistics = statistics_;
+    options_override.info_log = options_.info_log;
     if (!listeners_.empty()) {
       options_override.listeners = listeners_;
     }
@@ -92,26 +120,7 @@ class MyTestCompactionService : public CompactionService {
       options_override.table_properties_collector_factories =
           table_properties_collector_factories_;
     }
-
-    OpenAndCompactOptions options;
-    options.canceled = &canceled_;
-
-    Status s =
-        DB::OpenAndCompact(options, db_path_, db_path_ + "/" + scheduled_job_id,
-                           compaction_input, result, options_override);
-    {
-      InstrumentedMutexLock l(&mutex_);
-      if (is_override_wait_result_) {
-        *result = override_wait_result_;
-      }
-      result_ = *result;
-    }
-    compaction_num_.fetch_add(1);
-    if (s.ok()) {
-      return CompactionServiceJobStatus::kSuccess;
-    } else {
-      return CompactionServiceJobStatus::kFailure;
-    }
+    return options_override;
   }
 
   void CancelAwaitingJobs() override { canceled_ = true; }
@@ -158,14 +167,21 @@ class MyTestCompactionService : public CompactionService {
     return final_updated_status_.load();
   }
 
- private:
+ protected:
   InstrumentedMutex mutex_;
-  std::atomic_int compaction_num_{0};
+  const std::string db_path_;
+  std::shared_ptr<Statistics> statistics_;
   std::map<std::string, std::string> jobs_;
   std::map<std::string, CompactionServiceJobInfo> infos_;
-  const std::string db_path_;
+  std::string result_;
+
+  std::string GetOutputPath(const std::string& scheduled_job_id) {
+    return db_path_ + "/" + scheduled_job_id;
+  }
+
+ private:
+  std::atomic_int compaction_num_{0};
   Options options_;
-  std::shared_ptr<Statistics> statistics_;
   CompactionServiceJobInfo start_info_;
   CompactionServiceJobInfo wait_info_;
   bool is_override_start_status_ = false;
@@ -175,14 +191,15 @@ class MyTestCompactionService : public CompactionService {
   CompactionServiceJobStatus override_wait_status_ =
       CompactionServiceJobStatus::kFailure;
   bool is_override_wait_result_ = false;
-  std::string result_;
   std::string override_wait_result_;
   std::vector<std::shared_ptr<EventListener>> listeners_;
   std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
       table_properties_collector_factories_;
-  std::atomic_bool canceled_{false};
   std::atomic<CompactionServiceJobStatus> final_updated_status_{
       CompactionServiceJobStatus::kUseLocal};
+
+ protected:
+  std::atomic_bool canceled_{false};
 };
 
 class CompactionServiceTest : public DBTestBase {
@@ -277,8 +294,17 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
   Statistics* primary_statistics = GetPrimaryStatistics();
   Statistics* compactor_statistics = GetCompactorStatistics();
 
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::PrefetchTail::TaiSizeNotRecorded",
+      [&](void* /* arg */) {
+        // Trigger assertion to verify precise tail prefetch size calculation
+        assert(false);
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
   GenerateTestData();
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  SyncPoint::GetInstance()->DisableProcessing();
   VerifyTestData();
 
   auto my_cs = GetCompactionService();
@@ -357,11 +383,12 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
   } else {
     ASSERT_OK(result.status);
   }
-  ASSERT_GE(result.stats.elapsed_micros, 1);
-  ASSERT_GE(result.stats.cpu_micros, 1);
+  ASSERT_GE(result.internal_stats.output_level_stats.micros, 1);
+  ASSERT_GE(result.internal_stats.output_level_stats.cpu_micros, 1);
 
-  ASSERT_EQ(20, result.stats.num_output_records);
-  ASSERT_EQ(result.output_files.size(), result.stats.num_output_files);
+  ASSERT_EQ(20, result.internal_stats.output_level_stats.num_output_records);
+  ASSERT_EQ(result.output_files.size(),
+            result.internal_stats.output_level_stats.num_output_files);
 
   uint64_t total_size = 0;
   for (auto output_file : result.output_files) {
@@ -372,13 +399,14 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
     ASSERT_GT(file_size, 0);
     total_size += file_size;
   }
-  ASSERT_EQ(total_size, result.stats.total_output_bytes);
+  ASSERT_EQ(total_size, result.internal_stats.TotalBytesWritten());
 
   ASSERT_TRUE(result.stats.is_remote_compaction);
   ASSERT_TRUE(result.stats.is_manual_compaction);
   ASSERT_FALSE(result.stats.is_full_compaction);
 
   Close();
+  SyncPoint::GetInstance()->DisableProcessing();
 }
 
 TEST_F(CompactionServiceTest, ManualCompaction) {
@@ -422,6 +450,133 @@ TEST_F(CompactionServiceTest, ManualCompaction) {
   ASSERT_OK(result.status);
   ASSERT_TRUE(result.stats.is_manual_compaction);
   ASSERT_TRUE(result.stats.is_remote_compaction);
+
+  auto info = my_cs->GetCompactionInfoForStart();
+  ASSERT_EQ(0, info.cf_id);
+  ASSERT_EQ(kDefaultColumnFamilyName, info.cf_name);
+
+  info = my_cs->GetCompactionInfoForWait();
+  ASSERT_EQ(0, info.cf_id);
+  ASSERT_EQ(kDefaultColumnFamilyName, info.cf_name);
+
+  // Test non-default CF
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+  my_cs->GetResult(&result);
+  ASSERT_OK(result.status);
+  ASSERT_TRUE(result.stats.is_manual_compaction);
+  ASSERT_TRUE(result.stats.is_remote_compaction);
+
+  info = my_cs->GetCompactionInfoForStart();
+  ASSERT_EQ(handles_[1]->GetID(), info.cf_id);
+  ASSERT_EQ(handles_[1]->GetName(), info.cf_name);
+
+  info = my_cs->GetCompactionInfoForWait();
+  ASSERT_EQ(handles_[1]->GetID(), info.cf_id);
+  ASSERT_EQ(handles_[1]->GetName(), info.cf_name);
+}
+
+TEST_F(CompactionServiceTest, StandaloneDeleteRangeTombstoneOptimization) {
+  Options options = CurrentOptions();
+
+  size_t num_files_after_filtered = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::MakeInputIterator:NewCompactionMergingIterator",
+      [&](void* arg) {
+        num_files_after_filtered = *static_cast<size_t*>(arg);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (auto compaction_style : {CompactionStyle::kCompactionStyleLevel,
+                                CompactionStyle::kCompactionStyleUniversal}) {
+    SCOPED_TRACE("Style: " + std::to_string(compaction_style));
+    options.compaction_style = compaction_style;
+    ReopenWithCompactionService(&options);
+
+    num_files_after_filtered = 0;
+
+    std::vector<std::string> files;
+    {
+      // Writes first version of data in range partitioned files.
+      SstFileWriter sst_file_writer(EnvOptions(), options);
+      std::string file1 = dbname_ + "file1.sst";
+      ASSERT_OK(sst_file_writer.Open(file1));
+      ASSERT_OK(sst_file_writer.Put("a", "a1"));
+      ASSERT_OK(sst_file_writer.Put("b", "b1"));
+      ExternalSstFileInfo file1_info;
+      ASSERT_OK(sst_file_writer.Finish(&file1_info));
+      files.push_back(std::move(file1));
+
+      std::string file2 = dbname_ + "file2.sst";
+      ASSERT_OK(sst_file_writer.Open(file2));
+      ASSERT_OK(sst_file_writer.Put("x", "x1"));
+      ASSERT_OK(sst_file_writer.Put("y", "y1"));
+      ExternalSstFileInfo file2_info;
+      ASSERT_OK(sst_file_writer.Finish(&file2_info));
+      files.push_back(std::move(file2));
+    }
+
+    IngestExternalFileOptions ifo;
+    ASSERT_OK(db_->IngestExternalFile(files, ifo));
+    ASSERT_EQ(Get("a"), "a1");
+    ASSERT_EQ(Get("b"), "b1");
+    ASSERT_EQ(Get("x"), "x1");
+    ASSERT_EQ(Get("y"), "y1");
+    ASSERT_EQ(2, NumTableFilesAtLevel(6));
+
+    auto my_cs = GetCompactionService();
+    uint64_t comp_num = my_cs->GetCompactionNum();
+
+    {
+      // Atomically delete old version of data with one range delete file.
+      // And a new batch of range partitioned files with new version of data.
+      files.clear();
+      SstFileWriter sst_file_writer(EnvOptions(), options);
+      std::string file2 = dbname_ + "file2.sst";
+      ASSERT_OK(sst_file_writer.Open(file2));
+      ASSERT_OK(sst_file_writer.DeleteRange("a", "z"));
+      ExternalSstFileInfo file2_info;
+      ASSERT_OK(sst_file_writer.Finish(&file2_info));
+      files.push_back(std::move(file2));
+
+      std::string file3 = dbname_ + "file3.sst";
+      ASSERT_OK(sst_file_writer.Open(file3));
+      ASSERT_OK(sst_file_writer.Put("a", "a2"));
+      ASSERT_OK(sst_file_writer.Put("b", "b2"));
+      ExternalSstFileInfo file3_info;
+      ASSERT_OK(sst_file_writer.Finish(&file3_info));
+      files.push_back(std::move(file3));
+
+      std::string file4 = dbname_ + "file4.sst";
+      ASSERT_OK(sst_file_writer.Open(file4));
+      ASSERT_OK(sst_file_writer.Put("x", "x2"));
+      ASSERT_OK(sst_file_writer.Put("y", "y2"));
+      ExternalSstFileInfo file4_info;
+      ASSERT_OK(sst_file_writer.Finish(&file4_info));
+      files.push_back(std::move(file4));
+    }
+
+    ASSERT_OK(db_->IngestExternalFile(files, ifo));
+    ASSERT_OK(db_->WaitForCompact(WaitForCompactOptions()));
+    ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+
+    CompactionServiceResult result;
+    my_cs->GetResult(&result);
+    ASSERT_OK(result.status);
+    ASSERT_TRUE(result.stats.is_manual_compaction);
+    ASSERT_TRUE(result.stats.is_remote_compaction);
+
+    if (compaction_style == kCompactionStyleUniversal) {
+      ASSERT_EQ(num_files_after_filtered, 1);
+    } else {
+      // Not filtered
+      ASSERT_EQ(num_files_after_filtered, 3);
+    }
+
+    Close();
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
 }
 
 TEST_F(CompactionServiceTest, CompactionOutputFileIOError) {
@@ -716,6 +871,119 @@ TEST_F(CompactionServiceTest, VerifyStatsLocalFallback) {
   VerifyTestData();
 }
 
+TEST_F(CompactionServiceTest, VerifyInputRecordCount) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+  uint64_t comp_num = my_cs->GetCompactionNum();
+
+  // Only iterator through 10 keys and force compaction to finish.
+  int num_iter = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::ProcessKeyValueCompaction()::stop", [&](void* stop_ptr) {
+        num_iter++;
+        if (num_iter == 10) {
+          *(bool*)stop_ptr = true;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // CompactRange() should fail
+  Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsCorruption());
+  const char* expected_message =
+      "Compaction number of input keys does not match number of keys "
+      "processed.";
+  ASSERT_TRUE(std::strstr(s.getState(), expected_message));
+
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(CompactionServiceTest, EmptyResult) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+
+  uint64_t comp_num = my_cs->GetCompactionNum();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+
+  // Delete range to cover entire range
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), "key", "keyz"));
+  ASSERT_OK(Flush());
+
+  // In this unit test, both remote compaction and primary db instance are
+  // running in the same process, so NewFileNumber will never have a collision.
+  // In the real-world remote compactions, when the compaction is indeed running
+  // in another process, this is not going to be the case.
+  // To simulate the SST file with the same name created in the tmp directory,
+  // override the file number in remote compaction to re-use old SST file
+  // number.
+  bool need_to_override_file_number = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImplSecondary::OpenAndCompact::BeforeLoadingOptions:0",
+      [&](void*) { need_to_override_file_number = true; });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::OpenCompactionOutputFile::NewFileNumber",
+      [&](void* file_number) {
+        if (need_to_override_file_number) {
+          auto n = static_cast<uint64_t*>(file_number);
+          ColumnFamilyMetaData cf_meta;
+          db_->GetColumnFamilyMetaData(&cf_meta);
+          for (const auto& level : cf_meta.levels) {
+            for (const auto& file : level.files) {
+              // Use one of the existing file name
+              *n = test::GetFileNumber(file.name);
+              need_to_override_file_number = false;
+              return;
+            }
+          }
+        }
+      });
+
+  // Inject failure, so that the remote compaction fails after
+  // ProcessKeyValueCompaction()
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) {
+        // override job status
+        auto s = static_cast<Status*>(status);
+        *s = Status::Aborted("MyTestCompactionService failed to compact!");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Compaction should fail and SST files in the primary db should exist
+  {
+    ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    ColumnFamilyMetaData meta;
+    db_->GetColumnFamilyMetaData(&meta);
+    for (const auto& level : meta.levels) {
+      for (const auto& file : level.files) {
+        std::string fname = file.db_path + "/" + file.name;
+        ASSERT_OK(db_->GetEnv()->FileExists(fname));
+      }
+    }
+  }
+  Close();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
 TEST_F(CompactionServiceTest, CorruptedOutput) {
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
@@ -781,6 +1049,7 @@ TEST_F(CompactionServiceTest, CorruptedOutputParanoidFileCheck) {
     Destroy(options);
     options.disable_auto_compactions = true;
     options.paranoid_file_checks = paranoid_file_check_enabled;
+    options.verify_output_flags = VerifyOutputFlags::kVerifyNone;
     ReopenWithCompactionService(&options);
     GenerateTestData();
 
@@ -835,6 +1104,87 @@ TEST_F(CompactionServiceTest, CorruptedOutputParanoidFileCheck) {
   }
 }
 
+TEST_F(CompactionServiceTest, CorruptedOutputVerifyOutputFlags) {
+  for (VerifyOutputFlags verify_output_flags :
+       {VerifyOutputFlags::kVerifyNone,
+        VerifyOutputFlags::kEnableForLocalCompaction |
+            VerifyOutputFlags::kVerifyBlockChecksum,
+        VerifyOutputFlags::kEnableForRemoteCompaction |
+            VerifyOutputFlags::kVerifyBlockChecksum,
+        VerifyOutputFlags::kEnableForRemoteCompaction |
+            VerifyOutputFlags::kVerifyIteration,
+        VerifyOutputFlags::kVerifyAll}) {
+    SCOPED_TRACE(
+        "verify_output_flags=" +
+        std::to_string(static_cast<std::underlying_type_t<VerifyOutputFlags>>(
+            verify_output_flags)));
+
+    Options options = CurrentOptions();
+    Destroy(options);
+    options.disable_auto_compactions = true;
+    options.paranoid_file_checks = false;
+    options.verify_output_flags = verify_output_flags;
+    ReopenWithCompactionService(&options);
+    GenerateTestData();
+
+    auto my_cs = GetCompactionService();
+
+    std::string start_str = Key(15);
+    std::string end_str = Key(45);
+    Slice start(start_str);
+    Slice end(end_str);
+    uint64_t comp_num = my_cs->GetCompactionNum();
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "CompactionServiceCompactionJob::Run:0", [&](void* arg) {
+          CompactionServiceResult* compaction_result =
+              *(static_cast<CompactionServiceResult**>(arg));
+          ASSERT_TRUE(compaction_result != nullptr &&
+                      !compaction_result->output_files.empty());
+          // Corrupt files here
+          for (const auto& output_file : compaction_result->output_files) {
+            std::string file_name =
+                compaction_result->output_path + "/" + output_file.file_name;
+
+            // Corrupt very small range of bytes. This corruption is so small
+            // that this isn't caught by default light-weight check
+            ASSERT_OK(test::CorruptFile(env_, file_name, 0, 1,
+                                        false /* verifyChecksum */));
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    const bool is_enabled_for_remote_compaction =
+        !!(verify_output_flags & VerifyOutputFlags::kEnableForRemoteCompaction);
+    const bool should_verify_block_checksum =
+        !!(verify_output_flags & VerifyOutputFlags::kVerifyBlockChecksum);
+    const bool should_verify_iteration =
+        !!(verify_output_flags & VerifyOutputFlags::kVerifyIteration);
+
+    Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+    if (is_enabled_for_remote_compaction &&
+        (should_verify_block_checksum || should_verify_iteration)) {
+      ASSERT_NOK(s);
+      ASSERT_TRUE(s.IsCorruption());
+    } else {
+      // CompactRange() goes through if block checksum wasn't verified
+      ASSERT_OK(s);
+    }
+
+    ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    // On the worker side, the compaction is considered success
+    // Verification is done on the primary side
+    CompactionServiceResult result;
+    my_cs->GetResult(&result);
+    ASSERT_OK(result.status);
+    ASSERT_TRUE(result.stats.is_manual_compaction);
+    ASSERT_TRUE(result.stats.is_remote_compaction);
+  }
+}
+
 TEST_F(CompactionServiceTest, TruncatedOutput) {
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
@@ -849,6 +1199,12 @@ TEST_F(CompactionServiceTest, TruncatedOutput) {
   Slice end(end_str);
   uint64_t comp_num = my_cs->GetCompactionNum();
 
+  // Skip calculating tail size to avoid crashing due to truncated file size
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FileMetaData::CalculateTailSize", [&](void* arg) {
+        bool* skip = static_cast<bool*>(arg);
+        *skip = true;
+      });
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
       "CompactionServiceCompactionJob::Run:0", [&](void* arg) {
         CompactionServiceResult* compaction_result =
@@ -865,7 +1221,7 @@ TEST_F(CompactionServiceTest, TruncatedOutput) {
           ASSERT_OK(s);
           ASSERT_GT(file_size, 0);
 
-          ASSERT_OK(test::TruncateFile(env_, file_name, file_size / 2));
+          ASSERT_OK(test::TruncateFile(env_, file_name, file_size / 4));
         }
       });
   SyncPoint::GetInstance()->EnableProcessing();
@@ -1024,8 +1380,9 @@ TEST_F(CompactionServiceTest, CancelCompactionOnPrimarySide) {
 
   // Primary DB calls CancelAllBackgroundWork() while the compaction is running
   SyncPoint::GetInstance()->SetCallBack(
-      "CompactionJob::Run():Inprogress",
-      [&](void* /*arg*/) { CancelAllBackgroundWork(db_, false /*wait*/); });
+      "CompactionJob::Run():Inprogress", [&](void* /*arg*/) {
+        CancelAllBackgroundWork(db_.get(), false /*wait*/);
+      });
 
   SyncPoint::GetInstance()->EnableProcessing();
 
@@ -1140,22 +1497,48 @@ TEST_F(CompactionServiceTest, CompactionFilter) {
   ASSERT_GE(my_cs->GetCompactionNum(), 1);
 }
 
-TEST_F(CompactionServiceTest, Snapshot) {
+TEST_F(CompactionServiceTest, MergeOperator) {
   Options options = CurrentOptions();
+  options.merge_operator.reset(new StringAppendOperator(','));
   ReopenWithCompactionService(&options);
-
-  ASSERT_OK(Put(Key(1), "value1"));
-  ASSERT_OK(Put(Key(2), "value1"));
-  const Snapshot* s1 = db_->GetSnapshot();
-  ASSERT_OK(Flush());
-
-  ASSERT_OK(Put(Key(1), "value2"));
-  ASSERT_OK(Put(Key(3), "value2"));
-  ASSERT_OK(Flush());
-
+  GenerateTestData();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(db_->Merge(WriteOptions(), Key(i),
+                         "merge_op_append_" + std::to_string(i)));
+  }
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  auto my_cs = GetCompactionService();
-  ASSERT_GE(my_cs->GetCompactionNum(), 1);
+  // verify result
+  for (int i = 0; i < 200; i++) {
+    auto result = Get(Key(i));
+    if (i % 2) {
+      ASSERT_EQ(result, "value" + std::to_string(i) + ",merge_op_append_" +
+                            std::to_string(i));
+    } else {
+      ASSERT_EQ(result, "value_new" + std::to_string(i) + ",merge_op_append_" +
+                            std::to_string(i));
+    }
+  }
+  auto my_cs = GetCompactionService();
+  ASSERT_GE(my_cs->GetCompactionNum(), 1);
+}
+
+TEST_F(CompactionServiceTest, Snapshot) {
+  Options options = CurrentOptions();
+  ReopenWithCompactionService(&options);
+
+  ASSERT_OK(Put(Key(1), "value1"));
+  ASSERT_OK(Put(Key(2), "value1"));
+  const Snapshot* s1 = db_->GetSnapshot();
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(1), "value2"));
+  ASSERT_OK(Put(Key(3), "value2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  auto my_cs = GetCompactionService();
+  ASSERT_GE(my_cs->GetCompactionNum(), 1);
   ASSERT_EQ("value1", Get(Key(1), s1));
   ASSERT_EQ("value2", Get(Key(1)));
   db_->ReleaseSnapshot(s1);
@@ -1188,34 +1571,52 @@ TEST_F(CompactionServiceTest, PrecludeLastLevel) {
 
   for (int i = 0; i < kNumTrigger; i++) {
     for (int j = 0; j < kNumKeys; j++) {
-      // FIXME: need to assign outputs to levels to allow overlapping ranges:
-      // ASSERT_OK(Put(Key(j * kNumTrigger + i), "v" + std::to_string(i)));
-      // instead of this (too easy):
-      ASSERT_OK(Put(Key(i * kNumKeys + j), "v" + std::to_string(i)));
+      ASSERT_OK(Put(Key(j * kNumTrigger + i), "v" + std::to_string(i)));
     }
     ASSERT_OK(Flush());
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-  // Data split between penultimate (kUnknown) and last (kCold) levels
-  // FIXME: need to assign outputs to levels to get this:
-  // ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
-  // ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
-  // ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
-  // instead of this (WRONG but currently expected):
-  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
-  // Check manifest temperatures
+  // Data split between proximal (kUnknown) and last (kCold) levels
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
-  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
   // TODO: Check FileSystem temperatures with FileTemperatureTestFS
 
   for (int i = 0; i < kNumTrigger; i++) {
     for (int j = 0; j < kNumKeys; j++) {
-      // FIXME
-      // ASSERT_EQ(Get(Key(j * kNumTrigger + i)), "v" + std::to_string(i));
-      ASSERT_EQ(Get(Key(i * kNumKeys + j)), "v" + std::to_string(i));
+      ASSERT_EQ(Get(Key(j * kNumTrigger + i)), "v" + std::to_string(i));
     }
   }
+
+  // Verify Output Stats
+  auto my_cs = GetCompactionService();
+  {
+    CompactionServiceResult result;
+    my_cs->GetResult(&result);
+    ASSERT_OK(result.status);
+    ASSERT_GT(result.internal_stats.output_level_stats.cpu_micros, 0);
+    ASSERT_GT(result.internal_stats.output_level_stats.micros, 0);
+    ASSERT_EQ(result.internal_stats.output_level_stats.num_output_records +
+                  result.internal_stats.proximal_level_stats.num_output_records,
+              kNumTrigger * kNumKeys);
+    ASSERT_EQ(result.internal_stats.output_level_stats.num_output_files +
+                  result.internal_stats.proximal_level_stats.num_output_files,
+              2);
+
+    CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart();
+    ASSERT_EQ(0, info.base_input_level);
+    ASSERT_EQ(kNumLevels - 1, info.output_level);
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  // Disable Preclude feature and run full compaction to the bottommost level
+  {
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart();
+    ASSERT_EQ(kNumLevels - 2, info.base_input_level);
+    ASSERT_EQ(kNumLevels - 1, info.output_level);
+  }
 }
 
 TEST_F(CompactionServiceTest, ConcurrentCompaction) {
@@ -1285,12 +1686,17 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
   ASSERT_EQ(true, info.is_manual_compaction);
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(true, info.bottommost_level);
+  ASSERT_EQ(1, info.base_input_level);
+  ASSERT_EQ(2, info.output_level);
   info = my_cs->GetCompactionInfoForWait();
   ASSERT_EQ(Env::USER, info.priority);
   ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
   ASSERT_EQ(true, info.is_manual_compaction);
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(true, info.bottommost_level);
+  ASSERT_EQ(1, info.base_input_level);
+  ASSERT_EQ(2, info.output_level);
+  ASSERT_EQ(kDefaultColumnFamilyName, info.cf_name);
 
   // Test priority BOTTOM
   env_->SetBackgroundThreads(1, Env::BOTTOM);
@@ -1322,18 +1728,24 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(true, info.bottommost_level);
   ASSERT_EQ(Env::BOTTOM, info.priority);
+  ASSERT_EQ(0, info.base_input_level);
+  ASSERT_EQ(db_->NumberLevels() - 1, info.output_level);
   info = my_cs->GetCompactionInfoForWait();
   ASSERT_EQ(Env::BOTTOM, info.priority);
   ASSERT_EQ(CompactionReason::kLevelL0FilesNum, info.compaction_reason);
   ASSERT_EQ(false, info.is_manual_compaction);
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(true, info.bottommost_level);
+  ASSERT_EQ(0, info.base_input_level);
+  ASSERT_EQ(db_->NumberLevels() - 1, info.output_level);
 
   // Test Non-Bottommost Level
   options.num_levels = 4;
   ReopenWithCompactionService(&options);
   my_cs =
       static_cast_with_check<MyTestCompactionService>(GetCompactionService());
+  int compaction_num = my_cs->GetCompactionNum();
+  ASSERT_EQ(0, compaction_num);
 
   for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
     for (int j = 0; j < 10; j++) {
@@ -1342,16 +1754,22 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
     }
     ASSERT_OK(Flush());
   }
-
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // This is trivial move. Done locally.
+  ASSERT_EQ(0, my_cs->GetCompactionNum());
   info = my_cs->GetCompactionInfoForStart();
   ASSERT_EQ(false, info.is_manual_compaction);
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(false, info.bottommost_level);
+  ASSERT_EQ(-1, info.base_input_level);
+  ASSERT_EQ(-1, info.output_level);
   info = my_cs->GetCompactionInfoForWait();
   ASSERT_EQ(false, info.is_manual_compaction);
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(false, info.bottommost_level);
+  ASSERT_EQ(-1, info.base_input_level);
+  ASSERT_EQ(-1, info.output_level);
 
   // Test Full Compaction + Bottommost Level
   options.num_levels = 6;
@@ -1366,7 +1784,10 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
     }
     ASSERT_OK(Flush());
   }
+  MoveFilesToLevel(options.num_levels - 1);
 
+  // Force final level compaction
+  // base_input_level == output_level == last_level
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
@@ -1378,10 +1799,15 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
   ASSERT_EQ(true, info.bottommost_level);
   ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
   info = my_cs->GetCompactionInfoForWait();
+  ASSERT_EQ(options.num_levels - 1, info.base_input_level);
+  ASSERT_EQ(options.num_levels - 1, info.output_level);
   ASSERT_EQ(true, info.is_manual_compaction);
   ASSERT_EQ(true, info.is_full_compaction);
   ASSERT_EQ(true, info.bottommost_level);
   ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
+  ASSERT_EQ(options.num_levels - 1, info.base_input_level);
+  ASSERT_EQ(options.num_levels - 1, info.output_level);
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
 }
 
 TEST_F(CompactionServiceTest, FallbackLocalAuto) {
@@ -1471,6 +1897,40 @@ TEST_F(CompactionServiceTest, FallbackLocalManual) {
   VerifyTestData();
 }
 
+TEST_F(CompactionServiceTest, AbortedWhileWait) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+
+  GenerateTestData();
+  VerifyTestData();
+
+  auto my_cs = GetCompactionService();
+  Statistics* compactor_statistics = GetCompactorStatistics();
+  Statistics* primary_statistics = GetPrimaryStatistics();
+
+  my_cs->ResetOverride();
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+
+  // Override Wait() result with kAborted
+  my_cs->OverrideWaitStatus(CompactionServiceJobStatus::kAborted);
+  start_str = Key(120);
+  start = start_str;
+
+  Status s = db_->CompactRange(CompactRangeOptions(), &start, nullptr);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsAborted());
+  // no remote compaction is run
+  ASSERT_EQ(my_cs->GetCompactionNum(), 0);
+  // make sure the compaction statistics is not recorded any side
+  ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0);
+  ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), 0);
+  ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0);
+}
+
 TEST_F(CompactionServiceTest, RemoteEventListener) {
   class RemoteEventListenerTest : public EventListener {
    public:
@@ -1643,6 +2103,761 @@ TEST_F(CompactionServiceTest, TablePropertiesCollector) {
   ASSERT_TRUE(has_user_property);
 }
 
+class ResumableCompactionService : public MyTestCompactionService {
+ public:
+  enum class TestScenario {
+    // Test scenario 1: Two-phase compaction with resumption
+    // - Phase 1: Cancel the compaction running with resumption enabled (saves
+    // progress)
+    // - Phase 2: Resume from saved progress and complete
+    // Validates: Resumption reduces redundant work
+    kCancelThenResume,
+
+    // Test scenario 2: Two-phase compaction without resumption
+    // - Phase 1: Cancel the compaction running with resumption enabled (saves
+    // progress)
+    // - Phase 2: Start fresh without resumption (ignores saved progress) and
+    // complete
+    // Validates: Disabling resumption causes full reprocessing
+    kCancelThenFreshStart,
+
+    // Test scenario 3: Three-phase compaction toggling resumption on/off/on
+    // - Phase 1: Cancel the compaction running with resumption enabled (saves
+    // progress)
+    // - Phase 2: Start fresh wtihout resumption (ignores saved progress) and
+    // cancel agains
+    // - Phase 3: Resume with resumption support (loads Phase 1's progress) and
+    // complete
+    // Validates: Resumption state can be toggled;
+    kMultipleCancelToggleResumption
+  };
+
+  ResumableCompactionService(const std::string& db_path, Options& options,
+                             std::shared_ptr<Statistics> statistics,
+                             TestScenario scenario)
+      : MyTestCompactionService(db_path, options, statistics,
+                                {} /* listeners */,
+                                {} /* table_properties_collector_factories */),
+        scenario_(scenario) {}
+
+  // Set the user key where cancellation should happen.
+  void SetCancelAtKey(const std::string& key, SequenceNumber seqno) {
+    cancel_at_key_ = key;
+    cancel_at_seqno_ = seqno;
+  }
+
+  CompactionServiceJobStatus Wait(const std::string& scheduled_job_id,
+                                  std::string* result) override {
+    std::string compaction_input = ExtractCompactionInput(scheduled_job_id);
+    EXPECT_FALSE(compaction_input.empty());
+
+    OpenAndCompactOptions open_and_compaction_options;
+    auto override_options = GetOptionsOverride();
+
+    // Force creation of one key per output file for test simplicity.
+    // ASSUMPTION: This makes stats.count directly proportional to keys
+    // processed.
+    SyncPoint::GetInstance()->SetCallBack(
+        "CompactionOutputs::ShouldStopBefore::manual_decision",
+        [this](void* p) {
+          auto* pair = static_cast<std::pair<bool*, const Slice>*>(p);
+          *(pair->first) = true;  // Force file cut at every key
+
+          // If cancel_at_key_ is set, cancel when we encounter that key
+          if (!cancel_at_key_.empty() && !already_canceled_) {
+            ParsedInternalKey parsed_key;
+            if (ParseInternalKey(pair->second, &parsed_key, true).ok()) {
+              if (parsed_key.user_key.ToString() == cancel_at_key_) {
+                // Check sequence number if specified
+                if (cancel_at_seqno_ == kMaxSequenceNumber ||
+                    parsed_key.sequence == cancel_at_seqno_) {
+                  canceled_ = true;
+                  already_canceled_ = true;
+                }
+              }
+            }
+          }
+        });
+
+    // If no cancel_at_key_ is set, use the original behavior:
+    // Simulate cancelled compaction by overriding status at completion. So
+    // compaction processes all keys before this point to make stats.count
+    // comparison straightforward.
+    if (cancel_at_key_.empty()) {
+      SyncPoint::GetInstance()->SetCallBack(
+          "DBImplSecondary::CompactWithoutInstallation::End",
+          [&](void* status) {
+            auto s = static_cast<Status*>(status);
+            *s = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+          });
+    }
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    // Phase 1: Run compaction with resumption enabled and cancel it
+    // - Processes input keys until cancellation point
+    // - Creates output files and saves progress
+    // - Status overridden to "paused"
+    open_and_compaction_options.allow_resumption = true;
+    open_and_compaction_options.canceled = &canceled_;
+    already_canceled_ = false;
+    canceled_ = false;
+
+    auto phase1_stats =
+        RunCancelledCompaction(open_and_compaction_options, scheduled_job_id,
+                               compaction_input, override_options);
+
+    HistogramData phase2_stats;
+
+    if (scenario_ == TestScenario::kMultipleCancelToggleResumption) {
+      // Phase 2: Run compaction WITHOUT resumption (fresh start) and cancel it
+      // - Delete all files left behind Phase 1 before calling OpenAndCompact()
+      // - Processes all input keys again from scratch
+      // - Creates output files but does NOT save progress
+      // - Status overridden to "paused"
+      open_and_compaction_options.allow_resumption = false;
+
+      // Clean up output folder for fresh start
+      std::string output_dir = GetOutputPath(scheduled_job_id);
+      Status cleanup_status = DestroyDir(override_options.env, output_dir);
+      EXPECT_TRUE(cleanup_status.ok());
+      EXPECT_OK(override_options.env->CreateDir(output_dir));
+
+      already_canceled_ = false;
+      canceled_ = false;
+
+      phase2_stats =
+          RunCancelledCompaction(open_and_compaction_options, scheduled_job_id,
+                                 compaction_input, override_options);
+
+      // Validation: Phase 2 starts from scratch, so it processes the same
+      // input keys as Phase 1.
+      // ASSUMPTION: With fixed input (10 keys) and deterministic cancellation
+      // (after processing), both phases create the same number of output files.
+      EXPECT_EQ(phase2_stats.count, phase1_stats.count);
+    }
+
+    // Final phase: Run compaction to completion (no cancellation)
+    if (scenario_ == TestScenario::kMultipleCancelToggleResumption) {
+      // Attempt to resume but it ends up starting fresh
+      open_and_compaction_options.allow_resumption = true;
+    } else if (scenario_ == TestScenario::kCancelThenResume) {
+      // Resume from Phase 1's saved progress
+      open_and_compaction_options.allow_resumption = true;
+    } else {  // kCancelThenFreshStart
+      // Start fresh without resumption
+      open_and_compaction_options.allow_resumption = false;
+
+      // Clean up output folder for fresh start
+      std::string output_dir = GetOutputPath(scheduled_job_id);
+      Status cleanup_status = DestroyDir(override_options.env, output_dir);
+      EXPECT_TRUE(cleanup_status.ok());
+      EXPECT_OK(override_options.env->CreateDir(output_dir));
+    }
+
+    // Prevent triggering of cancellation
+    SyncPoint::GetInstance()->ClearCallBack(
+        "DBImplSecondary::CompactWithoutInstallation::End");
+    already_canceled_ = true;
+    canceled_ = false;
+
+    auto final_phase_stats =
+        RunCompaction(open_and_compaction_options, scheduled_job_id,
+                      compaction_input, override_options, result);
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    // Validate statistics based on scenario (only when cancelling at end)
+    if (cancel_at_key_.empty()) {
+      if (scenario_ == TestScenario::kMultipleCancelToggleResumption) {
+        // ASSUMPTION: Phase 1 processes all keys before cancellation
+        EXPECT_GT(phase1_stats.count, 0);
+
+        // ASSUMPTION: Phase 2 runs with allow_resumption=false and an empty
+        // folder. Phase 2 then creates its own output files (but doesn't save
+        // progress). When Phase 3 starts with allow_resumption=true, it finds
+        // no progress file exists, so it cannot resume and must start from
+        // scratch, processing all input keys again. Result: Phase 3 does the
+        // same amount of work as Phase 1.
+        EXPECT_EQ(final_phase_stats.count, phase1_stats.count);
+
+      } else if (scenario_ == TestScenario::kCancelThenResume) {
+        // ASSUMPTION: Phase 1 processes all keys before cancellation
+        EXPECT_GT(phase1_stats.count, 0);
+
+        // ASSUMPTION: Phase 1 processes all keys and saves progress before
+        // cancellation. Final phase resumes from Phase 1's saved progress.
+        // Since Phase 1 completed all processing before being cancelled, the
+        // final phase should do less work than Phase 1.
+        EXPECT_LT(final_phase_stats.count, phase1_stats.count);
+
+      } else {  // kCancelThenFreshStart
+                // ASSUMPTION: Phase 1 processes all keys before cancellation
+        EXPECT_GT(phase1_stats.count, 0);
+
+        // ASSUMPTION: Final phase starts fresh without resumption, so it
+        // processes all input keys again and creates the same number of files
+        EXPECT_EQ(final_phase_stats.count, phase1_stats.count);
+      }
+    }
+
+    StoreResult(*result);
+
+    return CompactionServiceJobStatus::kSuccess;
+  }
+
+ private:
+  std::string ExtractCompactionInput(const std::string& scheduled_job_id) {
+    InstrumentedMutexLock l(&mutex_);
+
+    auto job_index = jobs_.find(scheduled_job_id);
+    if (job_index == jobs_.end()) {
+      return "";
+    }
+    std::string compaction_input = std::move(job_index->second);
+    jobs_.erase(job_index);
+
+    auto info_index = infos_.find(scheduled_job_id);
+    if (info_index == infos_.end()) {
+      return "";
+    }
+    infos_.erase(info_index);
+
+    return compaction_input;
+  }
+
+  HistogramData RunCancelledCompaction(
+      const OpenAndCompactOptions& options, const std::string& scheduled_job_id,
+      const std::string& compaction_input,
+      const CompactionServiceOptionsOverride& override_options) {
+    std::string temp_result;
+    EXPECT_OK(statistics_->Reset());
+
+    Status s =
+        DB::OpenAndCompact(options, db_path_, GetOutputPath(scheduled_job_id),
+                           compaction_input, &temp_result, override_options);
+
+    EXPECT_TRUE(s.IsManualCompactionPaused());
+
+    HistogramData stats;
+    statistics_->histogramData(FILE_WRITE_COMPACTION_MICROS, &stats);
+    return stats;
+  }
+
+  HistogramData RunCompaction(
+      const OpenAndCompactOptions& options, const std::string& scheduled_job_id,
+      const std::string& compaction_input,
+      const CompactionServiceOptionsOverride& override_options,
+      std::string* result) {
+    EXPECT_OK(statistics_->Reset());
+
+    Status s =
+        DB::OpenAndCompact(options, db_path_, GetOutputPath(scheduled_job_id),
+                           compaction_input, result, override_options);
+
+    EXPECT_TRUE(s.ok());
+
+    HistogramData stats;
+    statistics_->histogramData(FILE_WRITE_COMPACTION_MICROS, &stats);
+    return stats;
+  }
+
+  void StoreResult(const std::string& result) {
+    InstrumentedMutexLock l(&mutex_);
+    result_ = result;
+  }
+
+  TestScenario scenario_;
+  std::string cancel_at_key_;
+  SequenceNumber cancel_at_seqno_ = kMaxSequenceNumber;
+  std::atomic<bool> already_canceled_{false};
+};
+
+class ResumableCompactionServiceTest : public CompactionServiceTest {
+ public:
+  explicit ResumableCompactionServiceTest() : CompactionServiceTest() {}
+
+  void RunCompactionCancelTest(
+      ResumableCompactionService::TestScenario scenario) {
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    std::shared_ptr<Statistics> statistics = CreateDBStatistics();
+
+    options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+    BlockBasedTableOptions table_options;
+    table_options.verify_compression = true;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    auto resume_cs = std::make_shared<ResumableCompactionService>(
+        dbname_, options, statistics, scenario);
+    options.compaction_service = resume_cs;
+
+    DestroyAndReopen(options);
+
+    GenerateTestData();
+
+    ASSERT_OK(statistics->Reset());
+
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    Status s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_OK(s);
+
+    VerifyTestData();
+
+    s = db_->VerifyChecksum();
+    ASSERT_OK(s);
+
+    s = db_->VerifyFileChecksums(ReadOptions());
+    ASSERT_OK(s);
+
+    CompactionServiceResult result;
+    resume_cs->GetResult(&result);
+    ASSERT_OK(result.status);
+    ASSERT_TRUE(result.stats.is_manual_compaction);
+    ASSERT_TRUE(result.stats.is_remote_compaction);
+    ASSERT_GT(result.output_files.size(), 0);
+
+    uint64_t resumed_bytes =
+        statistics->getTickerCount(REMOTE_COMPACT_RESUMED_BYTES);
+    if (scenario ==
+        ResumableCompactionService::TestScenario::kCancelThenResume) {
+      // When resuming compaction, some bytes should be resumed from previous
+      // progress
+      ASSERT_GT(resumed_bytes, 0);
+    } else if (scenario == ResumableCompactionService::TestScenario::
+                               kCancelThenFreshStart) {
+      // When starting fresh (ignoring existing progress), no bytes should be
+      // resumed
+      ASSERT_EQ(resumed_bytes, 0);
+    } else {  // kMultipleCancelToggleResumption
+      // Phase 2 ran without resumption (fresh start), so Phase 3 has no
+      // progress to resume from. It processes all keys again from scratch.
+      ASSERT_EQ(resumed_bytes, 0);
+    }
+  }
+
+  void GenerateTestData() {
+    for (int i = 0; i < kNumKeys; ++i) {
+      ASSERT_OK(Put(Key(i), "value"));
+      ASSERT_OK(Flush());
+      if (i % 2 == 0) {
+        ASSERT_OK(Delete(Key(i)));
+        ASSERT_OK(Flush());
+      }
+    }
+  }
+
+  void VerifyTestData() {
+    for (int i = 0; i < kNumKeys; ++i) {
+      if (i % 2 == 0) {
+        ASSERT_EQ("NOT_FOUND", Get((Key(i))));
+      } else {
+        ASSERT_EQ("value", Get((Key(i))));
+      }
+    }
+  }
+
+ private:
+  static constexpr int kNumKeys = 10;
+};
+
+TEST_F(ResumableCompactionServiceTest, CompactionCancelThenResume) {
+  RunCompactionCancelTest(
+      ResumableCompactionService::TestScenario::kCancelThenResume);
+}
+
+TEST_F(ResumableCompactionServiceTest, CompactionCancelThenFreshStart) {
+  RunCompactionCancelTest(
+      ResumableCompactionService::TestScenario::kCancelThenFreshStart);
+}
+
+TEST_F(ResumableCompactionServiceTest,
+       CompactionMultipleCancelToggleResumption) {
+  RunCompactionCancelTest(ResumableCompactionService::TestScenario::
+                              kMultipleCancelToggleResumption);
+}
+
+class ResumableCompactionKeyTypeTest : public CompactionServiceTest {
+ public:
+  explicit ResumableCompactionKeyTypeTest() : CompactionServiceTest() {}
+
+ protected:
+  void SetupResumableCompactionService(
+      Options& options, const std::string& cancel_at_key = "",
+      SequenceNumber cancel_at_seqno = kMaxSequenceNumber) {
+    options.disable_auto_compactions = true;
+    statistics_ = CreateDBStatistics();
+
+    resume_cs_ = std::make_shared<ResumableCompactionService>(
+        dbname_, options, statistics_,
+        ResumableCompactionService::TestScenario::kCancelThenResume);
+
+    if (!cancel_at_key.empty()) {
+      resume_cs_->SetCancelAtKey(cancel_at_key, cancel_at_seqno);
+    }
+
+    options.compaction_service = resume_cs_;
+    DestroyAndReopen(options);
+  }
+
+  void ResetStatistics() { ASSERT_OK(statistics_->Reset()); }
+
+  void VerifyResumeBytes() {
+    uint64_t resumed_bytes =
+        statistics_->getTickerCount(REMOTE_COMPACT_RESUMED_BYTES);
+    ASSERT_GT(resumed_bytes, 0);
+  }
+
+ private:
+  std::shared_ptr<ResumableCompactionService> resume_cs_;
+  std::shared_ptr<Statistics> statistics_;
+};
+
+// Cancel compaction right before processing key "c" to test resumption at a
+//  deletion at the non-bottom level. When resumed, compaction will continue
+//  from this deletion.
+TEST_F(ResumableCompactionKeyTypeTest,
+       CancelAndResumeWithDeleteAtNonBottomLevel) {
+  Options options = CurrentOptions();
+
+  SetupResumableCompactionService(options, "c");
+
+  ASSERT_OK(Put("c", "old_value"));
+  ASSERT_OK(Put("c_placeholder", "placeholder"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Delete("c"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> input_files;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  for (const auto& file : cf_meta.levels[0].files) {
+    input_files.push_back(file.name);
+  }
+
+  ASSERT_EQ(input_files.size(), 2);
+
+  ResetStatistics();
+
+  CompactionOptions compact_options;
+  ASSERT_OK(
+      db_->CompactFiles(compact_options, input_files, 1 /* output_level*/));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "NOT_FOUND");
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+//  deletion at the ottom level. When resumed, compaction will continue from
+//  the last saved progress point before the delete.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithDeleteAtBottomLevel) {
+  Options options = CurrentOptions();
+
+  SetupResumableCompactionService(options, "c");
+
+  ASSERT_OK(Put("c", "old_value"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(Delete("c"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ResetStatistics();
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "NOT_FOUND");
+  ASSERT_EQ(Get("c", snapshot), "old_value");
+  ASSERT_EQ(Get("d"), "val4");
+  db_->ReleaseSnapshot(snapshot);
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+// merge operand. When resumed, compaction will continue from the last saved
+// progress point before the merge operand.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithMerge) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+
+  SetupResumableCompactionService(options, "c");
+
+  ASSERT_OK(Put("c", "old_value"));
+  ASSERT_OK(Put("c_placeholder", "placeholder"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Merge("c", "new_value"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> input_files;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  for (const auto& file : cf_meta.levels[0].files) {
+    input_files.push_back(file.name);
+  }
+
+  ASSERT_EQ(input_files.size(), 2);
+
+  ResetStatistics();
+
+  CompactionOptions compact_options;
+  ASSERT_OK(
+      db_->CompactFiles(compact_options, input_files, 1 /* output_level*/));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "old_value,new_value");
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+// single delete. When resumed, compaction will continue from the last saved
+// progress point before the single delete.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithSingleDelete) {
+  Options options = CurrentOptions();
+
+  SetupResumableCompactionService(options, "c");
+
+  ASSERT_OK(Put("c", "old_value"));
+  ASSERT_OK(Put("c_placeholder", "placeholder"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(SingleDelete("c"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> input_files;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  for (const auto& file : cf_meta.levels[0].files) {
+    input_files.push_back(file.name);
+  }
+
+  ASSERT_EQ(input_files.size(), 2);
+
+  ResetStatistics();
+
+  CompactionOptions compact_options;
+  ASSERT_OK(
+      db_->CompactFiles(compact_options, input_files, 1 /* output_level*/));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "NOT_FOUND");
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+// range delete. When resumed, compaction will continue from the last saved
+// progress point before the range delete.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithRangeDelete) {
+  Options options = CurrentOptions();
+
+  SetupResumableCompactionService(options, "c");
+
+  ASSERT_OK(Put("c", "old_value"));
+  ASSERT_OK(Put("c_placeholder", "placeholder"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "c", "c_"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> input_files;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  for (const auto& file : cf_meta.levels[0].files) {
+    input_files.push_back(file.name);
+  }
+
+  ASSERT_EQ(input_files.size(), 2);
+
+  ResetStatistics();
+
+  CompactionOptions compact_options;
+  ASSERT_OK(
+      db_->CompactFiles(compact_options, input_files, 1 /* output_level*/));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "NOT_FOUND");
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
+
+// Test resumption when a key has multiple versions spanning across file
+// boundaries (i.e., the same key exists in multiple SST files).
+//
+// Scenario:
+//   File 1 largest key: key "b"
+//   File 2 smallest key: key "c" with seqno=4 (older version)
+//   File 3 largest key: key "c" with seqno=5 (newer version)
+//
+// Cancel compaction right before processing the older version of key "c".
+// Upon resumption, compaction continues from the saved progress point "b" and
+// correctly processes both versions
+TEST_F(ResumableCompactionKeyTypeTest,
+       CancelAndResumeWithKeySpanningFileBoundaries) {
+  Options options = CurrentOptions();
+
+  // Set up cancellation at the older version of the key which will have
+  // sequence number zero-ed out
+  SetupResumableCompactionService(options, "c" /*cancel_at_key*/, 0 /*seqno*/);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("c", "old_value"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(Put("c", "new_value"));
+  ASSERT_OK(Flush());
+
+  ResetStatistics();
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "new_value");
+  ASSERT_EQ(Get("c", snapshot), "old_value");
+  ASSERT_EQ(Get("d"), "val4");
+  db_->ReleaseSnapshot(snapshot);
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+// wide column. When resumed, compaction will continue
+// from the wide column.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithWideColumn) {
+  Options options = CurrentOptions();
+
+  SetupResumableCompactionService(options, "c" /*cancel_at_key*/);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  WideColumns columns{{"col1", "value1"}, {"col2", "value2"}};
+  ASSERT_OK(
+      db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), "c", columns));
+  ASSERT_OK(Flush());
+
+  ResetStatistics();
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+
+  PinnableWideColumns result;
+  ASSERT_OK(
+      db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), "c", &result));
+  WideColumns expected{{"col1", "value1"}, {"col2", "value2"}};
+  ASSERT_EQ(result.columns(), expected);
+
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+// timed put. When resumed, compaction will continue
+// from the timed put.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithTimedPut) {
+  Options options = CurrentOptions();
+  options.preclude_last_level_data_seconds = 86400;  // Enable TimedPut feature
+  options.preserve_internal_time_seconds = 86400;    // Preserve write time
+
+  SetupResumableCompactionService(options, "c" /*cancel_at_key*/);
+
+  ASSERT_OK(Put("c", "old_value"));
+  ASSERT_OK(Put("c_placeholder", "placeholder"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  // Use TimedPut for key "c" with current write time
+  uint64_t write_time = env_->NowMicros() / 1000000;
+  ASSERT_OK(TimedPut("c", "val3", write_time /*write_unix_time*/));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> input_files;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  for (const auto& file : cf_meta.levels[0].files) {
+    input_files.push_back(file.name);
+  }
+
+  ASSERT_EQ(input_files.size(), 2);
+
+  ResetStatistics();
+
+  CompactionOptions compact_options;
+  ASSERT_OK(
+      db_->CompactFiles(compact_options, input_files, 1 /* output_level*/));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "val3");
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/compaction/compaction_state.cc b/db/compaction/compaction_state.cc
index bf016d04b694..febf2e01d1e0 100644
--- a/db/compaction/compaction_state.cc
+++ b/db/compaction/compaction_state.cc
@@ -36,11 +36,11 @@ Slice CompactionState::LargestUserKey() {
 }
 
 void CompactionState::AggregateCompactionStats(
-    InternalStats::CompactionStatsFull& compaction_stats,
-    CompactionJobStats& compaction_job_stats) {
+    InternalStats::CompactionStatsFull& internal_stats,
+    CompactionJobStats& job_stats) {
   for (const auto& sc : sub_compact_states) {
-    sc.AggregateCompactionOutputStats(compaction_stats);
-    compaction_job_stats.Add(sc.compaction_job_stats);
+    sc.AggregateCompactionOutputStats(internal_stats);
+    job_stats.Add(sc.compaction_job_stats);
   }
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_state.h b/db/compaction/compaction_state.h
index cc5b66c68224..faad712b6ff5 100644
--- a/db/compaction/compaction_state.h
+++ b/db/compaction/compaction_state.h
@@ -29,8 +29,8 @@ class CompactionState {
   Status status;
 
   void AggregateCompactionStats(
-      InternalStats::CompactionStatsFull& compaction_stats,
-      CompactionJobStats& compaction_job_stats);
+      InternalStats::CompactionStatsFull& internal_stats,
+      CompactionJobStats& job_stats);
 
   explicit CompactionState(Compaction* c) : compaction(c) {}
 
diff --git a/db/compaction/subcompaction_state.cc b/db/compaction/subcompaction_state.cc
index 13f40f63f0ca..0e8f673c1124 100644
--- a/db/compaction/subcompaction_state.cc
+++ b/db/compaction/subcompaction_state.cc
@@ -14,33 +14,32 @@
 
 namespace ROCKSDB_NAMESPACE {
 void SubcompactionState::AggregateCompactionOutputStats(
-    InternalStats::CompactionStatsFull& compaction_stats) const {
+    InternalStats::CompactionStatsFull& internal_stats) const {
   // Outputs should be closed. By extension, any files created just for
   // range deletes have already been written also.
   assert(compaction_outputs_.HasBuilder() == false);
-  assert(penultimate_level_outputs_.HasBuilder() == false);
+  assert(proximal_level_outputs_.HasBuilder() == false);
 
   // FIXME: These stats currently include abandonned output files
   // assert(compaction_outputs_.stats_.num_output_files ==
   //        compaction_outputs_.outputs_.size());
-  // assert(penultimate_level_outputs_.stats_.num_output_files ==
-  //        penultimate_level_outputs_.outputs_.size());
+  // assert(proximal_level_outputs_.stats_.num_output_files ==
+  //        proximal_level_outputs_.outputs_.size());
 
-  compaction_stats.stats.Add(compaction_outputs_.stats_);
-  if (penultimate_level_outputs_.HasOutput()) {
-    compaction_stats.has_penultimate_level_output = true;
-    compaction_stats.penultimate_level_stats.Add(
-        penultimate_level_outputs_.stats_);
+  internal_stats.output_level_stats.Add(compaction_outputs_.stats_);
+  if (proximal_level_outputs_.HasOutput()) {
+    internal_stats.has_proximal_level_output = true;
+    internal_stats.proximal_level_stats.Add(proximal_level_outputs_.stats_);
   }
 }
 
 OutputIterator SubcompactionState::GetOutputs() const {
-  return OutputIterator(penultimate_level_outputs_.outputs_,
+  return OutputIterator(proximal_level_outputs_.outputs_,
                         compaction_outputs_.outputs_);
 }
 
 void SubcompactionState::Cleanup(Cache* cache) {
-  penultimate_level_outputs_.Cleanup();
+  proximal_level_outputs_.Cleanup();
   compaction_outputs_.Cleanup();
 
   if (!status.ok()) {
@@ -63,9 +62,9 @@ void SubcompactionState::Cleanup(Cache* cache) {
 }
 
 Slice SubcompactionState::SmallestUserKey() const {
-  if (penultimate_level_outputs_.HasOutput()) {
+  if (proximal_level_outputs_.HasOutput()) {
     Slice a = compaction_outputs_.SmallestUserKey();
-    Slice b = penultimate_level_outputs_.SmallestUserKey();
+    Slice b = proximal_level_outputs_.SmallestUserKey();
     if (a.empty()) {
       return b;
     }
@@ -85,9 +84,9 @@ Slice SubcompactionState::SmallestUserKey() const {
 }
 
 Slice SubcompactionState::LargestUserKey() const {
-  if (penultimate_level_outputs_.HasOutput()) {
+  if (proximal_level_outputs_.HasOutput()) {
     Slice a = compaction_outputs_.LargestUserKey();
-    Slice b = penultimate_level_outputs_.LargestUserKey();
+    Slice b = proximal_level_outputs_.LargestUserKey();
     if (a.empty()) {
       return b;
     }
@@ -107,13 +106,15 @@ Slice SubcompactionState::LargestUserKey() const {
 }
 
 Status SubcompactionState::AddToOutput(
-    const CompactionIterator& iter, bool use_penultimate_output,
+    const CompactionIterator& iter, bool use_proximal_output,
     const CompactionFileOpenFunc& open_file_func,
-    const CompactionFileCloseFunc& close_file_func) {
+    const CompactionFileCloseFunc& close_file_func,
+    const ParsedInternalKey& prev_iter_output_internal_key) {
   // update target output
-  current_outputs_ = use_penultimate_output ? &penultimate_level_outputs_
-                                            : &compaction_outputs_;
-  return current_outputs_->AddToOutput(iter, open_file_func, close_file_func);
+  current_outputs_ =
+      use_proximal_output ? &proximal_level_outputs_ : &compaction_outputs_;
+  return current_outputs_->AddToOutput(iter, open_file_func, close_file_func,
+                                       prev_iter_output_internal_key);
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h
index 6a28f74d9089..38785f9ae085 100644
--- a/db/compaction/subcompaction_state.h
+++ b/db/compaction/subcompaction_state.h
@@ -26,13 +26,13 @@ namespace ROCKSDB_NAMESPACE {
 // Maintains state and outputs for each sub-compaction
 // It contains 2 `CompactionOutputs`:
 //  1. one for the normal output files
-//  2. another for the penultimate level outputs
+//  2. another for the proximal level outputs
 // a `current` pointer maintains the current output group, when calling
 // `AddToOutput()`, it checks the output of the current compaction_iterator key
 // and point `current` to the target output group. By default, it just points to
 // normal compaction_outputs, if the compaction_iterator key should be placed on
-// the penultimate level, `current` is changed to point to
-// `penultimate_level_outputs`.
+// the proximal level, `current` is changed to point to
+// `proximal_level_outputs`.
 // The later operations uses `Current()` to get the target group.
 //
 // +----------+          +-----------------------------+      +---------+
@@ -43,7 +43,7 @@ namespace ROCKSDB_NAMESPACE {
 //       |                                                    |  ...    |
 //       |
 //       |               +-----------------------------+      +---------+
-//       +-------------> | penultimate_level_outputs   |----->| output  |
+//       +-------------> | proximal_level_outputs      |----->| output  |
 //                       +-----------------------------+      +---------+
 //                                                            |  ...    |
 
@@ -78,7 +78,7 @@ class SubcompactionState {
   Slice LargestUserKey() const;
 
   // Get all outputs from the subcompaction. For per_key_placement compaction,
-  // it returns both the last level outputs and penultimate level outputs.
+  // it returns both the last level outputs and proximal level outputs.
   OutputIterator GetOutputs() const;
 
   // Assign range dels aggregator. The various tombstones will potentially
@@ -92,7 +92,15 @@ class SubcompactionState {
 
   void RemoveLastEmptyOutput() {
     compaction_outputs_.RemoveLastEmptyOutput();
-    penultimate_level_outputs_.RemoveLastEmptyOutput();
+    proximal_level_outputs_.RemoveLastEmptyOutput();
+  }
+
+  // Cleanup output builders for abandoning in-progress files.
+  void CleanupOutputs() {
+    compaction_outputs_.Cleanup();
+    if (compaction->SupportsPerKeyPlacement()) {
+      proximal_level_outputs_.Cleanup();
+    }
   }
 
   void BuildSubcompactionJobInfo(
@@ -106,7 +114,11 @@ class SubcompactionState {
     subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id);
     subcompaction_job_info.base_input_level = c->start_level();
     subcompaction_job_info.output_level = c->output_level();
+    subcompaction_job_info.compaction_reason = c->compaction_reason();
+    subcompaction_job_info.compression = c->output_compression();
     subcompaction_job_info.stats = compaction_job_stats;
+    subcompaction_job_info.blob_compression_type =
+        c->mutable_cf_options().blob_compression_type;
   }
 
   SubcompactionState() = delete;
@@ -119,14 +131,14 @@ class SubcompactionState {
         start(_start),
         end(_end),
         sub_job_id(_sub_job_id),
-        compaction_outputs_(c, /*is_penultimate_level=*/false),
-        penultimate_level_outputs_(c, /*is_penultimate_level=*/true) {
+        compaction_outputs_(c, /*is_proximal_level=*/false),
+        proximal_level_outputs_(c, /*is_proximal_level=*/true) {
     assert(compaction != nullptr);
     // Set output split key (used for RoundRobin feature) only for normal
-    // compaction_outputs, output to penultimate_level feature doesn't support
+    // compaction_outputs, output to proximal_level feature doesn't support
     // RoundRobin feature (and may never going to be supported, because for
     // RoundRobin, the data time is mostly naturally sorted, no need to have
-    // per-key placement with output_to_penultimate_level).
+    // per-key placement with output_to_proximal_level).
     compaction_outputs_.SetOutputSlitKey(start, end);
   }
 
@@ -141,18 +153,17 @@ class SubcompactionState {
         compaction_job_stats(std::move(state.compaction_job_stats)),
         sub_job_id(state.sub_job_id),
         compaction_outputs_(std::move(state.compaction_outputs_)),
-        penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)),
+        proximal_level_outputs_(std::move(state.proximal_level_outputs_)),
         range_del_agg_(std::move(state.range_del_agg_)) {
-    current_outputs_ =
-        state.current_outputs_ == &state.penultimate_level_outputs_
-            ? &penultimate_level_outputs_
-            : &compaction_outputs_;
+    current_outputs_ = state.current_outputs_ == &state.proximal_level_outputs_
+                           ? &proximal_level_outputs_
+                           : &compaction_outputs_;
   }
 
   // Add all the new files from this compaction to version_edit
   void AddOutputsEdit(VersionEdit* out_edit) const {
-    for (const auto& file : penultimate_level_outputs_.outputs_) {
-      out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta);
+    for (const auto& file : proximal_level_outputs_.outputs_) {
+      out_edit->AddFile(compaction->GetProximalLevel(), file.meta);
     }
     for (const auto& file : compaction_outputs_.outputs_) {
       out_edit->AddFile(compaction->output_level(), file.meta);
@@ -162,13 +173,40 @@ class SubcompactionState {
   void Cleanup(Cache* cache);
 
   void AggregateCompactionOutputStats(
-      InternalStats::CompactionStatsFull& compaction_stats) const;
+      InternalStats::CompactionStatsFull& internal_stats) const;
 
   CompactionOutputs& Current() const {
     assert(current_outputs_);
     return *current_outputs_;
   }
 
+  CompactionOutputs* Outputs(bool is_proximal_level) {
+    assert(compaction);
+    if (is_proximal_level) {
+      assert(compaction->SupportsPerKeyPlacement());
+      return &proximal_level_outputs_;
+    }
+    return &compaction_outputs_;
+  }
+
+  // Per-level stats for the output
+  InternalStats::CompactionStats* OutputStats(bool is_proximal_level) {
+    assert(compaction);
+    if (is_proximal_level) {
+      assert(compaction->SupportsPerKeyPlacement());
+      return &proximal_level_outputs_.stats_;
+    }
+    return &compaction_outputs_.stats_;
+  }
+
+  uint64_t GetWorkerCPUMicros() const {
+    uint64_t rv = compaction_outputs_.GetWorkerCPUMicros();
+    if (compaction->SupportsPerKeyPlacement()) {
+      rv += proximal_level_outputs_.GetWorkerCPUMicros();
+    }
+    return rv;
+  }
+
   CompactionRangeDelAggregator* RangeDelAgg() const {
     return range_del_agg_.get();
   }
@@ -178,13 +216,22 @@ class SubcompactionState {
     return range_del_agg_ && !range_del_agg_->IsEmpty();
   }
 
+  void SetSubcompactionProgress(
+      const SubcompactionProgress& subcompaction_progress) {
+    subcompaction_progress_ = subcompaction_progress;
+  }
+
+  SubcompactionProgress& GetSubcompactionProgressRef() {
+    return subcompaction_progress_;
+  }
+
   // Add compaction_iterator key/value to the `Current` output group.
-  Status AddToOutput(const CompactionIterator& iter,
-                     bool use_penultimate_output,
+  Status AddToOutput(const CompactionIterator& iter, bool use_proximal_output,
                      const CompactionFileOpenFunc& open_file_func,
-                     const CompactionFileCloseFunc& close_file_func);
+                     const CompactionFileCloseFunc& close_file_func,
+                     const ParsedInternalKey& prev_iter_output_internal_key);
 
-  // Close all compaction output files, both output_to_penultimate_level outputs
+  // Close all compaction output files, both output_to_proximal_level outputs
   // and normal outputs.
   Status CloseCompactionFiles(const Status& curr_status,
                               const CompactionFileOpenFunc& open_file_func,
@@ -195,11 +242,11 @@ class SubcompactionState {
     // CloseOutput() may open new compaction output files.
     Status s = curr_status;
     if (per_key) {
-      s = penultimate_level_outputs_.CloseOutput(
-          s, range_del_agg_.get(), open_file_func, close_file_func);
+      s = proximal_level_outputs_.CloseOutput(s, range_del_agg_.get(),
+                                              open_file_func, close_file_func);
     } else {
-      assert(penultimate_level_outputs_.HasBuilder() == false);
-      assert(penultimate_level_outputs_.HasOutput() == false);
+      assert(proximal_level_outputs_.HasBuilder() == false);
+      assert(proximal_level_outputs_.HasOutput() == false);
     }
     s = compaction_outputs_.CloseOutput(s, range_del_agg_.get(), open_file_func,
                                         close_file_func);
@@ -209,9 +256,11 @@ class SubcompactionState {
  private:
   // State kept for output being generated
   CompactionOutputs compaction_outputs_;
-  CompactionOutputs penultimate_level_outputs_;
+  CompactionOutputs proximal_level_outputs_;
   CompactionOutputs* current_outputs_ = &compaction_outputs_;
   std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_;
+
+  SubcompactionProgress subcompaction_progress_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc
index eed5cb936f06..7bd840e486d4 100644
--- a/db/compaction/tiered_compaction_test.cc
+++ b/db/compaction/tiered_compaction_test.cc
@@ -33,42 +33,13 @@ ConfigOptions GetStrictConfigOptions() {
 class TieredCompactionTest : public DBTestBase {
  public:
   TieredCompactionTest()
-      : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true),
-        kBasicCompStats(CompactionReason::kUniversalSizeAmplification, 1),
-        kBasicPerKeyPlacementCompStats(
-            CompactionReason::kUniversalSizeAmplification, 1),
-        kBasicFlushStats(CompactionReason::kFlush, 1) {
-    kBasicCompStats.micros = kHasValue;
-    kBasicCompStats.cpu_micros = kHasValue;
-    kBasicCompStats.bytes_read_non_output_levels = kHasValue;
-    kBasicCompStats.num_input_files_in_non_output_levels = kHasValue;
-    kBasicCompStats.num_input_records = kHasValue;
-    kBasicCompStats.num_dropped_records = kHasValue;
-
-    kBasicPerLevelStats.num_output_records = kHasValue;
-    kBasicPerLevelStats.bytes_written = kHasValue;
-    kBasicPerLevelStats.num_output_files = kHasValue;
-
-    kBasicPerKeyPlacementCompStats.micros = kHasValue;
-    kBasicPerKeyPlacementCompStats.cpu_micros = kHasValue;
-    kBasicPerKeyPlacementCompStats.Add(kBasicPerLevelStats);
-
-    kBasicFlushStats.micros = kHasValue;
-    kBasicFlushStats.cpu_micros = kHasValue;
-    kBasicFlushStats.bytes_written = kHasValue;
-    kBasicFlushStats.num_output_files = kHasValue;
-  }
+      : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true) {}
 
  protected:
-  static constexpr uint8_t kHasValue = 1;
-
-  InternalStats::CompactionStats kBasicCompStats;
-  InternalStats::CompactionStats kBasicPerKeyPlacementCompStats;
-  InternalStats::CompactionOutputsStats kBasicPerLevelStats;
-  InternalStats::CompactionStats kBasicFlushStats;
-
   std::atomic_bool enable_per_key_placement = true;
 
+  CompactionJobStats job_stats;
+
   void SetUp() override {
     SyncPoint::GetInstance()->SetCallBack(
         "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
@@ -108,21 +79,36 @@ class TieredCompactionTest : public DBTestBase {
 
   // Verify the compaction stats, the stats are roughly compared
   void VerifyCompactionStats(
-      const std::vector<InternalStats::CompactionStats>& expect_stats,
-      const InternalStats::CompactionStats& expect_pl_stats) {
+      const std::vector<InternalStats::CompactionStats>& expected_stats,
+      const InternalStats::CompactionStats& expected_pl_stats,
+      size_t output_level, uint64_t num_input_range_del = 0) {
     const std::vector<InternalStats::CompactionStats>& stats =
         GetCompactionStats();
-    const size_t kLevels = expect_stats.size();
+    const size_t kLevels = expected_stats.size();
     ASSERT_EQ(kLevels, stats.size());
+    ASSERT_TRUE(output_level < kLevels);
 
-    for (auto it = stats.begin(), expect = expect_stats.begin();
-         it != stats.end(); it++, expect++) {
-      VerifyCompactionStats(*it, *expect);
+    for (size_t level = 0; level < kLevels; level++) {
+      VerifyCompactionStats(stats[level], expected_stats[level]);
     }
 
     const InternalStats::CompactionStats& pl_stats =
         GetPerKeyPlacementCompactionStats();
-    VerifyCompactionStats(pl_stats, expect_pl_stats);
+    VerifyCompactionStats(pl_stats, expected_pl_stats);
+
+    const auto& output_level_stats = stats[output_level];
+    CompactionJobStats expected_job_stats;
+    expected_job_stats.cpu_micros = output_level_stats.cpu_micros;
+    expected_job_stats.num_input_files =
+        output_level_stats.num_input_files_in_output_level +
+        output_level_stats.num_input_files_in_non_output_levels;
+    expected_job_stats.num_input_records =
+        output_level_stats.num_input_records - num_input_range_del;
+    expected_job_stats.num_output_files =
+        output_level_stats.num_output_files + pl_stats.num_output_files;
+    expected_job_stats.num_output_records =
+        output_level_stats.num_output_records + pl_stats.num_output_records;
+    VerifyCompactionJobStats(job_stats, expected_job_stats);
   }
 
   void ResetAllStats(std::vector<InternalStats::CompactionStats>& stats,
@@ -139,42 +125,52 @@ class TieredCompactionTest : public DBTestBase {
   }
 
  private:
-  void CompareStats(uint64_t val, uint64_t expect) {
-    if (expect > 0) {
-      ASSERT_TRUE(val > 0);
-    } else {
-      ASSERT_EQ(val, 0);
-    }
-  }
-
   void VerifyCompactionStats(
       const InternalStats::CompactionStats& stats,
       const InternalStats::CompactionStats& expect_stats) {
-    CompareStats(stats.micros, expect_stats.micros);
-    CompareStats(stats.cpu_micros, expect_stats.cpu_micros);
-    CompareStats(stats.bytes_read_non_output_levels,
-                 expect_stats.bytes_read_non_output_levels);
-    CompareStats(stats.bytes_read_output_level,
-                 expect_stats.bytes_read_output_level);
-    CompareStats(stats.bytes_read_blob, expect_stats.bytes_read_blob);
-    CompareStats(stats.bytes_written, expect_stats.bytes_written);
-    CompareStats(stats.bytes_moved, expect_stats.bytes_moved);
-    CompareStats(stats.num_input_files_in_non_output_levels,
-                 expect_stats.num_input_files_in_non_output_levels);
-    CompareStats(stats.num_input_files_in_output_level,
-                 expect_stats.num_input_files_in_output_level);
-    CompareStats(stats.num_output_files, expect_stats.num_output_files);
-    CompareStats(stats.num_output_files_blob,
-                 expect_stats.num_output_files_blob);
-    CompareStats(stats.num_input_records, expect_stats.num_input_records);
-    CompareStats(stats.num_dropped_records, expect_stats.num_dropped_records);
-    CompareStats(stats.num_output_records, expect_stats.num_output_records);
+    ASSERT_EQ(stats.micros > 0, expect_stats.micros > 0);
+    ASSERT_EQ(stats.cpu_micros > 0, expect_stats.cpu_micros > 0);
+
+    // Hard to get consistent byte sizes of SST files.
+    // Use ASSERT_NEAR for comparison
+    ASSERT_NEAR(stats.bytes_read_non_output_levels * 1.0f,
+                expect_stats.bytes_read_non_output_levels * 1.0f,
+                stats.bytes_read_non_output_levels * 0.5f);
+    ASSERT_NEAR(stats.bytes_read_output_level * 1.0f,
+                expect_stats.bytes_read_output_level * 1.0f,
+                stats.bytes_read_output_level * 0.5f);
+    ASSERT_NEAR(stats.bytes_read_blob * 1.0f,
+                expect_stats.bytes_read_blob * 1.0f,
+                stats.bytes_read_blob * 0.5f);
+    ASSERT_NEAR(stats.bytes_written * 1.0f, expect_stats.bytes_written * 1.0f,
+                stats.bytes_written * 0.5f);
+
+    ASSERT_EQ(stats.bytes_moved, expect_stats.bytes_moved);
+    ASSERT_EQ(stats.num_input_files_in_non_output_levels,
+              expect_stats.num_input_files_in_non_output_levels);
+    ASSERT_EQ(stats.num_input_files_in_output_level,
+              expect_stats.num_input_files_in_output_level);
+    ASSERT_EQ(stats.num_output_files, expect_stats.num_output_files);
+    ASSERT_EQ(stats.num_output_files_blob, expect_stats.num_output_files_blob);
+    ASSERT_EQ(stats.num_input_records, expect_stats.num_input_records);
+    ASSERT_EQ(stats.num_dropped_records, expect_stats.num_dropped_records);
+    ASSERT_EQ(stats.num_output_records, expect_stats.num_output_records);
+
     ASSERT_EQ(stats.count, expect_stats.count);
     for (int i = 0; i < static_cast<int>(CompactionReason::kNumOfReasons);
          i++) {
       ASSERT_EQ(stats.counts[i], expect_stats.counts[i]);
     }
   }
+
+  void VerifyCompactionJobStats(const CompactionJobStats& stats,
+                                const CompactionJobStats& expected_stats) {
+    ASSERT_EQ(stats.cpu_micros, expected_stats.cpu_micros);
+    ASSERT_EQ(stats.num_input_files, expected_stats.num_input_files);
+    ASSERT_EQ(stats.num_input_records, expected_stats.num_input_records);
+    ASSERT_EQ(job_stats.num_output_files, expected_stats.num_output_files);
+    ASSERT_EQ(job_stats.num_output_records, expected_stats.num_output_records);
+  }
 };
 
 TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
@@ -199,19 +195,39 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
       [&](void* arg) {
         *static_cast<SequenceNumber*>(arg) = latest_cold_seq.load();
       });
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Install:AfterUpdateCompactionJobStats", [&](void* arg) {
+        job_stats.Reset();
+        job_stats.Add(*(static_cast<CompactionJobStats*>(arg)));
+      });
   SyncPoint::GetInstance()->EnableProcessing();
 
   std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
-  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
   InternalStats::CompactionStats expect_pl_stats;
 
+  // Put keys in the following way to create overlaps
+  // First file from 0 ~ 99
+  // Second file from 10 ~ 109
+  // ...
+  size_t bytes_per_file = 1952;
+  uint64_t total_input_key_count = kNumTrigger * kNumKeys;
+  uint64_t total_output_key_count = 130;  // 0 ~ 129
+
   for (int i = 0; i < kNumTrigger; i++) {
     for (int j = 0; j < kNumKeys; j++) {
       ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
     }
     ASSERT_OK(Flush());
+
     seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
-    expect_stats[0].Add(kBasicFlushStats);
+    InternalStats::CompactionStats flush_stats(CompactionReason::kFlush, 1);
+    flush_stats.cpu_micros = 1;
+    flush_stats.micros = 1;
+    flush_stats.bytes_written = bytes_per_file;
+    flush_stats.num_output_files = 1;
+    flush_stats.num_input_records = kNumKeys;
+    flush_stats.num_output_records = kNumKeys;
+    expect_stats[0].Add(flush_stats);
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
@@ -221,32 +237,97 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
 
-  // basic compaction stats are still counted to the last level
-  expect_stats[kLastLevel].Add(kBasicCompStats);
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  uint64_t bytes_written_penultimate_level =
+      GetPerKeyPlacementCompactionStats().bytes_written;
 
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  // TODO - Use designated initializer when c++20 support is required
+  {
+    InternalStats::CompactionStats last_level_compaction_stats(
+        CompactionReason::kUniversalSizeAmplification, 1);
+    last_level_compaction_stats.cpu_micros = 1;
+    last_level_compaction_stats.micros = 1;
+    last_level_compaction_stats.bytes_written = 0;
+    last_level_compaction_stats.bytes_read_non_output_levels =
+        bytes_per_file * kNumTrigger;
+    last_level_compaction_stats.num_input_files_in_non_output_levels =
+        kNumTrigger;
+    last_level_compaction_stats.num_input_records = total_input_key_count;
+    last_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    last_level_compaction_stats.num_output_records = 0;
+    last_level_compaction_stats.num_output_files = 0;
+    expect_stats[kLastLevel].Add(last_level_compaction_stats);
+  }
+  {
+    InternalStats::CompactionStats penultimate_level_compaction_stats(
+        CompactionReason::kUniversalSizeAmplification, 1);
+    penultimate_level_compaction_stats.cpu_micros = 1;
+    penultimate_level_compaction_stats.micros = 1;
+    penultimate_level_compaction_stats.bytes_written =
+        bytes_written_penultimate_level;
+    penultimate_level_compaction_stats.num_output_files = 1;
+    penultimate_level_compaction_stats.num_output_records =
+        total_output_key_count;
+    expect_pl_stats.Add(penultimate_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel);
 
   ResetAllStats(expect_stats, expect_pl_stats);
 
   // move forward the cold_seq to split the file into 2 levels, so should have
-  // both the last level stats and the output_to_penultimate_level stats
+  // both the last level stats and the penultimate level stats
   latest_cold_seq = seq_history[0];
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
 
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  last_stats.Add(kBasicPerLevelStats);
-  last_stats.num_dropped_records = 0;
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
-  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  // Now update the input count to be the total count from the previous
+  total_input_key_count = total_output_key_count;
+  uint64_t moved_to_last_level_key_count = 10;
 
-  // delete all cold data, so all data will be on penultimate level
+  // bytes read in non output = bytes written in penultimate level from previous
+  uint64_t bytes_read_in_non_output_level = bytes_written_penultimate_level;
+  uint64_t bytes_written_output_level =
+      GetCompactionStats()[kLastLevel].bytes_written;
+
+  // Now get the new bytes written in penultimate level
+  bytes_written_penultimate_level =
+      GetPerKeyPlacementCompactionStats().bytes_written;
+  {
+    InternalStats::CompactionStats last_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    last_level_compaction_stats.cpu_micros = 1;
+    last_level_compaction_stats.micros = 1;
+    last_level_compaction_stats.bytes_written = bytes_written_output_level;
+    last_level_compaction_stats.bytes_read_non_output_levels =
+        bytes_read_in_non_output_level;
+    last_level_compaction_stats.num_input_files_in_non_output_levels = 1;
+    last_level_compaction_stats.num_input_records = total_input_key_count;
+    last_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    last_level_compaction_stats.num_output_records =
+        moved_to_last_level_key_count;
+    last_level_compaction_stats.num_output_files = 1;
+    expect_stats[kLastLevel].Add(last_level_compaction_stats);
+  }
+  {
+    InternalStats::CompactionStats penultimate_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    penultimate_level_compaction_stats.cpu_micros = 1;
+    penultimate_level_compaction_stats.micros = 1;
+    penultimate_level_compaction_stats.bytes_written =
+        bytes_written_penultimate_level;
+    penultimate_level_compaction_stats.num_output_files = 1;
+    penultimate_level_compaction_stats.num_output_records =
+        total_output_key_count - moved_to_last_level_key_count;
+    expect_pl_stats.Add(penultimate_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel);
+
+  // delete all cold data, so all data will be on proximal level
   for (int i = 0; i < 10; i++) {
     ASSERT_OK(Delete(Key(i)));
   }
@@ -255,17 +336,54 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   ResetAllStats(expect_stats, expect_pl_stats);
 
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  last_stats.bytes_read_output_level = kHasValue;
-  last_stats.num_input_files_in_output_level = kHasValue;
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
-  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  // 10 tombstones added
+  total_input_key_count = total_input_key_count + 10;
+  total_output_key_count = total_output_key_count - 10;
+
+  auto last_level_stats = GetCompactionStats()[kLastLevel];
+  bytes_written_penultimate_level =
+      GetPerKeyPlacementCompactionStats().bytes_written;
+
+  ASSERT_LT(bytes_written_penultimate_level,
+            last_level_stats.bytes_read_non_output_levels +
+                last_level_stats.bytes_read_output_level);
+  {
+    InternalStats::CompactionStats last_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    last_level_compaction_stats.cpu_micros = 1;
+    last_level_compaction_stats.micros = 1;
+    last_level_compaction_stats.bytes_written = 0;
+    last_level_compaction_stats.bytes_read_non_output_levels =
+        last_level_stats.bytes_read_non_output_levels;
+    last_level_compaction_stats.bytes_read_output_level =
+        last_level_stats.bytes_read_output_level;
+    last_level_compaction_stats.num_input_files_in_non_output_levels = 2;
+    last_level_compaction_stats.num_input_files_in_output_level = 1;
+    last_level_compaction_stats.num_input_records = total_input_key_count;
+    last_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    last_level_compaction_stats.num_output_records = 0;
+    last_level_compaction_stats.num_output_files = 0;
+    expect_stats[kLastLevel].Add(last_level_compaction_stats);
+  }
+  {
+    InternalStats::CompactionStats penultimate_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    penultimate_level_compaction_stats.cpu_micros = 1;
+    penultimate_level_compaction_stats.micros = 1;
+    penultimate_level_compaction_stats.bytes_written =
+        bytes_written_penultimate_level;
+    penultimate_level_compaction_stats.num_output_files = 1;
+    penultimate_level_compaction_stats.num_output_records =
+        total_output_key_count;
+    expect_pl_stats.Add(penultimate_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel);
 
   // move forward the cold_seq again with range delete, take a snapshot to keep
   // the range dels in both cold and hot SSTs
@@ -275,6 +393,7 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   ASSERT_OK(
       db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
   ASSERT_OK(Flush());
+  uint64_t num_input_range_del = 1;
 
   ResetAllStats(expect_stats, expect_pl_stats);
 
@@ -283,12 +402,49 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.Add(kBasicPerLevelStats);
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
-  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  // Previous output + one delete range
+  total_input_key_count = total_output_key_count + num_input_range_del;
+  moved_to_last_level_key_count = 20;
+
+  last_level_stats = GetCompactionStats()[kLastLevel];
+  bytes_written_penultimate_level =
+      GetPerKeyPlacementCompactionStats().bytes_written;
+  // Expected to write more in last level
+  ASSERT_GT(bytes_written_penultimate_level, last_level_stats.bytes_written);
+  {
+    InternalStats::CompactionStats last_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    last_level_compaction_stats.cpu_micros = 1;
+    last_level_compaction_stats.micros = 1;
+    last_level_compaction_stats.bytes_written = last_level_stats.bytes_written;
+    last_level_compaction_stats.bytes_read_non_output_levels =
+        last_level_stats.bytes_read_non_output_levels;
+    last_level_compaction_stats.bytes_read_output_level = 0;
+    last_level_compaction_stats.num_input_files_in_non_output_levels = 2;
+    last_level_compaction_stats.num_input_files_in_output_level = 0;
+    last_level_compaction_stats.num_input_records = total_input_key_count;
+    last_level_compaction_stats.num_dropped_records =
+        num_input_range_del;  // delete range tombstone
+    last_level_compaction_stats.num_output_records =
+        moved_to_last_level_key_count;
+    last_level_compaction_stats.num_output_files = 1;
+    expect_stats[kLastLevel].Add(last_level_compaction_stats);
+  }
+  {
+    InternalStats::CompactionStats penultimate_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    penultimate_level_compaction_stats.cpu_micros = 1;
+    penultimate_level_compaction_stats.micros = 1;
+    penultimate_level_compaction_stats.bytes_written =
+        bytes_written_penultimate_level;
+    penultimate_level_compaction_stats.num_output_files = 1;
+    penultimate_level_compaction_stats.num_output_records =
+        total_input_key_count - moved_to_last_level_key_count -
+        num_input_range_del;
+    expect_pl_stats.Add(penultimate_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel,
+                        num_input_range_del);
 
   // verify data
   std::string value;
@@ -341,11 +497,11 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
 // This test was essentially for a hacked-up version on future functionality.
 // It can be resurrected if/when a form of range-based tiering is properly
 // implemented.
+// TODO - Add stats verification when adding this test back
 TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
   const int kNumTrigger = 4;
   const int kNumLevels = 7;
   const int kNumKeys = 100;
-  const int kLastLevel = kNumLevels - 1;
 
   auto options = CurrentOptions();
   options.compaction_style = kCompactionStyleUniversal;
@@ -364,14 +520,13 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
       "CompactionIterator::PrepareOutput.context", [&](void* arg) {
         auto context = static_cast<PerKeyPlacementContext*>(arg);
         MutexLock l(&mutex);
-        context->output_to_penultimate_level =
+        context->output_to_proximal_level =
             cmp->Compare(context->key, hot_start) >= 0 &&
             cmp->Compare(context->key, hot_end) < 0;
       });
   SyncPoint::GetInstance()->EnableProcessing();
 
   std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
-  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
   InternalStats::CompactionStats expect_pl_stats;
 
   for (int i = 0; i < kNumTrigger; i++) {
@@ -379,21 +534,15 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
       ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
     }
     ASSERT_OK(Flush());
-    expect_stats[0].Add(kBasicFlushStats);
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.Add(kBasicPerLevelStats);
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
-
   ResetAllStats(expect_stats, expect_pl_stats);
 
-  // change to all cold, no output_to_penultimate_level output
+  // change to all cold, no output_to_proximal_level output
   {
     MutexLock l(&mutex);
     hot_start = Key(100);
@@ -404,14 +553,6 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
   ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  last_stats.Add(kBasicPerLevelStats);
-  last_stats.num_dropped_records = 0;
-  last_stats.bytes_read_output_level = kHasValue;
-  last_stats.num_input_files_in_output_level = kHasValue;
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
-
   // change to all hot, universal compaction support moving data to up level if
   // it's within compaction level range.
   {
@@ -421,7 +562,7 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
   }
 
   // No data is moved from cold tier to hot tier because no input files from L5
-  // or higher, it's not safe to move data to output_to_penultimate_level level.
+  // or higher, it's not safe to move data to output_to_proximal_level level.
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
 
@@ -567,7 +708,7 @@ TEST_F(TieredCompactionTest, LevelColdRangeDelete) {
 
   // 20->30 will be marked as cold data, but it cannot be placed to cold tier
   // (bottommost) otherwise, it will be "deleted" by the range del in
-  // output_to_penultimate_level level verify that these data will be able to
+  // output_to_proximal_level level verify that these data will be able to
   // queried
   for (int i = 20; i < 30; i++) {
     ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
@@ -677,17 +818,17 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
   std::vector<std::vector<FileMetaData>> level_to_files;
   dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
                                   &level_to_files);
-  // range tombstone is in the penultimate level
-  const int penultimate_level = kNumLevels - 2;
-  ASSERT_EQ(level_to_files[penultimate_level].size(), 1);
-  ASSERT_EQ(level_to_files[penultimate_level][0].num_entries, 1);
-  ASSERT_EQ(level_to_files[penultimate_level][0].num_deletions, 1);
-  ASSERT_EQ(level_to_files[penultimate_level][0].temperature,
+  // range tombstone is in the proximal level
+  const int proximal_level = kNumLevels - 2;
+  ASSERT_EQ(level_to_files[proximal_level].size(), 1);
+  ASSERT_EQ(level_to_files[proximal_level][0].num_entries, 1);
+  ASSERT_EQ(level_to_files[proximal_level][0].num_deletions, 1);
+  ASSERT_EQ(level_to_files[proximal_level][0].temperature,
             Temperature::kUnknown);
 
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
   ASSERT_EQ("0,1,10",
-            FilesPerLevel());  // one file is at the penultimate level which
+            FilesPerLevel());  // one file is at the proximal level which
                                // only contains a range delete
 
   // Add 2 hot keys, each is a new SST, they will be placed in the same level as
@@ -701,7 +842,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
 
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
   ASSERT_EQ("0,2,10",
-            FilesPerLevel());  // one file is at the penultimate level
+            FilesPerLevel());  // one file is at the proximal level
                                // which only contains a range delete
   std::vector<LiveFileMetaData> live_file_meta;
   db_->GetLiveFilesMetaData(&live_file_meta);
@@ -711,7 +852,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
     if (meta.num_deletions > 0) {
       // found SST with del, which has 2 entries, one for data one for range del
       ASSERT_EQ(meta.level,
-                kNumLevels - 2);  // output to penultimate level
+                kNumLevels - 2);  // output to proximal level
       ASSERT_EQ(meta.num_entries, 2);
       ASSERT_EQ(meta.num_deletions, 1);
       found_sst_with_del = true;
@@ -722,7 +863,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
 
   // release the first snapshot and compact, which should compact the range del
   // but new inserted key `0` and `6` are still hot data which will be placed on
-  // the penultimate level
+  // the proximal level
   db_->ReleaseSnapshot(snap);
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
   ASSERT_EQ("0,2,7", FilesPerLevel());
@@ -738,7 +879,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
   ASSERT_FALSE(found_sst_with_del);
 
   // Now make all data cold, key 0 will be moved to the last level, but key 6 is
-  // still in snap2, so it will be kept at the penultimate level
+  // still in snap2, so it will be kept at the proximal level
   latest_cold_seq = dbfull()->GetLatestSequenceNumber();
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
   ASSERT_EQ("0,1,8", FilesPerLevel());
@@ -783,7 +924,7 @@ TEST_F(TieredCompactionTest, UniversalRangeDelete) {
   }
   ASSERT_OK(Flush());
 
-  // compact to the penultimate level with 10 files
+  // compact to the proximal level with 10 files
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
@@ -810,7 +951,7 @@ TEST_F(TieredCompactionTest, UniversalRangeDelete) {
 
   ASSERT_EQ("0,0,0,0,0,0,8", FilesPerLevel());
 
-  // range del with snapshot should be preserved in the penultimate level
+  // range del with snapshot should be preserved in the proximal level
   auto snap = db_->GetSnapshot();
 
   start = Key(6);
@@ -841,7 +982,7 @@ TEST_F(TieredCompactionTest, UniversalRangeDelete) {
     if (meta.num_deletions > 0) {
       // found SST with del, which has 2 entries, one for data one for range del
       ASSERT_EQ(meta.level,
-                kNumLevels - 2);  // output_to_penultimate_level level
+                kNumLevels - 2);  // output_to_proximal_level level
       ASSERT_EQ(meta.num_entries, 2);
       ASSERT_EQ(meta.num_deletions, 1);
       found_sst_with_del = true;
@@ -890,6 +1031,8 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
   const int kNumKeys = 100;
   const int kLastLevel = kNumLevels - 1;
 
+  int output_level = 0;
+
   auto options = CurrentOptions();
   SetColdTemperature(options);
   options.level0_file_num_compaction_trigger = kNumTrigger;
@@ -906,18 +1049,42 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
       [&](void* arg) {
         *static_cast<SequenceNumber*>(arg) = latest_cold_seq.load();
       });
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Install:AfterUpdateCompactionJobStats", [&](void* arg) {
+        job_stats.Reset();
+        job_stats.Add(*(static_cast<CompactionJobStats*>(arg)));
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
+        auto compaction = static_cast<Compaction*>(arg);
+        output_level = compaction->output_level();
+      });
   SyncPoint::GetInstance()->EnableProcessing();
 
   std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
-  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
   InternalStats::CompactionStats expect_pl_stats;
 
+  // Put keys in the following way to create overlaps
+  // First file from 0 ~ 99
+  // Second file from 10 ~ 109
+  // ...
+  size_t bytes_per_file = 1952;
+  uint64_t total_input_key_count = kNumTrigger * kNumKeys;
+  uint64_t total_output_key_count = 130;  // 0 ~ 129
+
   for (int i = 0; i < kNumTrigger; i++) {
     for (int j = 0; j < kNumKeys; j++) {
       ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
     }
     ASSERT_OK(Flush());
-    expect_stats[0].Add(kBasicFlushStats);
+    InternalStats::CompactionStats flush_stats(CompactionReason::kFlush, 1);
+    flush_stats.cpu_micros = 1;
+    flush_stats.micros = 1;
+    flush_stats.bytes_written = bytes_per_file;
+    flush_stats.num_output_files = 1;
+    flush_stats.num_input_records = kNumKeys;
+    flush_stats.num_output_records = kNumKeys;
+    expect_stats[0].Add(flush_stats);
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
@@ -926,10 +1093,30 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
 
-  expect_stats[1].Add(kBasicCompStats);
-  expect_stats[1].Add(kBasicPerLevelStats);
-  expect_stats[1].ResetCompactionReason(CompactionReason::kLevelL0FilesNum);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  uint64_t bytes_written_output_level =
+      GetCompactionStats()[output_level].bytes_written;
+  ASSERT_GT(bytes_written_output_level, 0);
+
+  {
+    InternalStats::CompactionStats output_level_compaction_stats(
+        CompactionReason::kLevelL0FilesNum, 1);
+    output_level_compaction_stats.cpu_micros = 1;
+    output_level_compaction_stats.micros = 1;
+    output_level_compaction_stats.bytes_written = bytes_written_output_level;
+    output_level_compaction_stats.bytes_read_non_output_levels =
+        bytes_per_file * kNumTrigger;
+    output_level_compaction_stats.bytes_read_output_level = 0;
+    output_level_compaction_stats.num_input_files_in_non_output_levels =
+        kNumTrigger;
+    output_level_compaction_stats.num_input_files_in_output_level = 0;
+    output_level_compaction_stats.num_input_records = total_input_key_count;
+    output_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    output_level_compaction_stats.num_output_records = total_output_key_count;
+    output_level_compaction_stats.num_output_files = 1;
+    expect_stats[output_level].Add(output_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, output_level);
 
   // move all data to the last level
   MoveFilesToLevel(kLastLevel);
@@ -944,15 +1131,26 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
   ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.Add(kBasicPerLevelStats);
-  last_stats.num_dropped_records = 0;
-  last_stats.bytes_read_non_output_levels = 0;
-  last_stats.num_input_files_in_non_output_levels = 0;
-  last_stats.bytes_read_output_level = kHasValue;
-  last_stats.num_input_files_in_output_level = kHasValue;
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  total_input_key_count = total_output_key_count;
+  {
+    InternalStats::CompactionStats output_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    output_level_compaction_stats.cpu_micros = 1;
+    output_level_compaction_stats.micros = 1;
+    output_level_compaction_stats.bytes_written = bytes_written_output_level;
+    output_level_compaction_stats.bytes_read_non_output_levels = 0;
+    output_level_compaction_stats.bytes_read_output_level =
+        bytes_written_output_level;
+    output_level_compaction_stats.num_input_files_in_non_output_levels = 0;
+    output_level_compaction_stats.num_input_files_in_output_level = 1;
+    output_level_compaction_stats.num_input_records = total_input_key_count;
+    output_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    output_level_compaction_stats.num_output_records = total_output_key_count;
+    output_level_compaction_stats.num_output_files = 1;
+    expect_stats[output_level].Add(output_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, output_level);
 
   // Add new data, which is all hot and overriding all existing data
   latest_cold_seq = dbfull()->GetLatestSequenceNumber();
@@ -976,17 +1174,47 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
 
+  uint64_t bytes_written_in_proximal_level =
+      GetPerKeyPlacementCompactionStats().bytes_written;
   for (int level = 2; level < kNumLevels - 1; level++) {
-    expect_stats[level].bytes_moved = kHasValue;
+    expect_stats[level].bytes_moved = bytes_written_in_proximal_level;
   }
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.bytes_read_output_level = kHasValue;
-  last_stats.num_input_files_in_output_level = kHasValue;
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
-  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  // Another set of 130 keys + from the previous
+  total_input_key_count = total_output_key_count + 130;
+  // Merged into 130
+  total_output_key_count = 130;
+
+  {
+    InternalStats::CompactionStats output_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    output_level_compaction_stats.cpu_micros = 1;
+    output_level_compaction_stats.micros = 1;
+    output_level_compaction_stats.bytes_written = 0;
+    output_level_compaction_stats.bytes_read_non_output_levels =
+        bytes_written_in_proximal_level;
+    output_level_compaction_stats.bytes_read_output_level =
+        bytes_written_output_level;
+    output_level_compaction_stats.num_input_files_in_non_output_levels = 1;
+    output_level_compaction_stats.num_input_files_in_output_level = 1;
+    output_level_compaction_stats.num_input_records = total_input_key_count;
+    output_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    output_level_compaction_stats.num_output_records = 0;
+    output_level_compaction_stats.num_output_files = 0;
+    expect_stats[output_level].Add(output_level_compaction_stats);
+  }
+  {
+    InternalStats::CompactionStats proximal_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    expect_pl_stats.cpu_micros = 1;
+    expect_pl_stats.micros = 1;
+    expect_pl_stats.bytes_written = bytes_written_in_proximal_level;
+    expect_pl_stats.num_output_files = 1;
+    expect_pl_stats.num_output_records = total_output_key_count;
+    expect_pl_stats.Add(proximal_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, output_level);
 
   // move forward the cold_seq, try to split the data into cold and hot, but in
   // this case it's unsafe to split the data
@@ -1138,7 +1366,7 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageLevel) {
       "CompactionIterator::PrepareOutput.context", [&](void* arg) {
         auto context = static_cast<PerKeyPlacementContext*>(arg);
         MutexLock l(&mutex);
-        context->output_to_penultimate_level =
+        context->output_to_proximal_level =
             cmp->Compare(context->key, hot_start) >= 0 &&
             cmp->Compare(context->key, hot_end) < 0;
       });
@@ -1221,10 +1449,10 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageLevel) {
       options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
       1);
 
-  // Tests that we only compact keys up to penultimate level
-  // that are within penultimate level input's internal key range.
-  // UPDATE: this functionality has changed. With penultimate-enabled
-  // compaction, the expanded potential output range in the penultimate
+  // Tests that we only compact keys up to proximal level
+  // that are within proximal level input's internal key range.
+  // UPDATE: this functionality has changed. With proximal-enabled
+  // compaction, the expanded potential output range in the proximal
   // level is reserved so should be safe to use.
   {
     MutexLock l(&mutex);
@@ -1376,7 +1604,7 @@ TEST_P(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) {
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
 
-  // all data is moved up to the penultimate level
+  // all data is moved up to the proximal level
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
@@ -1448,7 +1676,7 @@ TEST_P(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) {
     ASSERT_OK(dbfull()->TEST_WaitForCompact());
   }
 
-  // all data is moved up to the penultimate level
+  // all data is moved up to the proximal level
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
@@ -1489,9 +1717,8 @@ TEST_P(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) {
   ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
 
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
-                              std::numeric_limits<size_t>::max(),
-                              &key_versions));
+  ASSERT_OK(GetAllKeyVersions(
+      db_.get(), {}, {}, std::numeric_limits<size_t>::max(), &key_versions));
 
   // make sure there're more than 300 keys and first 100 keys are having seqno
   // zeroed out, the last 100 key seqno not zeroed out
@@ -1537,7 +1764,10 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) {
   options.env = mock_env_.get();
   options.level0_file_num_compaction_trigger = kNumTrigger;
   options.num_levels = kNumLevels;
-  options.last_level_temperature = Temperature::kCold;
+  // This existing test selected to also check the case of various temperatures
+  // for last_level_temperature, which should not be interesting enough to
+  // exercise across many/all test cases
+  options.last_level_temperature = RandomKnownTemperature();
   DestroyAndReopen(options);
 
   Random rnd(301);
@@ -1564,6 +1794,10 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) {
   ASSERT_FALSE(tp_mapping.Empty());
   auto seqs = tp_mapping.TEST_GetInternalMapping();
   ASSERT_FALSE(seqs.empty());
+  ASSERT_GE(GetSstSizeHelper(Temperature::kUnknown), 1);
+  for (auto t : kKnownTemperatures) {
+    ASSERT_EQ(GetSstSizeHelper(t), 0);
+  }
 
   // Wait more than preclude_last_level time, then make sure all the data is
   // compacted to the last level even there's no write (no seqno -> time
@@ -1572,16 +1806,22 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) {
 
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
-  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
-  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  for (auto t : kKnownTemperatures) {
+    if (t == options.last_level_temperature) {
+      ASSERT_GT(GetSstSizeHelper(t), 0);
+    } else {
+      ASSERT_EQ(GetSstSizeHelper(t), 0);
+    }
+  }
 
   Close();
 }
 
 TEST_P(PrecludeLastLevelTest, CheckInternalKeyRange) {
-  // When compacting keys from the last level to penultimate level,
-  // output to penultimate level should be within internal key range
-  // of input files from penultimate level.
+  // When compacting keys from the last level to proximal level,
+  // output to proximal level should be within internal key range
+  // of input files from proximal level.
   // Set up:
   // L5:
   //  File 1: DeleteRange[1, 3)@4, File 2: [3@5, 100@6]
@@ -1719,8 +1959,8 @@ TEST_P(PrecludeWithCompactStyleTest, RangeTombstoneSnapshotMigrateFromLast) {
 
   ApplyConfigChange(&options, {{"preclude_last_level_data_seconds", "10000"}});
 
-  // To exercise the WithinPenultimateLevelOutputRange feature, we want files
-  // around the middle file to be compacted on the penultimate level
+  // To exercise the WithinProximalLevelOutputRange feature, we want files
+  // around the middle file to be compacted on the proximal level
   ASSERT_OK(Put(Key(0), "val0"));
   ASSERT_OK(Flush());
   ASSERT_OK(Put(Key(3), "val3"));
@@ -1777,9 +2017,9 @@ TEST_P(PrecludeWithCompactStyleTest, RangeTombstoneSnapshotMigrateFromLast) {
   EXPECT_EQ("0,0,0,0,0,3,1", FilesPerLevel());
   VerifyLogicalState(__LINE__);
 
-  // Compact everything, but some data still goes to both penultimate and last
+  // Compact everything, but some data still goes to both proximal and last
   // levels. A full-range compaction should be safe to "migrate" data from the
-  // last level to penultimate (because of preclude setting change).
+  // last level to proximal (because of preclude setting change).
   ASSERT_OK(CompactRange({}, {}, {}));
   EXPECT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
   VerifyLogicalState(__LINE__);
@@ -1898,7 +2138,7 @@ TEST_P(TimedPutPrecludeLastLevelTest, InterleavedTimedPutAndPut) {
   Close();
 }
 
-TEST_P(TimedPutPrecludeLastLevelTest, PreserveTimedPutOnPenultimateLevel) {
+TEST_P(TimedPutPrecludeLastLevelTest, PreserveTimedPutOnProximalLevel) {
   Options options = CurrentOptions();
   options.compaction_style = kCompactionStyleUniversal;
   options.disable_auto_compactions = true;
@@ -1924,14 +2164,14 @@ TEST_P(TimedPutPrecludeLastLevelTest, PreserveTimedPutOnPenultimateLevel) {
   ASSERT_OK(TimedPut(0, Key(2), "v2", kMockStartTime - 1 * 24 * 60 * 60, wo));
   ASSERT_OK(Flush());
 
-  // Should still be in penultimate level.
+  // Should still be in proximal level.
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kHot), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
 
   // Wait one more day and release snapshot. Data's preferred seqno should be
-  // swapped in, but data should still stay in penultimate level. SST file's
+  // swapped in, but data should still stay in proximal level. SST file's
   // seqno to time mapping should continue to cover preferred seqno after
   // compaction.
   db_->ReleaseSnapshot(snap1);
@@ -2079,9 +2319,8 @@ TEST_P(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
 
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
-                              std::numeric_limits<size_t>::max(),
-                              &key_versions));
+  ASSERT_OK(GetAllKeyVersions(
+      db_.get(), {}, {}, std::numeric_limits<size_t>::max(), &key_versions));
 
   // make sure there're more than 300 keys and first 100 keys are having seqno
   // zeroed out, the last 100 key seqno not zeroed out
@@ -2253,13 +2492,13 @@ TEST_P(PrecludeLastLevelOptionalTest, LastLevelOnlyCompactionNoPreclude) {
   Close();
 }
 
-TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) {
+TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToProximalLevel) {
   // Test the last level only periodic compaction should also be blocked by an
-  // ongoing compaction in penultimate level if tiered compaction is enabled
+  // ongoing compaction in proximal level if tiered compaction is enabled
   // otherwise, the periodic compaction should just run for the last level.
   const int kNumTrigger = 4;
   const int kNumLevels = 7;
-  const int kPenultimateLevel = kNumLevels - 2;
+  const int kProximalLevel = kNumLevels - 2;
   const int kKeyPerSec = 1;
   const int kNumKeys = 100;
 
@@ -2301,13 +2540,13 @@ TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) {
   SyncPoint::GetInstance()->SetCallBack(
       "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
         auto compaction = static_cast<Compaction*>(arg);
-        if (compaction->output_level() == kPenultimateLevel) {
+        if (compaction->output_level() == kProximalLevel) {
           is_size_ratio_compaction_running = true;
           TEST_SYNC_POINT(
-              "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+              "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
               "SizeRatioCompaction1");
           TEST_SYNC_POINT(
-              "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+              "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
               "SizeRatioCompaction2");
           is_size_ratio_compaction_running = false;
         }
@@ -2329,17 +2568,17 @@ TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) {
           verified_last_level_compaction = true;
         }
         TEST_SYNC_POINT(
-            "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+            "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
             "AutoCompactionPicked");
       });
 
   SyncPoint::GetInstance()->LoadDependency({
-      {"PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+      {"PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
        "SizeRatioCompaction1",
-       "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:DoneWrite"},
-      {"PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+       "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:DoneWrite"},
+      {"PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
        "AutoCompactionPicked",
-       "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+       "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
        "SizeRatioCompaction2"},
   });
 
@@ -2356,11 +2595,11 @@ TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) {
   }
 
   TEST_SYNC_POINT(
-      "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:DoneWrite");
+      "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:DoneWrite");
 
   // wait for periodic compaction time and flush to trigger the periodic
   // compaction, which should be blocked by ongoing compaction in the
-  // penultimate level
+  // proximal level
   mock_clock_->MockSleepForSeconds(10000);
   for (int i = 0; i < 3 * kNumKeys; i++) {
     ASSERT_OK(Put(Key(i), rnd.RandomString(10)));
@@ -2423,7 +2662,7 @@ class ThreeRangesPartitionerFactory : public SstPartitionerFactory {
   }
 };
 
-TEST_P(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) {
+TEST_P(PrecludeLastLevelTest, PartialProximalLevelCompaction) {
   const int kNumTrigger = 4;
   const int kNumLevels = 7;
   const int kKeyPerSec = 10;
@@ -2593,8 +2832,8 @@ TEST_P(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) {
       "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
         auto compaction = static_cast<Compaction*>(arg);
         if (compaction->SupportsPerKeyPlacement()) {
-          ASSERT_EQ(compaction->GetPenultimateOutputRangeType(),
-                    Compaction::PenultimateOutputRangeType::kNonLastRange);
+          ASSERT_EQ(compaction->GetProximalOutputRangeType(),
+                    Compaction::ProximalOutputRangeType::kNonLastRange);
           per_key_comp_num++;
         }
       });
@@ -2650,7 +2889,7 @@ TEST_P(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) {
   ASSERT_EQ(3, per_key_comp_num);
   verify_db();
 
-  // Finish off the penultimate level.
+  // Finish off the proximal level.
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
   ASSERT_EQ("0,0,0,0,0,0,3", FilesPerLevel());
   verify_db();
diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc
index f9c0f47ef7be..fdd042fcd717 100644
--- a/db/comparator_db_test.cc
+++ b/db/comparator_db_test.cc
@@ -258,12 +258,12 @@ class ComparatorDBTest
  private:
   std::string dbname_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
   Options last_options_;
   std::unique_ptr<const Comparator> comparator_guard;
 
  public:
-  ComparatorDBTest() : env_(Env::Default()), db_(nullptr) {
+  ComparatorDBTest() : env_(Env::Default()) {
     kTestComparator = BytewiseComparator();
     dbname_ = test::PerThreadDBPath("comparator_db_test");
     BlockBasedTableOptions toptions;
@@ -274,12 +274,12 @@ class ComparatorDBTest
   }
 
   ~ComparatorDBTest() override {
-    delete db_;
+    db_.reset();
     EXPECT_OK(DestroyDB(dbname_, last_options_));
     kTestComparator = BytewiseComparator();
   }
 
-  DB* GetDB() { return db_; }
+  DB* GetDB() { return db_.get(); }
 
   void SetOwnedComparator(const Comparator* cmp, bool owner = true) {
     if (owner) {
@@ -301,14 +301,12 @@ class ComparatorDBTest
   }
 
   void Destroy() {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, last_options_));
   }
 
   Status TryReopen() {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     last_options_.create_if_missing = true;
 
     return DB::Open(last_options_, dbname_, &db_);
@@ -318,7 +316,7 @@ class ComparatorDBTest
 INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest,
                         testing::Values(test::kDefaultFormatVersion));
 INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest,
-                        testing::Values(kLatestFormatVersion));
+                        testing::Values(kLatestBbtFormatVersion));
 
 TEST_P(ComparatorDBTest, Bytewise) {
   for (int rand_seed = 301; rand_seed < 306; rand_seed++) {
diff --git a/db/convenience.cc b/db/convenience.cc
index 47ce59f2f8d1..5560cffe5fda 100644
--- a/db/convenience.cc
+++ b/db/convenience.cc
@@ -26,6 +26,17 @@ Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
 
 Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
                            const RangePtr* ranges, size_t n, bool include_end) {
+  std::vector<RangeOpt> range_opts(n);
+  for (size_t i = 0; i < n; ++i) {
+    range_opts[i] = {OptSlice::CopyFromPtr(ranges[i].start),
+                     OptSlice::CopyFromPtr(ranges[i].limit)};
+  }
+  return DeleteFilesInRanges(db, column_family, range_opts.data(), n,
+                             include_end);
+}
+
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+                           const RangeOpt* ranges, size_t n, bool include_end) {
   return (static_cast_with_check<DBImpl>(db->GetRootDB()))
       ->DeleteFilesInRanges(column_family, ranges, n, include_end);
 }
@@ -54,7 +65,7 @@ Status VerifySstFileChecksum(const Options& options,
 }
 
 Status VerifySstFileChecksumInternal(const Options& options,
-                                     const EnvOptions& env_options,
+                                     const FileOptions& file_options,
                                      const ReadOptions& read_options,
                                      const std::string& file_path,
                                      const SequenceNumber& largest_seqno) {
@@ -63,8 +74,8 @@ Status VerifySstFileChecksumInternal(const Options& options,
   InternalKeyComparator internal_comparator(options.comparator);
   ImmutableOptions ioptions(options);
 
-  Status s = ioptions.fs->NewRandomAccessFile(
-      file_path, FileOptions(env_options), &file, nullptr);
+  Status s =
+      ioptions.fs->NewRandomAccessFile(file_path, file_options, &file, nullptr);
   if (s.ok()) {
     s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
   } else {
@@ -82,9 +93,10 @@ Status VerifySstFileChecksumInternal(const Options& options,
           nullptr /* file_read_hist */, ioptions.rate_limiter.get()));
   const bool kImmortal = true;
   auto reader_options = TableReaderOptions(
-      ioptions, options.prefix_extractor, env_options, internal_comparator,
-      options.block_protection_bytes_per_key, false /* skip_filters */,
-      !kImmortal, false /* force_direct_prefetch */, -1 /* level */);
+      ioptions, options.prefix_extractor, options.compression_manager.get(),
+      file_options, internal_comparator, options.block_protection_bytes_per_key,
+      false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */,
+      -1 /* level */);
   reader_options.largest_seqno = largest_seqno;
   s = options.table_factory->NewTableReader(
       read_options, reader_options, std::move(file_reader), file_size,
diff --git a/db/convenience_impl.h b/db/convenience_impl.h
index 32f4476bde99..5e8d6d49667c 100644
--- a/db/convenience_impl.h
+++ b/db/convenience_impl.h
@@ -5,10 +5,11 @@
 
 #pragma once
 #include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
 
 namespace ROCKSDB_NAMESPACE {
 Status VerifySstFileChecksumInternal(const Options& options,
-                                     const EnvOptions& env_options,
+                                     const FileOptions& file_options,
                                      const ReadOptions& read_options,
                                      const std::string& file_path,
                                      const SequenceNumber& largest_seqno = 0);
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
index e99612c2b8a3..448d2c9d94c0 100644
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -73,7 +73,7 @@ class CorruptionTest : public testing::Test {
   std::string dbname_;
   std::shared_ptr<Cache> tiny_cache_;
   Options options_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 
   CorruptionTest() {
     // If LRU cache shard bit is smaller than 2 (or -1 which will automatically
@@ -105,8 +105,7 @@ class CorruptionTest : public testing::Test {
     SyncPoint::GetInstance()->DisableProcessing();
     SyncPoint::GetInstance()->LoadDependency({});
     SyncPoint::GetInstance()->ClearAllCallBacks();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     if (getenv("KEEP_DB")) {
       fprintf(stdout, "db is still at %s\n", dbname_.c_str());
     } else {
@@ -116,14 +115,12 @@ class CorruptionTest : public testing::Test {
     }
   }
 
-  void CloseDb() {
-    delete db_;
-    db_ = nullptr;
-  }
+  void CloseDb() { db_.reset(); }
+
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   Status TryReopen(Options* options = nullptr) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     Options opt = (options ? *options : options_);
     if (opt.env == Options().env) {
       // If env is not overridden, replace it with ErrorEnv.
@@ -141,8 +138,7 @@ class CorruptionTest : public testing::Test {
   void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
 
   void RepairDB() {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_));
   }
 
@@ -151,8 +147,7 @@ class CorruptionTest : public testing::Test {
     WriteBatch batch;
     for (int i = 0; i < n; i++) {
       if (flush_every != 0 && i != 0 && i % flush_every == 0) {
-        DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-        ASSERT_OK(dbi->TEST_FlushMemTable());
+        ASSERT_OK(dbfull()->TEST_FlushMemTable());
       }
       // if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
       Slice key = Key(i + start, &key_space);
@@ -436,14 +431,14 @@ TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
 
 TEST_F(CorruptionTest, TableFile) {
   Build(100);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
   ASSERT_OK(dbi->TEST_FlushMemTable());
   ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
   ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
 
   Corrupt(kTableFile, 100, 1);
   Check(99, 99);
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 }
 
 TEST_F(CorruptionTest, VerifyChecksumReadahead) {
@@ -460,14 +455,14 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) {
   Reopen(&options);
 
   Build(10000);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
   ASSERT_OK(dbi->TEST_FlushMemTable());
   ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
   ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
 
   senv.count_random_reads_ = true;
   senv.random_read_counter_.Reset();
-  ASSERT_OK(dbi->VerifyChecksum());
+  ASSERT_OK(db_->VerifyChecksum());
 
   // Make sure the counter is enabled.
   ASSERT_GT(senv.random_read_counter_.Read(), 0);
@@ -480,7 +475,7 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) {
   senv.random_read_bytes_counter_ = 0;
   ReadOptions ro;
   ro.readahead_size = size_t{32 * 1024};
-  ASSERT_OK(dbi->VerifyChecksum(ro));
+  ASSERT_OK(db_->VerifyChecksum(ro));
   // The SST file is about 10MB. We set readahead size to 32KB.
   // Give 0 to 20 reads for metadata blocks, and allow real read
   // to range from 24KB to 48KB. The lower bound would be:
@@ -494,8 +489,7 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) {
   // disabled).
   options.allow_mmap_reads = true;
   Reopen(&options);
-  dbi = static_cast<DBImpl*>(db_);
-  ASSERT_OK(dbi->VerifyChecksum(ro));
+  ASSERT_OK(db_->VerifyChecksum(ro));
 
   CloseDb();
 }
@@ -508,18 +502,16 @@ TEST_F(CorruptionTest, TableFileIndexData) {
   Reopen(&options);
   // build 2 tables, flush at 5000
   Build(10000, 5000);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
 
   // corrupt an index block of an entire file
   Corrupt(kTableFile, -2000, 500);
   options.paranoid_checks = false;
   Reopen(&options);
-  dbi = static_cast_with_check<DBImpl>(db_);
   // one full file may be readable, since only one was corrupted
   // the other file should be fully non-readable, since index was corrupted
   Check(0, 5000, ReadOptions(true, true));
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 
   // In paranoid mode, the db cannot be opened due to the corrupted file.
   ASSERT_TRUE(TryReopen().IsCorruption());
@@ -527,8 +519,7 @@ TEST_F(CorruptionTest, TableFileIndexData) {
 
 TEST_F(CorruptionTest, TableFileFooterMagic) {
   Build(100);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   Check(100, 100);
   // Corrupt the whole footer
   Corrupt(kTableFile, -100, 100);
@@ -543,8 +534,7 @@ TEST_F(CorruptionTest, TableFileFooterMagic) {
 
 TEST_F(CorruptionTest, TableFileFooterNotMagic) {
   Build(100);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   Check(100, 100);
   // Corrupt footer except magic number
   Corrupt(kTableFile, -100, 92);
@@ -556,10 +546,77 @@ TEST_F(CorruptionTest, TableFileFooterNotMagic) {
   ASSERT_TRUE(s.ToString().find(".sst") != std::string::npos);
 }
 
+TEST_F(CorruptionTest, DBOpenWithWrongFileSize) {
+  // Validate that when paranoid flag is true, DB::Open() fails if one of the
+  // file corrupted. Validate that when paranoid flag is false, DB::Open()
+  // succeed if one of the file corrupted, and the healthy file is readable.
+  CloseDb();
+
+  const std::string test_cf_name = "test_cf";
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+  cf_descs.emplace_back(test_cf_name, ColumnFamilyOptions());
+
+  {
+    options_.create_missing_column_families = true;
+    std::vector<ColumnFamilyHandle*> cfhs;
+    ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+    assert(db_ != nullptr);  // suppress false clang-analyze report
+
+    ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k", "v"));
+    ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k1", "v1"));
+    ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k2", "v2"));
+    for (auto* cfh : cfhs) {
+      delete cfh;
+    }
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+    // ********************************************
+    // Corrupt the file by making the file bigger
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    std::string filename = dbname_ + metadata[0].name;
+    const auto& fs = options_.env->GetFileSystem();
+    {
+      std::unique_ptr<FSWritableFile> f;
+      ASSERT_OK(fs->ReopenWritableFile(filename, FileOptions(), &f, nullptr));
+      ASSERT_OK(f->Append("blahblah", IOOptions(), nullptr));
+      ASSERT_OK(f->Close(IOOptions(), nullptr));
+    }
+    CloseDb();
+  }
+
+  // DB failed to open due to one of the file is corrupted, as paranoid flag is
+  // true
+  options_.paranoid_checks = true;
+  std::vector<ColumnFamilyHandle*> cfhs;
+  Status s;
+  s = DB::Open(options_, dbname_, cf_descs, &cfhs, &db_);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos);
+
+  // DB opened successfully, as paranoid flag is false, validate the one that is
+  // healthy is still accessible
+  options_.paranoid_checks = false;
+  ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+  assert(db_ != nullptr);  // suppress false clang-analyze report
+
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), cfhs[1], "k1", &v));
+  ASSERT_EQ(v, "v1");
+
+  // Validate the default column family is corrupted
+  Check(0, 0);
+  s = db_->Get(ReadOptions(), cfhs[0], "k1", &v);
+  ASSERT_TRUE(s.IsCorruption());
+
+  delete cfhs[1];
+  delete cfhs[0];
+}
+
 TEST_F(CorruptionTest, TableFileWrongSize) {
   Build(100);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   Check(100, 100);
 
   // ********************************************
@@ -579,13 +636,16 @@ TEST_F(CorruptionTest, TableFileWrongSize) {
   // DB actually accepts this without paranoid checks, relying on size
   // recorded in manifest to locate the SST footer.
   options_.paranoid_checks = false;
-  options_.skip_checking_sst_file_sizes_on_db_open = false;
   Reopen();
-  Check(100, 100);
+  // As footer could not be extraced, file is completely unreadable
+  Check(0, 0);
+  std::string v;
+  auto s = db_->Get(ReadOptions(), "k1", &v);
+  ASSERT_TRUE(s.IsCorruption());
 
   // But reports the issue with paranoid checks
   options_.paranoid_checks = true;
-  Status s = TryReopen();
+  s = TryReopen();
   ASSERT_TRUE(s.IsCorruption());
   ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos);
 
@@ -639,12 +699,11 @@ TEST_F(CorruptionTest, SequenceNumberRecovery) {
 
 TEST_F(CorruptionTest, CorruptedDescriptor) {
   ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(
-      dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+      db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr));
 
   Corrupt(kDescriptorFile, 0, 1000);
   Status s = TryReopen();
@@ -663,7 +722,7 @@ TEST_F(CorruptionTest, CompactionInputError) {
   options.env = env_.get();
   Reopen(&options);
   Build(10);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
   ASSERT_OK(dbi->TEST_FlushMemTable());
   ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
   ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
@@ -671,12 +730,12 @@ TEST_F(CorruptionTest, CompactionInputError) {
 
   Corrupt(kTableFile, 100, 1);
   Check(9, 9);
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 
   // Force compactions by writing lots of values
   Build(10000);
   Check(10000, 10000);
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 }
 
 TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
@@ -687,14 +746,14 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
   options.write_buffer_size = 131072;
   options.max_write_buffer_number = 2;
   Reopen(&options);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
 
   // Fill levels >= 1
-  for (int level = 1; level < dbi->NumberLevels(); level++) {
-    ASSERT_OK(dbi->Put(WriteOptions(), "", "begin"));
-    ASSERT_OK(dbi->Put(WriteOptions(), "~", "end"));
+  for (int level = 1; level < db_->NumberLevels(); level++) {
+    ASSERT_OK(db_->Put(WriteOptions(), "", "begin"));
+    ASSERT_OK(db_->Put(WriteOptions(), "~", "end"));
     ASSERT_OK(dbi->TEST_FlushMemTable());
-    for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
+    for (int comp_level = 0; comp_level < db_->NumberLevels() - level;
          ++comp_level) {
       ASSERT_OK(dbi->TEST_CompactRange(comp_level, nullptr, nullptr));
     }
@@ -702,7 +761,7 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
 
   Reopen(&options);
 
-  dbi = static_cast_with_check<DBImpl>(db_);
+  dbi = dbfull();
   Build(10);
   ASSERT_OK(dbi->TEST_FlushMemTable());
   ASSERT_OK(dbi->TEST_WaitForCompact());
@@ -710,7 +769,7 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
 
   CorruptTableFileAtLevel(0, 100, 1);
   Check(9, 9);
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 
   // Write must eventually fail because of corrupted table
   Status s;
@@ -729,17 +788,16 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
 
 TEST_F(CorruptionTest, UnrelatedKeys) {
   Build(10);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   Corrupt(kTableFile, 100, 1);
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 
   std::string tmp1, tmp2;
   ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
   std::string v;
   ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
   ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
   ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
 }
@@ -786,14 +844,12 @@ TEST_F(CorruptionTest, FileSystemStateCorrupted) {
     Reopen(&options);
     Build(10);
     ASSERT_OK(db_->Flush(FlushOptions()));
-    DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
     std::vector<LiveFileMetaData> metadata;
-    dbi->GetLiveFilesMetaData(&metadata);
+    db_->GetLiveFilesMetaData(&metadata);
     ASSERT_GT(metadata.size(), 0);
     std::string filename = dbname_ + metadata[0].name;
 
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
 
     if (iter == 0) {  // corrupt file size
       std::unique_ptr<WritableFile> file;
@@ -825,8 +881,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) {
   options.create_if_missing = true;
   Status s;
   for (const auto& mode : corruption_modes) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     s = DestroyDB(dbname_, options);
     ASSERT_OK(s);
     std::shared_ptr<mock::MockTableFactory> mock =
@@ -853,8 +908,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) {
   options.create_if_missing = true;
   Status s;
   for (const auto& mode : corruption_modes) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     s = DestroyDB(dbname_, options);
     ASSERT_OK(s);
     std::shared_ptr<mock::MockTableFactory> mock =
@@ -863,13 +917,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) {
     ASSERT_OK(DB::Open(options, dbname_, &db_));
     assert(db_ != nullptr);  // suppress false clang-analyze report
     Build(100, 2);
-    // ASSERT_OK(db_->Flush(FlushOptions()));
-    DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
     mock->SetCorruptionMode(mode);
     CompactRangeOptions cro;
     cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
-    s = dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr);
+    s = db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr);
     if (mode == mock::MockTableFactory::kCorruptNone) {
       ASSERT_OK(s);
     } else {
@@ -885,8 +937,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) {
   options.paranoid_file_checks = true;
   options.create_if_missing = true;
   for (bool do_flush : {true, false}) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
     ASSERT_OK(DB::Open(options, dbname_, &db_));
     std::string start, end;
@@ -903,12 +954,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) {
     if (do_flush) {
       ASSERT_OK(db_->Flush(FlushOptions()));
     } else {
-      DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-      ASSERT_OK(dbi->TEST_FlushMemTable());
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
       CompactRangeOptions cro;
       cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
       ASSERT_OK(
-          dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+          db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr));
     }
     db_->ReleaseSnapshot(snap);
   }
@@ -921,8 +971,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) {
   options.paranoid_file_checks = true;
   options.create_if_missing = true;
   for (bool do_flush : {true, false}) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
     ASSERT_OK(DB::Open(options, dbname_, &db_));
     assert(db_ != nullptr);  // suppress false clang-analyze report
@@ -942,12 +991,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) {
     if (do_flush) {
       ASSERT_OK(db_->Flush(FlushOptions()));
     } else {
-      DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-      ASSERT_OK(dbi->TEST_FlushMemTable());
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
       CompactRangeOptions cro;
       cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
       ASSERT_OK(
-          dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+          db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr));
     }
     db_->ReleaseSnapshot(snap);
   }
@@ -960,8 +1008,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) {
   options.paranoid_file_checks = true;
   options.create_if_missing = true;
   for (bool do_flush : {true, false}) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
     ASSERT_OK(DB::Open(options, dbname_, &db_));
     assert(db_ != nullptr);  // suppress false clang-analyze report
@@ -978,12 +1025,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) {
     if (do_flush) {
       ASSERT_OK(db_->Flush(FlushOptions()));
     } else {
-      DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-      ASSERT_OK(dbi->TEST_FlushMemTable());
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
       CompactRangeOptions cro;
       cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
       ASSERT_OK(
-          dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+          db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr));
     }
     db_->ReleaseSnapshot(snap);
   }
@@ -996,8 +1042,7 @@ TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) {
   options.create_if_missing = true;
   options.allow_data_in_errors = true;
   auto mode = mock::MockTableFactory::kCorruptKey;
-  delete db_;
-  db_ = nullptr;
+  db_.reset();
   ASSERT_OK(DestroyDB(dbname_, options));
 
   std::shared_ptr<mock::MockTableFactory> mock =
@@ -1009,12 +1054,11 @@ TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) {
   assert(db_ != nullptr);  // suppress false clang-analyze report
   Build(100, 2);
 
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   Status s =
-      dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr);
+      db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr);
   ASSERT_NOK(s);
   ASSERT_TRUE(s.IsCorruption());
 }
@@ -1025,8 +1069,7 @@ TEST_F(CorruptionTest, CompactionKeyOrderCheck) {
   options.env = env_.get();
   options.paranoid_file_checks = false;
   options.create_if_missing = true;
-  delete db_;
-  db_ = nullptr;
+  db_.reset();
   ASSERT_OK(DestroyDB(dbname_, options));
   std::shared_ptr<mock::MockTableFactory> mock =
       std::make_shared<mock::MockTableFactory>();
@@ -1035,14 +1078,13 @@ TEST_F(CorruptionTest, CompactionKeyOrderCheck) {
   assert(db_ != nullptr);  // suppress false clang-analyze report
   mock->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey);
   Build(100, 2);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
 
   mock->SetCorruptionMode(mock::MockTableFactory::kCorruptNone);
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_NOK(
-      dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+      db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr));
 }
 
 TEST_F(CorruptionTest, FlushKeyOrderCheck) {
@@ -1069,7 +1111,7 @@ TEST_F(CorruptionTest, FlushKeyOrderCheck) {
         }
       });
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
-  Status s = static_cast_with_check<DBImpl>(db_)->TEST_FlushMemTable();
+  Status s = dbfull()->TEST_FlushMemTable();
   ASSERT_NOK(s);
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
@@ -1193,7 +1235,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
   // while other don't.
   {
     ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
-    auto* dbimpl = static_cast_with_check<DBImpl>(db_);
+    auto* dbimpl = dbfull();
     assert(dbimpl);
 
     // Write one key to test_cf.
diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc
index 78ae86683318..1ece0e3630ab 100644
--- a/db/cuckoo_table_db_test.cc
+++ b/db/cuckoo_table_db_test.cc
@@ -21,18 +21,18 @@ class CuckooTableDBTest : public testing::Test {
  private:
   std::string dbname_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 
  public:
   CuckooTableDBTest() : env_(Env::Default()) {
     dbname_ = test::PerThreadDBPath("cuckoo_table_db_test");
     EXPECT_OK(DestroyDB(dbname_, Options()));
-    db_ = nullptr;
+    db_.reset();
     Reopen();
   }
 
   ~CuckooTableDBTest() override {
-    delete db_;
+    db_.reset();
     EXPECT_OK(DestroyDB(dbname_, Options()));
   }
 
@@ -47,12 +47,11 @@ class CuckooTableDBTest : public testing::Test {
     return options;
   }
 
-  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   // The following util methods are copied from plain_table_db_test.
   void Reopen(Options* options = nullptr) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     Options opts;
     if (options != nullptr) {
       opts = *options;
@@ -66,8 +65,7 @@ class CuckooTableDBTest : public testing::Test {
   void DestroyAndReopen(Options* options) {
     assert(options);
     ASSERT_OK(db_->Close());
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, *options));
     Reopen(options);
   }
@@ -130,7 +128,7 @@ TEST_F(CuckooTableDBTest, Flush) {
   ASSERT_OK(dbfull()->TEST_FlushMemTable());
 
   TablePropertiesCollection ptc;
-  ASSERT_OK(static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc));
   VerifySstUniqueIds(ptc);
   ASSERT_EQ(1U, ptc.size());
   ASSERT_EQ(3U, ptc.begin()->second->num_entries);
@@ -147,7 +145,7 @@ TEST_F(CuckooTableDBTest, Flush) {
   ASSERT_OK(Put("key6", "v6"));
   ASSERT_OK(dbfull()->TEST_FlushMemTable());
 
-  ASSERT_OK(static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc));
   VerifySstUniqueIds(ptc);
   ASSERT_EQ(2U, ptc.size());
   auto row = ptc.begin();
@@ -165,7 +163,7 @@ TEST_F(CuckooTableDBTest, Flush) {
   ASSERT_OK(Delete("key5"));
   ASSERT_OK(Delete("key4"));
   ASSERT_OK(dbfull()->TEST_FlushMemTable());
-  ASSERT_OK(static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc));
   VerifySstUniqueIds(ptc);
   ASSERT_EQ(3U, ptc.size());
   row = ptc.begin();
@@ -190,7 +188,7 @@ TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) {
   ASSERT_OK(dbfull()->TEST_FlushMemTable());
 
   TablePropertiesCollection ptc;
-  ASSERT_OK(static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc));
   VerifySstUniqueIds(ptc);
   ASSERT_EQ(1U, ptc.size());
   ASSERT_EQ(2U, ptc.begin()->second->num_entries);
diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index edb10693affd..71bf37f197fe 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -91,17 +91,15 @@ class DBBasicTest : public DBTestBase {
 TEST_F(DBBasicTest, OpenWhenOpen) {
   Options options = CurrentOptions();
   options.env = env_;
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   Status s = DB::Open(options, dbname_, &db2);
-  ASSERT_NOK(s) << [db2]() {
-    delete db2;
+  ASSERT_NOK(s) << [&db2]() {
+    db2.reset();
     return "db2 open: ok";
   }();
   ASSERT_EQ(Status::Code::kIOError, s.code());
   ASSERT_EQ(Status::SubCode::kNone, s.subcode());
   ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr);
-
-  delete db2;
 }
 
 TEST_F(DBBasicTest, EnableDirectIOWithZeroBuf) {
@@ -161,6 +159,7 @@ TEST_F(DBBasicTest, UniqueSession) {
 
   ASSERT_EQ(sid2, sid3);
 
+  DestroyAndReopen(options);
   CreateAndReopenWithCF({"goku"}, options);
   ASSERT_OK(db_->GetDbSessionId(sid1));
   ASSERT_OK(Put("bar", "e1"));
@@ -179,6 +178,7 @@ TEST_F(DBBasicTest, UniqueSession) {
 TEST_F(DBBasicTest, ReadOnlyDB) {
   ASSERT_OK(Put("foo", "v1"));
   ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Flush());
   ASSERT_OK(Put("foo", "v3"));
   Close();
 
@@ -208,10 +208,11 @@ TEST_F(DBBasicTest, ReadOnlyDB) {
 
   auto options = CurrentOptions();
   assert(options.env == env_);
-  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(EnforcedReadOnlyReopen(options));
   ASSERT_EQ("v3", Get("foo"));
   ASSERT_EQ("v2", Get("bar"));
   verify_all_iters();
+  ASSERT_EQ(Flush().code(), Status::Code::kNotSupported);
   Close();
 
   // Reopen and flush memtable.
@@ -219,26 +220,75 @@ TEST_F(DBBasicTest, ReadOnlyDB) {
   ASSERT_OK(Flush());
   Close();
   // Now check keys in read only mode.
-  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(EnforcedReadOnlyReopen(options));
   ASSERT_EQ("v3", Get("foo"));
   ASSERT_EQ("v2", Get("bar"));
   verify_all_iters();
-  ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+  ASSERT_EQ(db_->SyncWAL().code(), Status::Code::kNotSupported);
+
+  // More ops that should fail
+  std::vector<ColumnFamilyHandle*> cfhs{{}};
+  ASSERT_EQ(db_->CreateColumnFamily(options, "blah", &cfhs[0]).code(),
+            Status::Code::kNotSupported);
+
+  ASSERT_EQ(db_->CreateColumnFamilies(options, {"blah"}, &cfhs).code(),
+            Status::Code::kNotSupported);
+
+  std::vector<ColumnFamilyDescriptor> cfds;
+  cfds.push_back({"blah", options});
+  ASSERT_EQ(db_->CreateColumnFamilies(cfds, &cfhs).code(),
+            Status::Code::kNotSupported);
 }
 
-// TODO akanksha: Update the test to check that combination
-// does not actually write to FS (use open read-only with
-// CompositeEnvWrapper+ReadOnlyFileSystem).
-TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) {
+TEST_F(DBBasicTest, ReadOnlyDBFlushWAL) {
+  // Test that FlushWAL returns NotSupported on read-only DB, and that
+  // GetLiveFilesStorageInfo works correctly even with manual_wal_flush=true.
+  // This is a regression test for a bug where GetLiveFilesStorageInfo would
+  // crash on read-only DBs with manual_wal_flush=true because FlushWAL
+  // accessed logs_.back() on an empty deque.
+  auto options = CurrentOptions();
+  options.manual_wal_flush = true;
+  DestroyAndReopen(options);
   ASSERT_OK(Put("foo", "v1"));
   ASSERT_OK(Put("bar", "v2"));
-  ASSERT_OK(Put("foo", "v3"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("baz", "v3"));  // Unflushed data in WAL
+  Close();
+
+  // Reopen as read-only
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  ASSERT_EQ("v3", Get("baz"));
+
+  // FlushWAL should return NotSupported (not crash)
+  ASSERT_EQ(db_->FlushWAL(/*sync=*/false).code(), Status::Code::kNotSupported);
+  ASSERT_EQ(db_->FlushWAL(/*sync=*/true).code(), Status::Code::kNotSupported);
+
+  // GetLiveFilesStorageInfo should succeed (previously crashed with
+  // manual_wal_flush=true because it called FlushWAL which accessed
+  // logs_.back() on empty deque)
+  LiveFilesStorageInfoOptions lfsi_opts;
+  lfsi_opts.wal_size_for_flush = 0;
+  std::vector<LiveFileStorageInfo> files;
+  ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files));
+  ASSERT_GT(files.size(), 0);
+
   Close();
+}
 
+TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) {
   auto options = CurrentOptions();
+  options.write_dbid_to_manifest = false;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  Close();
+
   options.write_dbid_to_manifest = true;
   assert(options.env == env_);
-  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(EnforcedReadOnlyReopen(options));
   std::string db_id1;
   ASSERT_OK(db_->GetDbIdentity(db_id1));
   ASSERT_EQ("v3", Get("foo"));
@@ -258,7 +308,7 @@ TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) {
   ASSERT_OK(Flush());
   Close();
   // Now check keys in read only mode.
-  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(EnforcedReadOnlyReopen(options));
   ASSERT_EQ("v3", Get("foo"));
   ASSERT_EQ("v2", Get("bar"));
   ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
@@ -534,14 +584,14 @@ TEST_F(DBBasicTest, GetSnapshot) {
 
 TEST_F(DBBasicTest, CheckLock) {
   do {
-    DB* localdb = nullptr;
+    std::unique_ptr<DB> localdb;
     Options options = CurrentOptions();
     ASSERT_OK(TryReopen(options));
 
     // second open should fail
     Status s = DB::Open(options, dbname_, &localdb);
-    ASSERT_NOK(s) << [localdb]() {
-      delete localdb;
+    ASSERT_NOK(s) << [&localdb]() {
+      localdb.reset();
       return "localdb open: ok";
     }();
 #ifdef OS_LINUX
@@ -660,30 +710,6 @@ TEST_F(DBBasicTest, Flush) {
   } while (ChangeCompactOptions());
 }
 
-TEST_F(DBBasicTest, ManifestRollOver) {
-  do {
-    Options options;
-    options.max_manifest_file_size = 10;  // 10 bytes
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
-    {
-      ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
-      ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
-      ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
-      uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
-      ASSERT_OK(Flush(1));  // This should trigger LogAndApply.
-      uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
-      ASSERT_GT(manifest_after_flush, manifest_before_flush);
-      ReopenWithColumnFamilies({"default", "pikachu"}, options);
-      ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
-      // check if a new manifest file got inserted or not.
-      ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
-      ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
-      ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
-    }
-  } while (ChangeCompactOptions());
-}
-
 TEST_F(DBBasicTest, IdentityAcrossRestarts) {
   constexpr size_t kMinIdSize = 10;
   do {
@@ -834,7 +860,7 @@ TEST_F(DBBasicTest, Snapshot) {
     ASSERT_OK(Put(1, "foo", "1v3"));
 
     {
-      ManagedSnapshot s3(db_);
+      ManagedSnapshot s3(db_.get());
       ASSERT_EQ(3U, GetNumSnapshots());
       ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
       ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
@@ -957,7 +983,7 @@ TEST_F(DBBasicTest, DBOpen_Options) {
   Destroy(options);
 
   // Does not exist, and create_if_missing == false: error
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   options.create_if_missing = false;
   Status s = DB::Open(options, dbname_, &db);
   ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
@@ -969,8 +995,7 @@ TEST_F(DBBasicTest, DBOpen_Options) {
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
 
-  delete db;
-  db = nullptr;
+  db.reset();
 
   // Does exist, and error_if_exists == true: error
   options.create_if_missing = false;
@@ -986,8 +1011,7 @@ TEST_F(DBBasicTest, DBOpen_Options) {
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
 
-  delete db;
-  db = nullptr;
+  db.reset();
 }
 
 TEST_F(DBBasicTest, CompactOnFlush) {
@@ -1292,7 +1316,7 @@ TEST_F(DBBasicTest, DBClose) {
   std::string dbname = test::PerThreadDBPath("db_close_test");
   ASSERT_OK(DestroyDB(dbname, options));
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   TestEnv* env = new TestEnv(env_);
   std::unique_ptr<TestEnv> local_env_guard(env);
   options.create_if_missing = true;
@@ -1305,14 +1329,14 @@ TEST_F(DBBasicTest, DBClose) {
   ASSERT_EQ(env->GetCloseCount(), 1);
   ASSERT_EQ(s, Status::IOError());
 
-  delete db;
+  db.reset();
   ASSERT_EQ(env->GetCloseCount(), 1);
 
   // Do not call DB::Close() and ensure our logger Close() still gets called
   s = DB::Open(options, dbname, &db);
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
-  delete db;
+  db.reset();
   ASSERT_EQ(env->GetCloseCount(), 2);
 
   // close by WaitForCompact() with close_db option
@@ -1327,7 +1351,7 @@ TEST_F(DBBasicTest, DBClose) {
   // see TestLogger::CloseHelper()
   ASSERT_EQ(s, Status::IOError());
 
-  delete db;
+  db.reset();
   ASSERT_EQ(env->GetCloseCount(), 3);
 
   // Provide our own logger and ensure DB::Close() does not close it
@@ -1338,7 +1362,7 @@ TEST_F(DBBasicTest, DBClose) {
 
   s = db->Close();
   ASSERT_EQ(s, Status::OK());
-  delete db;
+  db.reset();
   ASSERT_EQ(env->GetCloseCount(), 3);
   options.info_log.reset();
   ASSERT_EQ(env->GetCloseCount(), 4);
@@ -1356,7 +1380,7 @@ TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) {
 
   ASSERT_OK(DestroyDB(dbname, options));
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::unique_ptr<Env> env = NewCompositeEnv(
       std::make_shared<CountedFileSystem>(FileSystem::Default()));
   options.create_if_missing = true;
@@ -1374,7 +1398,7 @@ TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) {
   ASSERT_EQ(counted_fs->counters()->dir_opens,
             counted_fs->counters()->dir_closes);
   ASSERT_OK(s);
-  delete db;
+  db.reset();
 }
 
 TEST_F(DBBasicTest, DBCloseFlushError) {
@@ -1436,7 +1460,7 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
   }
 
   int get_sv_count = 0;
-  ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
+  ROCKSDB_NAMESPACE::DBImpl* db = dbfull();
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
       "DBImpl::MultiCFSnapshot::AfterRefSV", [&](void* /*arg*/) {
         if (++get_sv_count == 2) {
@@ -1508,10 +1532,9 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
   ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2");
 
   for (int cf = 0; cf < 8; ++cf) {
-    auto* cfd =
-        static_cast_with_check<ColumnFamilyHandleImpl>(
-            static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(cf))
-            ->cfd();
+    auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+                    dbfull()->GetColumnFamilyHandle(cf))
+                    ->cfd();
     ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
     ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete);
   }
@@ -1597,10 +1620,9 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
               "cf" + std::to_string(j) + "_val" + std::to_string(retries));
   }
   for (int i = 0; i < 8; ++i) {
-    auto* cfd =
-        static_cast_with_check<ColumnFamilyHandleImpl>(
-            static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
-            ->cfd();
+    auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+                    dbfull()->GetColumnFamilyHandle(i))
+                    ->cfd();
     ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
   }
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
@@ -1624,7 +1646,7 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
   }
 
   int get_sv_count = 0;
-  ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
+  ROCKSDB_NAMESPACE::DBImpl* db = dbfull();
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
       "DBImpl::MultiCFSnapshot::AfterRefSV", [&](void* /*arg*/) {
         if (++get_sv_count == 2) {
@@ -1665,10 +1687,9 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
     ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val");
   }
   for (int i = 0; i < 8; ++i) {
-    auto* cfd =
-        static_cast_with_check<ColumnFamilyHandleImpl>(
-            static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
-            ->cfd();
+    auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+                    dbfull()->GetColumnFamilyHandle(i))
+                    ->cfd();
     ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
   }
 }
@@ -3273,9 +3294,8 @@ TEST_F(DBBasicTest, GetAllKeyVersions) {
     ASSERT_OK(Delete(std::to_string(i)));
   }
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
-                              std::numeric_limits<size_t>::max(),
-                              &key_versions));
+  ASSERT_OK(GetAllKeyVersions(
+      db_.get(), {}, {}, std::numeric_limits<size_t>::max(), &key_versions));
   ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
   for (size_t i = 0; i < kNumInserts + kNumDeletes + kNumUpdates; i++) {
     if (i % 3 == 0) {
@@ -3284,7 +3304,7 @@ TEST_F(DBBasicTest, GetAllKeyVersions) {
       ASSERT_EQ(key_versions[i].GetTypeName(), "TypeValue");
     }
   }
-  ASSERT_OK(GetAllKeyVersions(db_, handles_[0], Slice(), Slice(),
+  ASSERT_OK(GetAllKeyVersions(db_.get(), handles_[0], {}, {},
                               std::numeric_limits<size_t>::max(),
                               &key_versions));
   ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
@@ -3299,10 +3319,17 @@ TEST_F(DBBasicTest, GetAllKeyVersions) {
   for (size_t i = 0; i + 1 != kNumDeletes; ++i) {
     ASSERT_OK(Delete(1, std::to_string(i)));
   }
-  ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(),
+  ASSERT_OK(GetAllKeyVersions(db_.get(), handles_[1], {}, {},
                               std::numeric_limits<size_t>::max(),
                               &key_versions));
   ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size());
+
+  // Change from historical behavior: empty key is now interpreted literally as
+  // a legal key (rather than as a "not present" key)
+  ASSERT_OK(GetAllKeyVersions(db_.get(), handles_[1], Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+  ASSERT_EQ(key_versions.size(), 0);
 }
 
 TEST_F(DBBasicTest, ValueTypeString) {
@@ -3354,6 +3381,69 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) {
                      keys.data(), values.data(), statuses.data(), true);
 }
 
+TEST_F(DBBasicTest, MultiGetWithSnapshotsAndPersistedTier) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = true;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+  // Insert initial data
+  ASSERT_OK(Put(0, "key1", "value1_cf0"));
+  ASSERT_OK(Put(1, "key1", "value1_cf1"));
+  ASSERT_OK(Put(2, "key1", "value1_cf2"));
+  ASSERT_OK(Flush({0, 1, 2}));
+  for (auto cf : {0, 1, 2}) {
+    ASSERT_EQ(1, NumTableFilesAtLevel(0, cf));
+  }
+
+  ASSERT_OK(Put(0, "key1", "value2_cf0"));
+  ASSERT_OK(Put(1, "key1", "value2_cf1"));
+  ASSERT_OK(Put(2, "key1", "value2_cf2"));
+
+  // Prepare for concurrent atomic flush
+  std::atomic<bool> flush_done(false);
+  std::thread flush_thread([&]() {
+    ASSERT_OK(Flush({0, 1, 2}));
+    flush_done.store(true);
+  });
+
+  // Perform MultiGet with snapshot and read_tier = kPersistentTier
+  ReadOptions ro;
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ro.snapshot = snapshot;
+  ro.read_tier = kPersistedTier;
+
+  std::string k = "key1";
+  std::vector<Slice> keys(3, Slice(k));
+  std::vector<Status> statuses(keys.size());
+  std::vector<ColumnFamilyHandle*> cfs(keys.size());
+  std::vector<Slice> new_keys(keys.size());
+  std::vector<PinnableSlice> pin_values(keys.size());
+  for (size_t i = 0; i < keys.size(); ++i) {
+    cfs[i] = handles_[i];
+  }
+  db_->MultiGet(ro, cfs.size(), cfs.data(), keys.data(), pin_values.data(),
+                statuses.data());
+  for (const auto& s : statuses) {
+    ASSERT_OK(s);
+  }
+
+  if (pin_values[0] == "value1_cf0") {
+    // Check if the first value matches expected value
+    ASSERT_EQ(pin_values[1], "value1_cf1");
+    ASSERT_EQ(pin_values[2], "value1_cf2");
+  } else {
+    // If first value doesn't match, check if we got the updated values
+    ASSERT_EQ(pin_values[0], "value2_cf0");
+    ASSERT_EQ(pin_values[1], "value2_cf1");
+    ASSERT_EQ(pin_values[2], "value2_cf2");
+  }
+
+  flush_thread.join();
+  db_->ReleaseSnapshot(snapshot);
+}
+
 TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) {
   Options options = CurrentOptions();
   DestroyAndReopen(options);
@@ -3808,6 +3898,75 @@ TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) {
   ASSERT_OK(iter->status());
 }
 
+TEST_F(DBBasicTest, BestEffortRecoveryFailureWithTableCacheUseAfterFree) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  // Force multiple manifest files
+  options.max_manifest_file_size = 1;
+  options.max_manifest_space_amp_pct = 0;
+
+  DestroyAndReopen(options);
+
+  // Disable file deletions to preserve old manifest files for
+  // best-efforts recovery to succeed
+  ASSERT_OK(db_->DisableFileDeletions());
+
+  // Create multiple SST files to populate TableCache during
+  // best-efforts recovery
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i),
+                  std::string(1000, static_cast<char>('a' + i))));
+    ASSERT_OK(Flush());
+  }
+
+  // Verify we have multiple manifest files
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  int manifest_count = 0;
+  for (const auto& file : files) {
+    if (file.find("MANIFEST") != std::string::npos) {
+      manifest_count++;
+    }
+  }
+  ASSERT_GE(manifest_count, 2);
+
+  // Inject corruption after TableCache is populated (count > 3), but only once
+  // (injected flag) to allow best-effort recovery to trigger retry and succeed.
+  // This coerce the bug: first recovery caches SSTs with reference to column
+  // family's options in table cache and retry deletes column family so the
+  // reference becomes dangling.
+  int count = 0;
+  bool injected = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+        count++;
+        if (count > 3 && !injected) {
+          ASSERT_NE(nullptr, arg);
+          *(static_cast<Status*>(arg)) =
+              Status::Corruption("Injected corruption");
+          injected = true;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  options.best_efforts_recovery = true;
+
+  Status s = TryReopen(options);
+  ASSERT_OK(s);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  for (int i = 0; i < 10; i++) {
+    std::string value;
+    // Without the fix, ASAN detects use-after-free when accessing cached SST
+    // files that hold dangling references to deleted ioptions.
+    s = db_->Get(ReadOptions(), "key" + std::to_string(i), &value);
+    ASSERT_TRUE(s.ok() || s.IsNotFound());
+  }
+}
+
 TEST_F(DBBasicTest, DisableTrackWal) {
   // If WAL tracking was enabled, and then disabled during reopen,
   // the previously tracked WALs should be removed from MANIFEST.
@@ -4994,6 +5153,104 @@ TEST_F(DBBasicTest, VerifyFileChecksumsReadahead) {
             (sst_size + alignment - 1) / (alignment));
 }
 
+TEST_F(DBBasicTest, DisallowMemtableWrite) {
+  // This test is mostly about what you can't do with memtable writes
+  // disallowed. For what you can do, see
+  // ExternalSSTFileBasicTest.FailIfNotBottommostLevelAndDisallowMemtable
+  Options options_allow = GetDefaultOptions();
+  options_allow.create_if_missing = true;
+  Options options_disallow = options_allow;
+  options_disallow.disallow_memtable_writes = true;
+  options_disallow.paranoid_memory_checks = true;
+  options_disallow.memtable_veirfy_per_key_checksum_on_seek = true;
+
+  DestroyAndReopen(options_allow);
+  // CFs allowing and disallowing memtable write
+  CreateColumnFamilies({"cf1", "cf2"}, options_allow);
+  CreateColumnFamilies({"cf3"}, options_disallow);
+  // XXX: needed to get consistent handles_ mappings
+  ReopenWithColumnFamilies(
+      {"default", "cf1", "cf2", "cf3"},
+      {options_allow, options_allow, options_allow, options_disallow});
+
+  EXPECT_EQ(Put(0, "a0", "1").code(), Status::Code::kOk);
+  EXPECT_EQ(Put(1, "a1", "1").code(), Status::Code::kOk);
+  EXPECT_EQ(Put(2, "a2", "1").code(), Status::Code::kOk);
+  EXPECT_EQ(Put(3, "a3", "1").code(), Status::Code::kInvalidArgument);
+
+  EXPECT_EQ(Get(0, "a0"), "1");
+  EXPECT_EQ(Get(1, "a1"), "1");
+  EXPECT_EQ(Get(2, "a2"), "1");
+  EXPECT_EQ(Get(3, "a3"), "NOT_FOUND");
+
+  EXPECT_EQ(Delete(0, "z0").code(), Status::Code::kOk);
+  EXPECT_EQ(Delete(1, "z1").code(), Status::Code::kOk);
+  EXPECT_EQ(Delete(2, "z2").code(), Status::Code::kOk);
+  EXPECT_EQ(Delete(3, "z3").code(), Status::Code::kInvalidArgument);
+
+  WriteBatch wb;
+  EXPECT_EQ(wb.Put(handles_[0], "b0", "2").code(), Status::Code::kOk);
+  EXPECT_EQ(wb.Put(handles_[1], "b1", "2").code(), Status::Code::kOk);
+  EXPECT_EQ(wb.Put(handles_[2], "b2", "2").code(), Status::Code::kOk);
+  EXPECT_EQ(wb.Put(handles_[3], "b3", "2").code(),
+            Status::Code::kInvalidArgument);
+  ASSERT_OK(db_->Write({}, &wb));
+  wb.Clear();
+
+  EXPECT_EQ(Get(0, "b0"), "2");
+  EXPECT_EQ(Get(1, "b1"), "2");
+  EXPECT_EQ(Get(2, "b2"), "2");
+  EXPECT_EQ(Get(3, "b3"), "NOT_FOUND");
+
+  std::unique_ptr<Iterator> iter(
+      dbfull()->NewIterator(ReadOptions(), handles_[3]));
+  iter->Seek("a3");
+  ASSERT_OK(iter->status());
+  iter.reset();
+  // When the DB is re-opened with WAL entries for a CF that is newly setting
+  // disallow_memtable_writes, we detect that and fail the open gracefully.
+  ASSERT_EQ(TryReopenWithColumnFamilies(
+                {"default", "cf1", "cf2", "cf3"},
+                {options_allow, options_allow, options_disallow, options_allow})
+                .code(),
+            Status::Code::kInvalidArgument);
+
+  // Successfully opening with allow creates L0 files from the WAL
+  ReopenWithColumnFamilies({"default", "cf1", "cf2", "cf3"}, options_allow);
+
+  EXPECT_EQ(Get(0, "a0"), "1");
+  EXPECT_EQ(Get(1, "a1"), "1");
+  EXPECT_EQ(Get(2, "a2"), "1");
+  EXPECT_EQ(Get(3, "a3"), "NOT_FOUND");
+
+  // Now able to disallow on CF2 because no relevant WAL entries
+  ReopenWithColumnFamilies(
+      {"default", "cf1", "cf2", "cf3"},
+      {options_allow, options_allow, options_disallow, options_allow});
+
+  EXPECT_EQ(Get(0, "a0"), "1");
+  EXPECT_EQ(Get(1, "a1"), "1");
+  EXPECT_EQ(Get(2, "a2"), "1");
+  EXPECT_EQ(Get(3, "a3"), "NOT_FOUND");
+
+  // Now able to write to CF 3 but not CF 2
+  EXPECT_EQ(Put(0, "c0", "3").code(), Status::Code::kOk);
+  EXPECT_EQ(Put(1, "c1", "3").code(), Status::Code::kOk);
+  EXPECT_EQ(Put(2, "c2", "3").code(), Status::Code::kInvalidArgument);
+  EXPECT_EQ(Put(3, "c3", "3").code(), Status::Code::kOk);
+
+  EXPECT_EQ(Get(0, "c0"), "3");
+  EXPECT_EQ(Get(1, "c1"), "3");
+  EXPECT_EQ(Get(2, "c2"), "NOT_FOUND");
+  EXPECT_EQ(Get(3, "c3"), "3");
+
+  // disallow_memtable_writes not supported on default column family.
+  // (Would be complicated to make a WriteBatch aware of the setting in order
+  // to reject the write before entering the write path.)
+  Destroy(options_allow);
+  EXPECT_EQ(TryReopen(options_disallow).code(), Status::Code::kInvalidArgument);
+}
+
 // TODO: re-enable after we provide finer-grained control for WAL tracking to
 // meet the needs of different use cases, durability levels and recovery modes.
 TEST_F(DBBasicTest, DISABLED_ManualWalSync) {
@@ -5210,6 +5467,94 @@ INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline,
                         ::testing::Values(std::make_tuple(true, false),
                                           std::make_tuple(false, true),
                                           std::make_tuple(true, true)));
+
+// FileSystemWrapper that captures FileOptions passed to NewRandomAccessFile
+// for .sst files, so we can verify file_checksum fields are populated.
+class ChecksumCapturingFS : public FileSystemWrapper {
+ public:
+  explicit ChecksumCapturingFS(const std::shared_ptr<FileSystem>& base)
+      : FileSystemWrapper(base) {}
+
+  static const char* kClassName() { return "ChecksumCapturingFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    if (fname.find(".sst") != std::string::npos) {
+      std::lock_guard<std::mutex> lock(mu_);
+      captured_file_checksum_ = opts.file_checksum;
+      captured_file_checksum_func_name_ = opts.file_checksum_func_name;
+      capture_count_++;
+    }
+    return target()->NewRandomAccessFile(fname, opts, result, dbg);
+  }
+
+  std::string GetCapturedFileChecksum() {
+    std::lock_guard<std::mutex> lock(mu_);
+    return captured_file_checksum_;
+  }
+
+  std::string GetCapturedFileChecksumFuncName() {
+    std::lock_guard<std::mutex> lock(mu_);
+    return captured_file_checksum_func_name_;
+  }
+
+  int GetCaptureCount() {
+    std::lock_guard<std::mutex> lock(mu_);
+    return capture_count_;
+  }
+
+  void Reset() {
+    std::lock_guard<std::mutex> lock(mu_);
+    captured_file_checksum_.clear();
+    captured_file_checksum_func_name_.clear();
+    capture_count_ = 0;
+  }
+
+ private:
+  std::mutex mu_;
+  std::string captured_file_checksum_;
+  std::string captured_file_checksum_func_name_;
+  int capture_count_ = 0;
+};
+
+TEST_F(DBBasicTest, FileChecksumInFileOptions) {
+  // Verify that file_checksum and file_checksum_func_name from FileMetaData
+  // are propagated through FileOptions when opening SST files.
+  auto capturing_fs =
+      std::make_shared<ChecksumCapturingFS>(env_->GetFileSystem());
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, capturing_fs));
+
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.env = env.get();
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  DestroyAndReopen(options);
+
+  // Write data and flush to create an SST with a file checksum.
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Flush());
+
+  // Reset captures, then reopen to trigger TableCache SST open.
+  capturing_fs->Reset();
+  Reopen(options);
+
+  // Read to trigger SST open through TableCache::GetTableReader.
+  ASSERT_EQ("value1", Get("key1"));
+
+  // Verify that checksum fields were populated.
+  ASSERT_GT(capturing_fs->GetCaptureCount(), 0);
+  ASSERT_FALSE(capturing_fs->GetCapturedFileChecksum().empty());
+  ASSERT_NE(capturing_fs->GetCapturedFileChecksumFuncName(),
+            capturing_fs->GetCapturedFileChecksum());
+  ASSERT_EQ(capturing_fs->GetCapturedFileChecksumFuncName(),
+            "FileChecksumCrc32c");
+
+  Close();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc
index cafb3710092d..1433bd6014e6 100644
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@@ -506,6 +506,8 @@ TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) {
   table_options.prepopulate_block_cache =
       BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  // Include a compression dictionary block
+  options.compression_opts.max_dict_bytes = 123;
   DestroyAndReopen(options);
 
   std::string value(kValueSize, 'a');
@@ -537,6 +539,9 @@ TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) {
                 options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT));
     }
     ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS));
+
+    // Including compression dict
+    ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_MISS));
   }
 
   // Verify compaction not counted
@@ -824,68 +829,78 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) {
   const int kNumEntriesPerFile = 128;
   const int kNumBytesPerEntry = 1024;
 
-  // Try all the available libraries that support dictionary compression
-  std::vector<CompressionType> compression_types;
-  if (Zlib_Supported()) {
-    compression_types.push_back(kZlibCompression);
-  }
-  if (LZ4_Supported()) {
-    compression_types.push_back(kLZ4Compression);
-    compression_types.push_back(kLZ4HCCompression);
-  }
-  if (ZSTD_Supported()) {
-    compression_types.push_back(kZSTD);
-  }
+  std::vector<CompressionType> dict_compressions =
+      GetSupportedDictCompressions();
   Random rnd(301);
-  for (auto compression_type : compression_types) {
-    Options options = CurrentOptions();
-    options.bottommost_compression = compression_type;
-    options.bottommost_compression_opts.max_dict_bytes = 4096;
-    options.bottommost_compression_opts.enabled = true;
-    options.create_if_missing = true;
-    options.num_levels = 2;
-    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-    options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
-    BlockBasedTableOptions table_options;
-    table_options.cache_index_and_filter_blocks = true;
-    table_options.block_cache.reset(new MockCache());
-    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-    DestroyAndReopen(options);
+  // Format version before and after compression handling changes
+  for (int format_version : {6, 7}) {
+    // Test all supported compression types because (at least historically)
+    // dictionary compression could be enabled and a dictionary block saved
+    // but ignored by some compression types. Ensure we at least don't crash
+    // or return corruption for those.
+    for (auto compression_type : GetSupportedCompressions()) {
+      // Extra handling checks only for types actually supporting dictionary
+      // compression.
+      bool dict_supported =
+          std::count(dict_compressions.begin(), dict_compressions.end(),
+                     compression_type) > 0;
 
-    RecordCacheCountersForCompressionDict(options);
+      Options options = CurrentOptions();
+      options.bottommost_compression = compression_type;
+      options.bottommost_compression_opts.max_dict_bytes = 4096;
+      options.bottommost_compression_opts.enabled = true;
+      options.create_if_missing = true;
+      options.num_levels = 2;
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+      BlockBasedTableOptions table_options;
+      table_options.cache_index_and_filter_blocks = true;
+      table_options.block_cache.reset(new MockCache());
+      table_options.format_version = format_version;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      DestroyAndReopen(options);
 
-    for (int i = 0; i < kNumFiles; ++i) {
-      ASSERT_EQ(i, NumTableFilesAtLevel(0, 0));
-      for (int j = 0; j < kNumEntriesPerFile; ++j) {
-        std::string value = rnd.RandomString(kNumBytesPerEntry);
-        ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str()));
+      RecordCacheCountersForCompressionDict(options);
+
+      for (int i = 0; i < kNumFiles; ++i) {
+        ASSERT_EQ(i, NumTableFilesAtLevel(0, 0));
+        for (int j = 0; j < kNumEntriesPerFile; ++j) {
+          std::string value = rnd.RandomString(kNumBytesPerEntry);
+          ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str()));
+        }
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ(0, NumTableFilesAtLevel(0));
+      ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1));
+
+      if (dict_supported) {
+        // Compression dictionary blocks are preloaded.
+        CheckCacheCountersForCompressionDict(
+            options, kNumFiles /* expected_compression_dict_misses */,
+            0 /* expected_compression_dict_hits */,
+            kNumFiles /* expected_compression_dict_inserts */);
+      }
+
+      // Seek to a key in a file. It should cause the SST's dictionary
+      // meta-block to be read.
+      RecordCacheCounters(options);
+      RecordCacheCountersForCompressionDict(options);
+      ReadOptions read_options;
+      ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1)));
+
+      if (dict_supported) {
+        // Two block hits: index and dictionary since they are prefetched
+        // One block missed/added: data block
+        CheckCacheCounters(options, 1 /* expected_misses */,
+                           2 /* expected_hits */, 1 /* expected_inserts */,
+                           0 /* expected_failures */);
+        CheckCacheCountersForCompressionDict(
+            options, 0 /* expected_compression_dict_misses */,
+            1 /* expected_compression_dict_hits */,
+            0 /* expected_compression_dict_inserts */);
       }
-      ASSERT_OK(Flush());
     }
-    ASSERT_OK(dbfull()->TEST_WaitForCompact());
-    ASSERT_EQ(0, NumTableFilesAtLevel(0));
-    ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1));
-
-    // Compression dictionary blocks are preloaded.
-    CheckCacheCountersForCompressionDict(
-        options, kNumFiles /* expected_compression_dict_misses */,
-        0 /* expected_compression_dict_hits */,
-        kNumFiles /* expected_compression_dict_inserts */);
-
-    // Seek to a key in a file. It should cause the SST's dictionary meta-block
-    // to be read.
-    RecordCacheCounters(options);
-    RecordCacheCountersForCompressionDict(options);
-    ReadOptions read_options;
-    ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1)));
-    // Two block hits: index and dictionary since they are prefetched
-    // One block missed/added: data block
-    CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */,
-                       1 /* expected_inserts */, 0 /* expected_failures */);
-    CheckCacheCountersForCompressionDict(
-        options, 0 /* expected_compression_dict_misses */,
-        1 /* expected_compression_dict_hits */,
-        0 /* expected_compression_dict_inserts */);
   }
 }
 
@@ -1646,7 +1661,7 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
   std::string export_files_dir = dbname_ + "/exported";
   ExportImportFilesMetaData* metadata_ptr_ = nullptr;
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
@@ -1683,7 +1698,7 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
   // StableCacheKeyTestFS, Checkpoint will resort to full copy not hard link.
   // (Checkpoint  not available in LITE mode to test this.)
   auto db_copy_name = dbname_ + "-copy";
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name));
   delete checkpoint;
 
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index edb02920e72d..eb6e51a95ec6 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -137,11 +137,6 @@ class SliceTransformLimitedDomainGeneric : public SliceTransform {
     // prefix will be x????
     return src.size() >= 5;
   }
-
-  bool InRange(const Slice& dst) const override {
-    // prefix will be x????
-    return dst.size() == 5;
-  }
 };
 
 // KeyMayExist can lead to a few false positives, but not false negatives.
@@ -710,12 +705,20 @@ class AlwaysTrueBitsBuilder : public FilterBitsBuilder {
     count_ = 0;
     // Interpreted as "always true" filter (0 probes over 1 byte of
     // payload, 5 bytes metadata)
-    return Slice("\0\0\0\0\0\0", 6);
+    return Slice("\0\0\0\0\0\0", kAlwaysTrueFilterBytes);
   }
   using FilterBitsBuilder::Finish;
   size_t ApproximateNumEntries(size_t) override { return SIZE_MAX; }
+  size_t CalculateSpace(size_t /* num_entries */) override {
+    return kAlwaysTrueFilterBytes;
+  }
+  double EstimatedFpRate(size_t /* num_entries */,
+                         size_t /* bytes */) override {
+    return 1.0;
+  }
 
  private:
+  static constexpr size_t kAlwaysTrueFilterBytes = 6;
   size_t count_ = 0;
 };
 
@@ -914,14 +917,14 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Values(
         std::make_tuple(kAutoBloom,
                         FilterPartitioning::kCoupledPartitionedFilter,
-                        kLatestFormatVersion),
+                        kLatestBbtFormatVersion),
         std::make_tuple(kAutoBloom,
                         FilterPartitioning::kDecoupledPartitionedFilter,
-                        kLatestFormatVersion),
+                        kLatestBbtFormatVersion),
         std::make_tuple(kAutoBloom, FilterPartitioning::kUnpartitionedFilter,
-                        kLatestFormatVersion),
+                        kLatestBbtFormatVersion),
         std::make_tuple(kAutoRibbon, FilterPartitioning::kUnpartitionedFilter,
-                        kLatestFormatVersion)));
+                        kLatestBbtFormatVersion)));
 #endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
 
 TEST_F(DBBloomFilterTest, BloomFilterRate) {
@@ -2069,11 +2072,6 @@ class SliceTransformLimitedDomain : public SliceTransform {
     // prefix will be x????
     return src.size() >= 5 && src[0] == 'x';
   }
-
-  bool InRange(const Slice& dst) const override {
-    // prefix will be x????
-    return dst.size() == 5 && dst[0] == 'x';
-  }
 };
 
 TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter1) {
@@ -4137,7 +4135,7 @@ TEST_F(DBBloomFilterTest, SstQueryFilter) {
 
   using Keys = std::vector<std::string>;
   auto RangeQuery =
-      [factory, db = db_](
+      [factory, db = db_.get()](
           std::string lb, std::string ub,
           std::shared_ptr<SstQueryFilterConfigsManager::Factory> alt_factory =
               nullptr) {
diff --git a/db/db_compaction_abort_test.cc b/db/db_compaction_abort_test.cc
new file mode 100644
index 000000000000..a76e1d689f1f
--- /dev/null
+++ b/db/db_compaction_abort_test.cc
@@ -0,0 +1,993 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <set>
+#include <thread>
+#include <unordered_map>
+
+#include "db/compaction/compaction_job.h"
+#include "db/db_impl/db_impl_secondary.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Helper class to manage abort synchronization in tests.
+//
+// Compaction abort could happen at various stage of compaction.
+// To test this, we need to trigger abort at different stage. This requires
+// precise control on the timing of abort API invocation. To achieve this in a
+// consistent way across various tests, we invoke AbortAllCompactions() within
+// the sync point callback, that is added at various stages of compaction.
+// However as the abort API is a blocking call, calling it within the sync point
+// callback on the compaction thread would cause deadlock. This test helper
+// class is designed to solve this challenge.
+//
+// 1. Abort must happen from a different thread:
+//    AbortAllCompactions() is typically called from the compaction thread
+//    via a sync point callback, so that we could precisely control the time of
+//    API invocation to simulate abort at different stage of compaction.
+//    However, we can't block the compaction thread waiting for the abort to
+//    complete - the compaction needs to continue executing to actually check
+//    the abort flag and exit. So we spawn a separate thread to call
+//    AbortAllCompactions().
+//
+// 2. We need to know when abort completes:
+//    After compaction returns (with aborted status), we often need to:
+//    - Verify state (e.g., no output files created)
+//    - Call ResumeAllCompactions()
+//    - Run compaction again to verify it succeeds
+//    We must wait for the abort thread to finish before proceeding, otherwise
+//    we might call Resume before Abort completes, causing race conditions.
+//
+// 3. Sync point callbacks may fire multiple times:
+//    With multiple subcompactions, a callback like
+//    "CompactionJob::ProcessKeyValueCompaction:Start" fires once per
+//    subcompaction. We only want to trigger abort once, so we use
+//    abort_triggered_ as a guard.
+//
+// 4. Tests may need multiple abort cycles:
+//    Some tests (e.g., MultipleAbortResumeSequence) do abort->resume->abort
+//    multiple times. The class supports this by auto-resetting when a
+//    previous abort has completed.
+class AbortSynchronizer {
+ public:
+  AbortSynchronizer() : abort_cv_(&abort_mutex_) {}
+
+  ~AbortSynchronizer() {
+    // Join the thread if it was started - ensures clean shutdown
+    if (abort_thread_.joinable()) {
+      abort_thread_.join();
+    }
+  }
+
+  // Non-copyable, non-movable due to thread member
+  AbortSynchronizer(const AbortSynchronizer&) = delete;
+  AbortSynchronizer& operator=(const AbortSynchronizer&) = delete;
+
+  // Trigger abort from a separate thread.
+  // - Safe to call multiple times; only first call in each cycle spawns thread
+  // - If a previous abort has completed, automatically resets state first
+  // - The spawned thread calls AbortAllCompactions() and signals completion
+  void TriggerAbort(DBImpl* db) {
+    // If previous abort completed, reset state to allow new abort
+    if (abort_triggered_.load() && abort_completed_.load()) {
+      Reset();
+    }
+
+    if (!abort_triggered_.exchange(true)) {
+      abort_thread_ = std::thread([this, db]() {
+        db->AbortAllCompactions();
+        SignalAbortCompleted();
+      });
+    }
+  }
+
+  // Wait for the abort thread to complete.
+  // Call this AFTER compaction returns to ensure the abort thread has finished
+  // before proceeding with Resume or other operations.
+  void WaitForAbortCompletion() {
+    MutexLock l(&abort_mutex_);
+    while (!abort_completed_.load()) {
+      abort_cv_.Wait();
+    }
+  }
+
+  // Reset state for reuse. Joins any previous thread first.
+  // Called automatically by TriggerAbort() if previous abort completed,
+  // but can also be called explicitly for clarity.
+  void Reset() {
+    if (abort_thread_.joinable()) {
+      abort_thread_.join();
+    }
+    abort_triggered_.store(false);
+    abort_completed_.store(false);
+  }
+
+  bool IsAbortTriggered() const { return abort_triggered_.load(); }
+
+ private:
+  void SignalAbortCompleted() {
+    MutexLock l(&abort_mutex_);
+    abort_completed_.store(true);
+    abort_cv_.SignalAll();
+  }
+
+  std::atomic<bool> abort_triggered_{false};  // Guards against multiple spawns
+  std::atomic<bool> abort_completed_{false};  // Signals thread completion
+  port::Mutex abort_mutex_;
+  port::CondVar abort_cv_;
+  std::thread abort_thread_;  // The thread that calls AbortAllCompactions()
+};
+
+// Helper to clean up SyncPoint state after tests
+inline void CleanupSyncPoints() {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Helper class that combines AbortSynchronizer with sync point setup for
+// deterministic abort triggering. This adds sync point coordination on top
+// of AbortSynchronizer:
+//
+// This is useful when you need deterministic timing - the callback won't
+// return until AbortAllCompactions() has actually set the abort flag,
+// guaranteeing the compaction will see it on the next check.
+class SyncPointAbortHelper {
+ public:
+  explicit SyncPointAbortHelper(const std::string& trigger_point)
+      : trigger_point_(trigger_point) {}
+
+  // Set up sync points and callbacks. Call this before starting compaction.
+  void Setup(DBImpl* db_impl) {
+    db_impl_ = db_impl;
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {"DBImpl::AbortAllCompactions:FlagSet", kWaitPointName},
+    });
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        trigger_point_, [this](void* /*arg*/) {
+          // Use AbortSynchronizer to handle the abort in a separate thread
+          abort_sync_.TriggerAbort(db_impl_);
+
+          // Wait for abort flag to be set via sync point dependency
+          // This ensures deterministic timing - compaction will see the flag
+          TEST_SYNC_POINT_CALLBACK(kWaitPointName, nullptr);
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  }
+
+  // Wait for the abort to complete. Call this after compaction returns.
+  void WaitForAbortCompletion() { abort_sync_.WaitForAbortCompletion(); }
+
+  // Clean up sync points and wait for abort completion in one call
+  void CleanupAndWait() {
+    CleanupSyncPoints();
+    WaitForAbortCompletion();
+  }
+
+ private:
+  static constexpr const char* kWaitPointName =
+      "SyncPointAbortHelper::WaitForAbort";
+  std::string trigger_point_;
+  DBImpl* db_impl_{nullptr};
+  AbortSynchronizer abort_sync_;
+};
+
+class DBCompactionAbortTest : public DBTestBase {
+ public:
+  DBCompactionAbortTest()
+      : DBTestBase("db_compaction_abort_test", /*env_do_fsync=*/false) {}
+
+ protected:
+  // Map to track the latest value of each key for verification
+  std::unordered_map<std::string, std::string> expected_values_;
+
+  // Statistics object for verifying compaction metrics
+  std::shared_ptr<Statistics> stats_;
+
+  // Get current options with statistics enabled
+  Options GetOptionsWithStats() {
+    Options options = CurrentOptions();
+    stats_ = CreateDBStatistics();
+    options.statistics = stats_;
+    return options;
+  }
+
+  // Populate database with test data.
+  // If overlapping=true, uses the same key range (0 to keys_per_file-1) in each
+  // file to ensure compaction has work to do.
+  // If overlapping=false, uses non-overlapping keys across files.
+  void PopulateData(int num_files, int keys_per_file, int value_size,
+                    bool overlapping = true, int seed = 301) {
+    Random rnd(seed);
+    for (int i = 0; i < num_files; ++i) {
+      for (int j = 0; j < keys_per_file; ++j) {
+        int key_index = overlapping ? j : (j + i * keys_per_file);
+        std::string key = Key(key_index);
+        std::string value = rnd.RandomString(value_size);
+        ASSERT_OK(Put(key, value));
+        expected_values_[key] = value;
+      }
+      ASSERT_OK(Flush());
+    }
+  }
+
+  // Verify data integrity by reading all keys and comparing with expected
+  // values
+  void VerifyDataIntegrity(int num_keys, int start_key = 0) {
+    std::string val;
+    for (int j = start_key; j < start_key + num_keys; ++j) {
+      std::string key = Key(j);
+      ASSERT_OK(dbfull()->Get(ReadOptions(), key, &val));
+      auto it = expected_values_.find(key);
+      if (it != expected_values_.end()) {
+        ASSERT_EQ(it->second, val) << "Value mismatch for key: " << key;
+      }
+    }
+  }
+
+  // Clear expected values (useful when reopening DB or between tests)
+  void ClearExpectedValues() { expected_values_.clear(); }
+
+  // Run the common abort test pattern with SyncPointAbortHelper:
+  // 1. Set up sync point abort helper
+  // 2. Run compaction and verify it's aborted
+  // 3. Verify COMPACTION_ABORTED stat increased (if stats enabled)
+  // 4. Clean up, resume, and verify compaction succeeds
+  // 5. Verify COMPACT_WRITE_BYTES increased (if stats enabled)
+  void RunSyncPointAbortTest(const std::string& trigger_point,
+                             CompactRangeOptions cro = CompactRangeOptions()) {
+    // Capture stats and file counts before abort
+    uint64_t aborted_before = 0;
+    uint64_t write_bytes_before = 0;
+    if (stats_) {
+      aborted_before = stats_->getTickerCount(COMPACTION_ABORTED);
+      write_bytes_before = stats_->getTickerCount(COMPACT_WRITE_BYTES);
+    }
+
+    SyncPointAbortHelper helper(trigger_point);
+    helper.Setup(dbfull());
+
+    Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+    ASSERT_TRUE(s.IsCompactionAborted());
+
+    // Verify abort was counted
+    if (stats_) {
+      uint64_t aborted_after = stats_->getTickerCount(COMPACTION_ABORTED);
+      ASSERT_GT(aborted_after, aborted_before)
+          << "COMPACTION_ABORTED stat should increase after abort";
+    }
+
+    helper.CleanupAndWait();
+    dbfull()->ResumeAllCompactions();
+
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+    // Verify compaction completed and wrote bytes
+    if (stats_) {
+      uint64_t write_bytes_after = stats_->getTickerCount(COMPACT_WRITE_BYTES);
+      ASSERT_GT(write_bytes_after, write_bytes_before)
+          << "COMPACT_WRITE_BYTES should increase after successful compaction";
+    }
+  }
+};
+
+// Parameterized test for abort with different number of max subcompactions.
+// This consolidates tests that were essentially duplicates with different
+// max_subcompactions values
+class DBCompactionAbortSubcompactionTest
+    : public DBCompactionAbortTest,
+      public ::testing::WithParamInterface<int> {};
+
+TEST_P(DBCompactionAbortSubcompactionTest, AbortWithVaryingSubcompactions) {
+  int max_subcompactions = GetParam();
+
+  Options options = GetOptionsWithStats();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = max_subcompactions;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/100);
+
+  RunSyncPointAbortTest("CompactionJob::RunSubcompactions:BeforeStart");
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+INSTANTIATE_TEST_CASE_P(SubcompactionVariants,
+                        DBCompactionAbortSubcompactionTest,
+                        ::testing::Values(1, 2, 4),
+                        [](const ::testing::TestParamInfo<int>& param_info) {
+                          return "MaxSubcompactionCount_" +
+                                 std::to_string(param_info.param);
+                        });
+
+// Parameterized test for abort with different compaction styles
+// This consolidates tests for Level, Universal, and FIFO compaction styles
+class DBCompactionAbortStyleTest
+    : public DBCompactionAbortTest,
+      public ::testing::WithParamInterface<CompactionStyle> {
+ protected:
+  // Configure options based on compaction style
+  void ConfigureOptionsForStyle(Options& options, CompactionStyle style) {
+    options.compaction_style = style;
+    options.level0_file_num_compaction_trigger = 4;
+    options.disable_auto_compactions = true;
+
+    switch (style) {
+      case kCompactionStyleLevel:
+        // Level compaction uses default settings
+        break;
+      case kCompactionStyleUniversal:
+        options.compaction_options_universal.size_ratio = 10;
+        break;
+      case kCompactionStyleFIFO:
+        // Set a large max_table_files_size to avoid deletion compaction
+        options.compaction_options_fifo.max_table_files_size =
+            100 * 1024 * 1024;
+        // Enable intra-L0 compaction which goes through normal compaction path
+        options.compaction_options_fifo.allow_compaction = true;
+        options.max_open_files = -1;  // Required for FIFO compaction
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+TEST_P(DBCompactionAbortStyleTest, AbortCompaction) {
+  CompactionStyle style = GetParam();
+
+  Options options = GetOptionsWithStats();
+  options.max_subcompactions = 1;
+  ConfigureOptionsForStyle(options, style);
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/100);
+
+  RunSyncPointAbortTest("CompactionJob::RunSubcompactions:BeforeStart");
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    CompactionStyleVariants, DBCompactionAbortStyleTest,
+    ::testing::Values(kCompactionStyleLevel, kCompactionStyleUniversal,
+                      kCompactionStyleFIFO),
+    [](const ::testing::TestParamInfo<CompactionStyle>& param_info) {
+      return OptionsHelper::compaction_style_to_string.at(param_info.param);
+    });
+
+TEST_F(DBCompactionAbortTest, AbortManualCompaction) {
+  Options options = GetOptionsWithStats();
+  options.level0_file_num_compaction_trigger = 10;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/5, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = true;
+  RunSyncPointAbortTest("CompactionJob::ProcessKeyValueCompaction:Start", cro);
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, AbortAutomaticCompaction) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = false;
+  Reopen(options);
+
+  Random rnd(301);
+  AbortSynchronizer abort_sync;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::ProcessKeyValueCompaction:Start",
+      [&](void* /*arg*/) { abort_sync.TriggerAbort(dbfull()); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 100; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(1000)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  CleanupSyncPoints();
+
+  abort_sync.WaitForAbortCompletion();
+  dbfull()->ResumeAllCompactions();
+
+  for (int j = 0; j < 100; ++j) {
+    ASSERT_OK(Put(Key(j), rnd.RandomString(1000)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  std::string val;
+  for (int j = 0; j < 100; ++j) {
+    ASSERT_OK(dbfull()->Get(ReadOptions(), Key(j), &val));
+  }
+}
+
+TEST_F(DBCompactionAbortTest, AbortAndVerifyNoOutputFiles) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  int num_l0_files_before = NumTableFilesAtLevel(0);
+  int num_l1_files_before = NumTableFilesAtLevel(1);
+
+  SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start");
+  helper.Setup(dbfull());
+
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  CleanupSyncPoints();
+
+  int num_l0_files_after = NumTableFilesAtLevel(0);
+  int num_l1_files_after = NumTableFilesAtLevel(1);
+
+  ASSERT_EQ(num_l0_files_before, num_l0_files_after);
+  ASSERT_EQ(num_l1_files_before, num_l1_files_after);
+
+  helper.WaitForAbortCompletion();
+  dbfull()->ResumeAllCompactions();
+
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  int num_l0_files_final = NumTableFilesAtLevel(0);
+  int num_l1_files_final = NumTableFilesAtLevel(1);
+
+  ASSERT_EQ(0, num_l0_files_final);
+  ASSERT_GT(num_l1_files_final, 0);
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, MultipleAbortResumeSequence) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  for (int round = 0; round < 3; ++round) {
+    // Use SyncPointAbortHelper for deterministic abort timing - it waits
+    // for the abort flag to be set via sync point dependency
+    SyncPointAbortHelper helper(
+        "CompactionJob::ProcessKeyValueCompaction:Start");
+    helper.Setup(dbfull());
+
+    CompactRangeOptions cro;
+    Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+    ASSERT_TRUE(s.IsCompactionAborted());
+
+    helper.CleanupAndWait();
+    dbfull()->ResumeAllCompactions();
+  }
+
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, AbortWithOutputFilesCleanup) {
+  Options options = CurrentOptions();
+  options.num_levels = 2;  // Ensure compaction output goes to L1
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 50 * 1024;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/100);
+
+  SyncPointAbortHelper helper("CompactionJob::RunSubcompactions:BeforeStart");
+  helper.Setup(dbfull());
+
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  CleanupSyncPoints();
+
+  int num_l1_files_after_abort = NumTableFilesAtLevel(1);
+  ASSERT_EQ(0, num_l1_files_after_abort);
+
+  helper.WaitForAbortCompletion();
+  dbfull()->ResumeAllCompactions();
+
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // Verify L0 files are compacted and L1 has output files
+  int num_l0_files_final = NumTableFilesAtLevel(0);
+  int num_l1_files_final = NumTableFilesAtLevel(1);
+  ASSERT_EQ(0, num_l0_files_final)
+      << "L0 should be empty after successful compaction";
+  ASSERT_GT(num_l1_files_final, 0)
+      << "L1 should have files after successful compaction";
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, NestedAbortResumeCalls) {
+  // Test that nested AbortAllCompactions() calls work correctly with the
+  // counter
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  // First abort call
+  dbfull()->AbortAllCompactions();
+
+  // Nested abort call (counter should be 2)
+  dbfull()->AbortAllCompactions();
+
+  // Compaction should still be blocked after one resume
+  dbfull()->ResumeAllCompactions();
+
+  // Compaction should still return aborted because counter is still 1
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  // Second resume - counter should be 0 now
+  dbfull()->ResumeAllCompactions();
+
+  // Compaction should succeed now
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, AbortCompactFilesAPI) {
+  // Test that AbortAllCompactions works with CompactFiles API
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 100;  // Disable auto compaction
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  // Get the L0 file names
+  std::vector<std::string> files_to_compact;
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+  for (const auto& file : cf_meta.levels[0].files) {
+    files_to_compact.push_back(file.name);
+  }
+  ASSERT_GE(files_to_compact.size(), 2);
+
+  SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start");
+  helper.Setup(dbfull());
+
+  CompactionOptions compact_options;
+  Status s = dbfull()->CompactFiles(compact_options, files_to_compact, 1);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  helper.CleanupAndWait();
+  dbfull()->ResumeAllCompactions();
+
+  // CompactFiles should work after resume
+  ASSERT_OK(dbfull()->CompactFiles(compact_options, files_to_compact, 1));
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, AbortDoesNotAffectFlush) {
+  // Test that AbortAllCompactions does not affect flush operations
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 100;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  Random rnd(301);
+  for (int j = 0; j < 100; ++j) {
+    ASSERT_OK(Put(Key(j), rnd.RandomString(1000)));
+  }
+
+  // Abort compactions
+  dbfull()->AbortAllCompactions();
+
+  // Flush should still work
+  ASSERT_OK(Flush());
+
+  // Write more data
+  for (int j = 100; j < 200; ++j) {
+    ASSERT_OK(Put(Key(j), rnd.RandomString(1000)));
+  }
+
+  // Flush should still work
+  ASSERT_OK(Flush());
+
+  // Resume compactions
+  dbfull()->ResumeAllCompactions();
+
+  VerifyDataIntegrity(/*num_keys=*/200);
+}
+
+TEST_F(DBCompactionAbortTest, AbortBeforeCompactionStarts) {
+  // Test aborting before any compaction has started
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  // Abort before starting compaction
+  dbfull()->AbortAllCompactions();
+
+  // Compaction should immediately return aborted
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  // Resume
+  dbfull()->ResumeAllCompactions();
+
+  // Now compaction should work
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // Verify L0 files are compacted
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+}
+
+// Test that in-progress blob and SST files are properly cleaned up when
+// compaction is aborted. This specifically tests the case where abort happens
+// while files are being written (opened but not yet completed/closed).
+// This catches the bug where files exist on disk but are removed from the
+// outputs_ vector (e.g., by RemoveLastEmptyOutput when file_size is 0 because
+// the builder was abandoned), leaving orphan files.
+TEST_F(DBCompactionAbortTest, AbortWithInProgressFileCleanup) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions =
+      1;  // Single subcompaction for deterministic behavior
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 32 * 1024;  // 32KB
+
+  // Enable BlobDB with garbage collection to force blob rewriting during
+  // compaction
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;  // All values go to blob files
+  options.blob_file_size =
+      1024 * 1024;  // 1MB - large enough to not close during test
+  // Enable blob garbage collection - this forces blob data to be rewritten
+  // during compaction, creating new blob files
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;  // Include all blob files
+  options.blob_garbage_collection_force_threshold = 0.0;  // Always force GC
+
+  Reopen(options);
+
+  // Write enough data to trigger the periodic abort check (every 1000 records).
+  // 4 files * 2000 keys = 2000 unique overlapping keys processed during
+  // compaction. The sync point triggers at 999, 1999, etc.
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/2000, /*value_size=*/500);
+
+  // Helper function to get blob files on disk with their names
+  auto GetBlobFilesOnDisk = [this]() -> std::vector<std::string> {
+    std::vector<std::string> blob_files;
+    std::vector<std::string> files;
+    EXPECT_OK(env_->GetChildren(dbname_, &files));
+    for (const auto& f : files) {
+      if (f.find(".blob") != std::string::npos) {
+        blob_files.push_back(f);
+      }
+    }
+    std::sort(blob_files.begin(), blob_files.end());
+    return blob_files;
+  };
+
+  // Helper function to get blob file count in metadata
+  auto GetBlobFilesInMetadata = [this]() -> std::vector<uint64_t> {
+    std::vector<uint64_t> blob_file_numbers;
+    ColumnFamilyMetaData cf_meta;
+    dbfull()->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta);
+    for (const auto& blob_meta : cf_meta.blob_files) {
+      blob_file_numbers.push_back(blob_meta.blob_file_number);
+    }
+    std::sort(blob_file_numbers.begin(), blob_file_numbers.end());
+    return blob_file_numbers;
+  };
+
+  // Helper function to get SST files on disk
+  auto GetSstFilesOnDisk = [this]() -> std::vector<std::string> {
+    std::vector<std::string> sst_files;
+    std::vector<std::string> files;
+    EXPECT_OK(env_->GetChildren(dbname_, &files));
+    for (const auto& f : files) {
+      if (f.find(".sst") != std::string::npos) {
+        sst_files.push_back(f);
+      }
+    }
+    std::sort(sst_files.begin(), sst_files.end());
+    return sst_files;
+  };
+
+  // Helper function to get SST file numbers in metadata
+  auto GetSstFilesInMetadata = [this]() -> std::vector<uint64_t> {
+    std::vector<uint64_t> sst_file_numbers;
+    ColumnFamilyMetaData cf_meta;
+    dbfull()->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta);
+    for (const auto& level : cf_meta.levels) {
+      for (const auto& file : level.files) {
+        // Extract file number from the file name (e.g., "000010.sst" -> 10)
+        uint64_t file_num = 0;
+        std::string fname = file.name;
+        // Remove leading path separators if present
+        size_t pos = fname.rfind('/');
+        if (pos != std::string::npos) {
+          fname = fname.substr(pos + 1);
+        }
+        if (sscanf(fname.c_str(), "%" PRIu64, &file_num) == 1) {
+          sst_file_numbers.push_back(file_num);
+        }
+      }
+    }
+    std::sort(sst_file_numbers.begin(), sst_file_numbers.end());
+    return sst_file_numbers;
+  };
+
+  std::vector<std::string> initial_blob_files = GetBlobFilesOnDisk();
+  std::vector<uint64_t> initial_meta_blobs = GetBlobFilesInMetadata();
+  std::vector<std::string> initial_sst_files = GetSstFilesOnDisk();
+  std::vector<uint64_t> initial_meta_ssts = GetSstFilesInMetadata();
+
+  ASSERT_GT(initial_blob_files.size(), 0u) << "Expected initial blob files";
+  ASSERT_EQ(initial_blob_files.size(), initial_meta_blobs.size())
+      << "Initial blob files should match between disk and metadata";
+  ASSERT_GT(initial_sst_files.size(), 0u) << "Expected initial SST files";
+  ASSERT_EQ(initial_sst_files.size(), initial_meta_ssts.size())
+      << "Initial SST files should match between disk and metadata";
+
+  // Tracking variables for blob file lifecycle
+  std::atomic<int> blob_writes{0};
+  std::atomic<bool> abort_triggered{false};
+  AbortSynchronizer abort_sync;
+
+  // Set up dependency: the wait point will block until FlagSet is hit
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::AbortAllCompactions:FlagSet",
+       "DBCompactionAbortTest::InProgressBlob:WaitForAbort"},
+  });
+
+  // Trigger abort after some blob writes during compaction output.
+  // This ensures we have an in-progress blob file when abort happens.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileBuilder::WriteBlobToFile:AddRecord", [&](void* /*arg*/) {
+        int count = blob_writes.fetch_add(1) + 1;
+
+        // Trigger abort after 100 blob writes - this ensures:
+        // 1. A blob file has been opened (for writing)
+        // 2. Some data has been written to it
+        // 3. But it's not yet completed (blob_file_size is 1MB)
+        if (count == 100 && !abort_triggered.exchange(true)) {
+          abort_sync.TriggerAbort(dbfull());
+          // Wait for abort flag to be set - this sync point blocks until
+          // FlagSet is processed
+          TEST_SYNC_POINT_CALLBACK(
+              "DBCompactionAbortTest::InProgressBlob:WaitForAbort", nullptr);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Run compaction - it should be aborted while blob file is in-progress
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+
+  ASSERT_TRUE(s.IsIncomplete())
+      << "Expected compaction to be aborted, got: " << s.ToString();
+
+  CleanupSyncPoints();
+  abort_sync.WaitForAbortCompletion();
+
+  // Check state after abort
+  std::vector<std::string> post_abort_disk_blobs = GetBlobFilesOnDisk();
+  std::vector<uint64_t> post_abort_meta_blobs = GetBlobFilesInMetadata();
+  std::vector<std::string> post_abort_disk_ssts = GetSstFilesOnDisk();
+  std::vector<uint64_t> post_abort_meta_ssts = GetSstFilesInMetadata();
+
+  // This is the key assertion for blob files: files on disk should match
+  // metadata. If the in-progress blob file was NOT cleaned up, there will be an
+  // extra file on disk that's not in metadata (orphan).
+  ASSERT_EQ(post_abort_disk_blobs.size(), post_abort_meta_blobs.size())
+      << "Orphan blob file detected! In-progress blob file was not cleaned up "
+         "after abort. Files on disk: "
+      << post_abort_disk_blobs.size()
+      << ", Files in metadata: " << post_abort_meta_blobs.size()
+      << ". The difference indicates orphaned in-progress blob file(s).";
+
+  // This is the key assertion for SST files: files on disk should match
+  // metadata. If the in-progress SST file was NOT cleaned up, there will be an
+  // extra file on disk that's not in metadata (orphan).
+  ASSERT_EQ(post_abort_disk_ssts.size(), post_abort_meta_ssts.size())
+      << "Orphan SST file detected! In-progress SST file was not cleaned up "
+         "after abort. Files on disk: "
+      << post_abort_disk_ssts.size()
+      << ", Files in metadata: " << post_abort_meta_ssts.size()
+      << ". The difference indicates orphaned in-progress SST file(s).";
+
+  // Resume and complete compaction to verify DB is still functional
+  dbfull()->ResumeAllCompactions();
+
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // Verify data integrity - we wrote 4 files * 2000 keys with overlapping keys
+  VerifyDataIntegrity(/*num_keys=*/2000);
+}
+
+TEST_F(DBCompactionAbortTest, AbortBottommostLevelCompaction) {
+  Options options = CurrentOptions();
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 1024 * 10;  // 10KB
+  options.max_bytes_for_level_multiplier = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  // Write data to fill multiple levels (non-overlapping keys)
+  PopulateData(/*num_files=*/6, /*keys_per_file=*/100,
+               /*value_size=*/500, /*overlapping=*/false);
+
+  // First compact to push data to lower levels
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Write more data to L0 (overlapping keys)
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/500);
+
+  SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start");
+  helper.Setup(dbfull());
+
+  // Trigger bottommost level compaction
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  helper.CleanupAndWait();
+  dbfull()->ResumeAllCompactions();
+
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  VerifyDataIntegrity(/*num_keys=*/600);
+}
+
+// Test that while compactions are aborted, atomic range replace
+// (IngestExternalFiles with atomic_replace_range) works correctly.
+// This verifies that the abort state doesn't block other write operations
+// like atomic range replace.
+TEST_F(DBCompactionAbortTest, AbortThenAtomicRangeReplace) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  // Create a directory for SST files
+  std::string sst_files_dir = dbname_ + "_sst_files/";
+  ASSERT_OK(env_->CreateDirIfMissing(sst_files_dir));
+
+  // Populate initial data with overlapping keys
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/500);
+
+  // Verify initial data
+  VerifyDataIntegrity(/*num_keys=*/100);
+
+  // Trigger compaction and abort it
+  SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start");
+  helper.Setup(dbfull());
+
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  helper.CleanupAndWait();
+
+  // While compaction is still aborted, perform atomic range replace using
+  // IngestExternalFiles with atomic_replace_range. This verifies that the
+  // abort state doesn't block other write operations.
+  // Using RangeOpt() (empty range) means replace everything in the CF.
+
+  // Create an SST file with new data for keys 0-49 (replacing keys 0-99)
+  std::string sst_file_path = sst_files_dir + "atomic_replace_1.sst";
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+  ASSERT_OK(sst_file_writer.Open(sst_file_path));
+
+  // Write new values for keys 0-49
+  Random rnd(42);
+  std::unordered_map<std::string, std::string> new_values;
+  for (int j = 0; j < 50; ++j) {
+    std::string key = Key(j);
+    std::string value = "replaced_" + rnd.RandomString(100);
+    ASSERT_OK(sst_file_writer.Put(key, value));
+    new_values[key] = value;
+  }
+  ASSERT_OK(sst_file_writer.Finish());
+
+  // Perform atomic range replace for the entire column family.
+  // Using RangeOpt() (default constructor) means replace everything in the CF.
+  IngestExternalFileArg arg;
+  arg.column_family = db_->DefaultColumnFamily();
+  arg.external_files = {sst_file_path};
+  arg.atomic_replace_range = RangeOpt();
+  // snapshot_consistency must be false when using atomic_replace_range
+  arg.options.snapshot_consistency = false;
+
+  // Atomic range replace should work even while compactions are aborted
+  ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+  // Now resume compactions after the atomic range replace
+  dbfull()->ResumeAllCompactions();
+
+  // Verify that the atomic range replace worked correctly:
+  // 1. Keys 0-49 should have new replaced values
+  std::string val;
+  for (int j = 0; j < 50; ++j) {
+    std::string key = Key(j);
+    ASSERT_OK(db_->Get(ReadOptions(), key, &val));
+    auto it = new_values.find(key);
+    ASSERT_NE(it, new_values.end());
+    ASSERT_EQ(it->second, val) << "Value mismatch for replaced key: " << key;
+  }
+
+  // 2. Keys 50-99 should not exist (they were replaced/deleted by atomic
+  // replace)
+  for (int j = 50; j < 100; ++j) {
+    std::string key = Key(j);
+    Status get_status = db_->Get(ReadOptions(), key, &val);
+    ASSERT_TRUE(get_status.IsNotFound())
+        << "Key " << key << " should not exist after full CF replace";
+  }
+
+  // Clean up SST files directory
+  ASSERT_OK(DestroyDir(env_, sst_files_dir));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index a17a5a6ebe02..a97d3461501a 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -19,6 +19,7 @@
 #include "rocksdb/advanced_options.h"
 #include "rocksdb/concurrent_task_limiter.h"
 #include "rocksdb/experimental.h"
+#include "rocksdb/iostats_context.h"
 #include "rocksdb/sst_file_writer.h"
 #include "test_util/mock_time_env.h"
 #include "test_util/sync_point.h"
@@ -74,6 +75,43 @@ class CompactionStatsCollector : public EventListener {
   std::vector<std::atomic<int>> compaction_completed_;
 };
 
+class DeletionTriggeredCompactionWithMinFileSizeTestListener
+    : public EventListener {
+ public:
+  explicit DeletionTriggeredCompactionWithMinFileSizeTestListener(
+      uint64_t min_file_size)
+      : min_file_size_(min_file_size) {}
+
+  void OnCompactionBegin(DB* db, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.compaction_reason,
+              CompactionReason::kFilesMarkedForCompaction);
+
+    auto env = db->GetEnv();
+    const std::vector<DbPath>& db_paths = db->GetOptions().db_paths;
+    for (const auto& file : ci.input_file_infos) {
+      uint64_t file_size = GetSstFileSize(env, db_paths, file.file_number);
+
+      // Assert that the file size respects the minimum threshold
+      ASSERT_GE(file_size, min_file_size_);
+    }
+  }
+
+ private:
+  static uint64_t GetSstFileSize(Env* env, const std::vector<DbPath>& db_paths,
+                                 uint64_t file_number) {
+    uint32_t path_id = 0;  // since only one path
+    std::string sst_file_name = TableFileName(db_paths, file_number, path_id);
+    uint64_t file_size = 0;
+    Status s = env->GetFileSize(sst_file_name, &file_size);
+    if (!s.ok()) {
+      return 0;
+    }
+    return file_size;
+  }
+
+  uint64_t min_file_size_;
+};
+
 class DBCompactionTest : public DBTestBase {
  public:
   DBCompactionTest()
@@ -127,6 +165,19 @@ class DBCompactionTestWithParam
     exclusive_manual_compaction_ = std::get<1>(GetParam());
   }
 
+  class TrivialMoveEventListener : public EventListener {
+   public:
+    explicit TrivialMoveEventListener(size_t expected_trivially_moved_files)
+        : expected_trivially_moved_files_(expected_trivially_moved_files) {}
+    void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+      ASSERT_EQ(ci.stats.num_input_files_trivially_moved,
+                expected_trivially_moved_files_);
+    }
+
+   private:
+    size_t expected_trivially_moved_files_ = 0;
+  };
+
   // Required if inheriting from testing::WithParamInterface<>
   static void SetUpTestCase() {}
   static void TearDownTestCase() {}
@@ -442,6 +493,72 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) {
   }
 }
 #endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_F(DBCompactionTest, UniversalReduceFileLockingRepickNothing) {
+  const int kFileNumCompactionTrigger = 3;
+
+  Options options = CurrentOptions();
+  options.compaction_options_universal.reduce_file_locking = true;
+  // Set `max_background_jobs` to be 3 to allow low and bottom priority thread
+  // to run compaction together
+  options.max_background_jobs = 3;
+  Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+  options.num_levels = 3;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = kFileNumCompactionTrigger;
+  options.compaction_options_universal.max_size_amplification_percent = 1;
+
+  DestroyAndReopen(options);
+
+  // Need to get a token to enable compaction parallelism up to
+  // `max_background_compactions` jobs.
+  auto pressure_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {// Wait for the full (bottom-priority) compaction to be pre-picked as an
+       // intent (that is allowing files to be picked by other compactions and
+       // will pick later when the bottom-priority thread is available to
+       // execute the compaction) before triggering the low-priority compaction.
+       {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+        "LowPriCompaction"},
+       // Wait for low-priority compaction to start before
+       // repicking for the full compaction intent (bottom-priority), enabling
+       // them to run in parallel.
+       {"DBImpl::BackgroundCompaction:NonTrivial",
+        "DBImpl::BGWorkBottomCompaction"}});
+
+  bool bottom_pri_compaction_attempt_repick = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction():AfterPickCompactionBottomPri",
+      [&](void* arg) {
+        bottom_pri_compaction_attempt_repick = true;
+        Compaction* c = static_cast<Compaction*>(arg);
+        // Verify the intended full compaction for bottom priority thread does
+        // not get to run (i.e, output to bottommost level) since when it
+        // repicks its files, some of the the intended input files are already
+        // compacted by the low priority thread
+        assert(c == nullptr);
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < kFileNumCompactionTrigger; ++i) {
+    if (i == 0) {
+      ASSERT_OK(Put("file_locked_for_bottom_pri_compaction", "value"));
+    } else {
+      ASSERT_OK(
+          Put("file_not_locked_for_bottom_pri_compaction" + std::to_string(i),
+              "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  TEST_SYNC_POINT("LowPriCompaction");
+  ASSERT_OK(Put("a_new_file_to_pick_for_low_pri_compaction", "value"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(bottom_pri_compaction_attempt_repick);
+}
 
 TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
   // This test verify UpdateAccumulatedStats is not on
@@ -1292,6 +1409,89 @@ TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) {
   } while (ChangeOptions());
 }
 
+TEST_F(DBCompactionTest, CompactionWithDeletionsAndMinFileSize) {
+  const uint64_t kMinFileSize = 32 * 1024;  // 32KB
+  const int kDeletionTriggerCount = 50;
+  const int kInitialKeyCount = 100;
+  const int kAdditionalKeyCount = 50;
+  const int kValueSize = 1024;
+  const int kSmallValueSize = 512;
+  const int kSeed = 301;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 1024 * 1024;  // 1MB
+  options.level0_file_num_compaction_trigger = 100;
+
+  options.table_properties_collector_factories = {
+      NewCompactOnDeletionCollectorFactory(
+          kInitialKeyCount /* sliding window size */, kDeletionTriggerCount,
+          0.5 /* deletion ratio */, kMinFileSize)};
+  auto listener =
+      new DeletionTriggeredCompactionWithMinFileSizeTestListener(kMinFileSize);
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+  Random rnd(kSeed);
+
+  // Create a large file that will be subject to DTC later
+  for (int i = 0; i < kInitialKeyCount; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+  }
+  ASSERT_OK(Flush());
+
+  std::vector<LiveFileMetaData> initial_metadata;
+  db_->GetLiveFilesMetaData(&initial_metadata);
+  ASSERT_EQ(initial_metadata.size(), 1);
+
+  // Create small files that should not trigger compaction
+  ASSERT_OK(Put("small_file_key1", rnd.RandomString(kSmallValueSize)));
+  ASSERT_OK(Put("small_file_key2", rnd.RandomString(kSmallValueSize)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Delete("small_file_key1"));
+  ASSERT_OK(Flush());
+
+  // Create a file with enough deletions and size to trigger DTC
+  // Delete keys from the large file to reach deletion threshold
+  for (int i = 0; i < kDeletionTriggerCount; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+
+  // Add new keys to ensure the deletion file meets the min_file_size threshold
+  for (int i = kInitialKeyCount; i < kInitialKeyCount + kAdditionalKeyCount;
+       i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify file count after compaction
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);  // Small file and deletion file
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);  // Compacted large file
+
+  // Verify deleted keys are gone
+  for (int i = 0; i < kDeletionTriggerCount; i++) {
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+  }
+
+  // Verify non-deleted keys from large file are still accessible
+  for (int i = kDeletionTriggerCount; i < kInitialKeyCount; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    ASSERT_EQ(value.size(), kValueSize);
+  }
+
+  // Verify new keys are accessible
+  for (int i = kInitialKeyCount; i < kInitialKeyCount + kAdditionalKeyCount;
+       i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    ASSERT_EQ(value.size(), kValueSize);
+  }
+}
+
 TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
   int32_t trivial_move = 0;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
@@ -1301,6 +1501,9 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
 
   Options options = CurrentOptions();
   options.write_buffer_size = 100000000;
+  TrivialMoveEventListener* trivial_move_listener =
+      new TrivialMoveEventListener(1 /*expected_trivially_moved_files*/);
+  options.listeners.emplace_back(trivial_move_listener);
   options.max_subcompactions = max_subcompactions_;
   DestroyAndReopen(options);
 
@@ -1361,6 +1564,10 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
 
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
+  // 8 is number of `ranges` that each is a non overlapping file.
+  TrivialMoveEventListener* trivial_move_listener =
+      new TrivialMoveEventListener(8 /*expected_trivially_moved_files*/);
+  options.listeners.emplace_back(trivial_move_listener);
   options.write_buffer_size = 10 * 1024 * 1024;
   options.max_subcompactions = max_subcompactions_;
 
@@ -1408,6 +1615,11 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
   trivial_move = 0;
   non_trivial_move = 0;
   values.clear();
+  options.listeners.clear();
+  // Same ranges of files, but now overlapping, trivial move not applicable.
+  TrivialMoveEventListener* trivial_move_listener2 =
+      new TrivialMoveEventListener(0 /*expected_trivially_moved_files*/);
+  options.listeners.emplace_back(trivial_move_listener2);
   DestroyAndReopen(options);
   // Same ranges as above but overlapping
   ranges = {
@@ -1455,6 +1667,11 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) {
 
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
+  // Two non overlapping files in L0 trivialy moved:
+  // file 1 [0 => 300], file 2 [600 => 700]
+  TrivialMoveEventListener* trivial_move_listener1 =
+      new TrivialMoveEventListener(2 /*expected_trivially_moved_files*/);
+  options.listeners.emplace_back(trivial_move_listener1);
   options.write_buffer_size = 10 * 1024 * 1024;
   options.num_levels = 7;
   options.max_subcompactions = max_subcompactions_;
@@ -1991,7 +2208,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRange) {
   std::string end_string = Key(2000);
   Slice begin(begin_string);
   Slice end(end_string);
-  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  ASSERT_OK(
+      DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin, &end));
 
   int32_t deleted_count = 0;
   for (int32_t i = 0; i < 4300; i++) {
@@ -2012,8 +2230,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRange) {
   Slice begin1(begin_string);
   Slice end1(end_string);
   // Try deleting files in range which contain no keys
-  ASSERT_OK(
-      DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1));
+  ASSERT_OK(DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin1,
+                               &end1));
 
   // Push data from level 0 to level 1 to force all data to be deleted
   // Note that we don't delete level 0 files
@@ -2022,8 +2240,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRange) {
   ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-  ASSERT_OK(
-      DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr));
+  ASSERT_OK(DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), nullptr,
+                               nullptr));
 
   int32_t deleted_count2 = 0;
   for (int32_t i = 0; i < 4300; i++) {
@@ -2087,14 +2305,11 @@ TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) {
     auto begin_str1 = Key(0), end_str1 = Key(100);
     auto begin_str2 = Key(100), end_str2 = Key(200);
     auto begin_str3 = Key(200), end_str3 = Key(299);
-    Slice begin1(begin_str1), end1(end_str1);
-    Slice begin2(begin_str2), end2(end_str2);
-    Slice begin3(begin_str3), end3(end_str3);
-    std::vector<RangePtr> ranges;
-    ranges.emplace_back(&begin1, &end1);
-    ranges.emplace_back(&begin2, &end2);
-    ranges.emplace_back(&begin3, &end3);
-    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+    std::vector<RangeOpt> ranges;
+    ranges.emplace_back(begin_str1, end_str1);
+    ranges.emplace_back(begin_str2, end_str2);
+    ranges.emplace_back(begin_str3, end_str3);
+    ASSERT_OK(DeleteFilesInRanges(db_.get(), db_->DefaultColumnFamily(),
                                   ranges.data(), ranges.size()));
     ASSERT_EQ("0,3,7", FilesPerLevel(0));
 
@@ -2121,7 +2336,7 @@ TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) {
     ranges.emplace_back(&begin1, &end1);
     ranges.emplace_back(&begin2, &end2);
     ranges.emplace_back(&begin3, &end3);
-    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+    ASSERT_OK(DeleteFilesInRanges(db_.get(), db_->DefaultColumnFamily(),
                                   ranges.data(), ranges.size(), false));
     ASSERT_EQ("0,1,4", FilesPerLevel(0));
 
@@ -2141,8 +2356,9 @@ TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) {
 
   // Delete all files.
   {
-    RangePtr range;
-    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1));
+    RangeOpt range;
+    ASSERT_OK(
+        DeleteFilesInRanges(db_.get(), db_->DefaultColumnFamily(), &range, 1));
     ASSERT_EQ("", FilesPerLevel(0));
 
     for (auto i = 0; i < 1000; i++) {
@@ -2204,7 +2420,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRangeFileEndpointsOverlapBug) {
   // "1 -> vals[0]" to reappear.
   std::string begin_str = Key(0), end_str = Key(1);
   Slice begin = begin_str, end = end_str;
-  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  ASSERT_OK(
+      DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin, &end));
   ASSERT_EQ(vals[1], GetValue(Key(1)));
 
   db_->ReleaseSnapshot(snapshot);
@@ -2797,46 +3014,99 @@ TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
 }
 
 TEST_F(DBCompactionTest, ManualAutoRace) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
-       {"DBImpl::RunManualCompaction:WaitScheduled",
-        "BackgroundCallCompaction:0"}});
+  const int kNumL0FilesTrigger = 4;
+  // Verify that the auto compaction is retried after the conflicting exclusive
+  // manual compaction finishes for:
+  // 1. Non-bottom-priority compactions (tested with level compaction)
+  // 2. Bottom-priority compactions (tested with universal compaction)
+  for (auto compaction_style :
+       {kCompactionStyleLevel, kCompactionStyleUniversal}) {
+    Env::Default()->SetBackgroundThreads(
+        compaction_style == kCompactionStyleUniversal ? 2 : 0,
+        Env::Priority::BOTTOM);
+    for (auto universal_reduce_file_locking : {false, true}) {
+      if (compaction_style != kCompactionStyleUniversal &&
+          universal_reduce_file_locking) {
+        continue;
+      }
 
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+      Options options = CurrentOptions();
+      options.num_levels = 3;
+      options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+      options.compaction_style = compaction_style;
+      options.compaction_options_universal.reduce_file_locking =
+          universal_reduce_file_locking;
 
-  ASSERT_OK(Put(1, "foo", ""));
-  ASSERT_OK(Put(1, "bar", ""));
-  ASSERT_OK(Flush(1));
-  ASSERT_OK(Put(1, "foo", ""));
-  ASSERT_OK(Put(1, "bar", ""));
-  // Generate four files in CF 0, which should trigger an auto compaction
-  ASSERT_OK(Put("foo", ""));
-  ASSERT_OK(Put("bar", ""));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put("foo", ""));
-  ASSERT_OK(Put("bar", ""));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put("foo", ""));
-  ASSERT_OK(Put("bar", ""));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put("foo", ""));
-  ASSERT_OK(Put("bar", ""));
-  ASSERT_OK(Flush());
+      DestroyAndReopen(options);
+      CreateAndReopenWithCF({"exclusive_manual_compaction_cf"}, options);
 
-  // The auto compaction is scheduled but waited until here
-  TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
-  // The auto compaction will wait until the manual compaction is registerd
-  // before processing so that it will be cancelled.
-  CompactRangeOptions cro;
-  cro.exclusive_manual_compaction = true;
-  ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr));
-  ASSERT_EQ("0,1", FilesPerLevel(1));
+      // Set up sync points to ensure that the auto compaction
+      // encounters a conflict from exclusive manual compaction before the auto
+      // compaction gets to pick files, This will trigger a retry later.
+      //
+      // Specifically, the sync points are set up as following:
+      // 1. Wait until background low-pri scheduled (not picking files yet) or
+      // bottom-pri scheduled (not repicking files yet) for
+      // `universal_reduce_file_locking = true` before triggering
+      // CompactRange()
+      //
+      // 2. Wait until the triggered CompactRange()
+      // registers its compaction and creates conflict before the auto
+      // compaction picks or repicks files for the background compaction.
+      if (compaction_style == kCompactionStyleLevel ||
+          !universal_reduce_file_locking) {
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+            {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
+             {"DBImpl::RunManualCompaction:WaitScheduled",
+              "BackgroundCallCompaction:0"}});
+      } else {
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+            {{"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+              "DBCompactionTest::ManualAutoRace:1"},
+             {"DBImpl::RunManualCompaction:WaitScheduled",
+              "BackgroundCallCompaction:0:BottomPri"}});
+      }
 
-  // Eventually the cancelled compaction will be rescheduled and executed.
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  ASSERT_EQ("0,1", FilesPerLevel(0));
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      bool encounter_conflict = false;
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "DBImpl::BackgroundCompaction()::Conflict",
+          [&](void* /*arg*/) { encounter_conflict = true; });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+      // Generate files in CF 1 for exclusive CompactRange()
+      ASSERT_OK(Put(1, "foo", ""));
+      ASSERT_OK(Put(1, "bar", ""));
+      ASSERT_OK(Flush(1));
+      ASSERT_OK(Put(1, "foo", ""));
+      ASSERT_OK(Put(1, "bar", ""));
+      // Generate files in CF0 to trigger full compaction
+      for (int i = 0; i < kNumL0FilesTrigger; ++i) {
+        ASSERT_OK(Put("foo", ""));
+        ASSERT_OK(Put("bar", ""));
+        ASSERT_OK(Flush());
+      }
+
+      TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
+      CompactRangeOptions cro;
+      cro.exclusive_manual_compaction = true;
+      ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr));
+      ASSERT_EQ(compaction_style == kCompactionStyleLevel ? "0,1" : "0,0,1",
+                FilesPerLevel(1));
+
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+      ASSERT_TRUE(encounter_conflict);
+
+      // Verify that the auto compaction is eventually executed after the
+      // exclusive CompactRange() finishes.
+      ASSERT_EQ(compaction_style == kCompactionStyleLevel ? "0,1" : "0,0,1",
+                FilesPerLevel(0));
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    }
+    Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+  }
 }
 
 TEST_P(DBCompactionTestWithParam, ManualCompaction) {
@@ -3390,7 +3660,7 @@ TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) {
 
   GenerateNewRandomFile(&rnd, /* nowait */ true);
   ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
-  ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+  ASSERT_OK(experimental::SuggestCompactRange(db_.get(), nullptr, nullptr));
   for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
        num++) {
     GenerateNewRandomFile(&rnd, /* nowait */ true);
@@ -3959,41 +4229,51 @@ TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) {
 TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) {
   const int kNumFilesTrigger = 3;
   Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
-  for (bool use_universal_compaction : {false, true}) {
-    Options options = CurrentOptions();
-    if (use_universal_compaction) {
-      options.compaction_style = kCompactionStyleUniversal;
-    } else {
-      options.compaction_style = kCompactionStyleLevel;
-      options.level_compaction_dynamic_level_bytes = true;
-    }
-    options.num_levels = 4;
-    options.write_buffer_size = 100 << 10;     // 100KB
-    options.target_file_size_base = 32 << 10;  // 32KB
-    options.level0_file_num_compaction_trigger = kNumFilesTrigger;
-    // Trigger compaction if size amplification exceeds 110%
-    options.compaction_options_universal.max_size_amplification_percent = 110;
-    DestroyAndReopen(options);
+  for (auto compaction_style :
+       {kCompactionStyleLevel, kCompactionStyleUniversal}) {
+    for (auto universal_reduce_file_locking : {false, true}) {
+      if (compaction_style != kCompactionStyleUniversal &&
+          universal_reduce_file_locking) {
+        continue;
+      }
+      Options options = CurrentOptions();
+      options.compaction_style = compaction_style;
+      if (compaction_style == kCompactionStyleLevel) {
+        options.level_compaction_dynamic_level_bytes = true;
+      } else {
+        options.compaction_options_universal.reduce_file_locking =
+            universal_reduce_file_locking;
+        // Trigger compaction if size amplification exceeds 110%
+        options.compaction_options_universal.max_size_amplification_percent =
+            110;
+      }
+      options.num_levels = 4;
+      options.write_buffer_size = 100 << 10;     // 100KB
+      options.target_file_size_base = 32 << 10;  // 32KB
+      options.level0_file_num_compaction_trigger = kNumFilesTrigger;
 
-    int num_bottom_pri_compactions = 0;
-    SyncPoint::GetInstance()->SetCallBack(
-        "DBImpl::BGWorkBottomCompaction",
-        [&](void* /*arg*/) { ++num_bottom_pri_compactions; });
-    SyncPoint::GetInstance()->EnableProcessing();
+      DestroyAndReopen(options);
 
-    Random rnd(301);
-    for (int num = 0; num < kNumFilesTrigger; num++) {
-      ASSERT_EQ(NumSortedRuns(), num);
-      int key_idx = 0;
-      GenerateNewFile(&rnd, &key_idx);
-    }
-    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      int num_bottom_pri_compactions = 0;
+      SyncPoint::GetInstance()->SetCallBack(
+          "DBImpl::BGWorkBottomCompaction",
+          [&](void* /*arg*/) { ++num_bottom_pri_compactions; });
+      SyncPoint::GetInstance()->EnableProcessing();
 
-    ASSERT_EQ(1, num_bottom_pri_compactions);
+      Random rnd(301);
+      for (int num = 0; num < kNumFilesTrigger; num++) {
+        ASSERT_EQ(NumSortedRuns(), num);
+        int key_idx = 0;
+        GenerateNewFile(&rnd, &key_idx);
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-    // Verify that size amplification did occur
-    ASSERT_EQ(NumSortedRuns(), 1);
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      ASSERT_EQ(1, num_bottom_pri_compactions);
+
+      // Verify that size amplification did occur
+      ASSERT_EQ(NumSortedRuns(), 1);
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    }
   }
   Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
 }
@@ -4256,7 +4536,8 @@ TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) {
   std::string end_string = Key(kMaxKey + 1);
   Slice begin(begin_string);
   Slice end(end_string);
-  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  ASSERT_OK(
+      DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin, &end));
   SyncPoint::GetInstance()->DisableProcessing();
 }
 
@@ -5912,6 +6193,9 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
       ASSERT_EQ(running_compactions_.find(ci.job_id),
                 running_compactions_.end());
       running_compactions_.emplace(ci.job_id, std::unordered_set<int>());
+      if (expected_num_l0_files_pre_compaction_ != -1) {
+        ASSERT_EQ(expected_num_l0_files_pre_compaction_, ci.num_l0_files);
+      }
     }
 
     void OnCompactionCompleted(DB* /*db*/,
@@ -5921,6 +6205,9 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
       ASSERT_NE(it, running_compactions_.end());
       ASSERT_EQ(it->second.size(), 0);
       running_compactions_.erase(it);
+      if (expected_num_l0_files_post_compaction_ != -1) {
+        ASSERT_EQ(expected_num_l0_files_post_compaction_, ci.num_l0_files);
+      }
     }
 
     void OnSubcompactionBegin(const SubcompactionJobInfo& si) override {
@@ -5950,10 +6237,25 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
       return total_subcompaction_cnt_;
     }
 
+    void SetExpectedNumL0FilesPreCompaction(int num) {
+      expected_num_l0_files_pre_compaction_ = num;
+    }
+
+    void SetExpectedNumL0FilesPostCompaction(int num) {
+      expected_num_l0_files_post_compaction_ = num;
+    }
+
+    void ResetExpectedNumL0Files() {
+      SetExpectedNumL0FilesPreCompaction(-1);
+      SetExpectedNumL0FilesPostCompaction(-1);
+    }
+
    private:
     InstrumentedMutex mutex_;
     std::unordered_map<int, std::unordered_set<int>> running_compactions_;
     size_t total_subcompaction_cnt_ = 0;
+    int expected_num_l0_files_pre_compaction_ = -1;
+    int expected_num_l0_files_post_compaction_ = -1;
   };
 
   Options options = CurrentOptions();
@@ -5973,6 +6275,7 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
     ASSERT_OK(Flush());
   }
   MoveFilesToLevel(2);
+  ASSERT_EQ(FilesPerLevel(), "0,0,4");
 
   // generate 2 files @ L1 which overlaps with L2 files
   for (int i = 0; i < 2; i++) {
@@ -5982,11 +6285,18 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
     }
     ASSERT_OK(Flush());
   }
+  listener->SetExpectedNumL0FilesPreCompaction(2 /* num */);
+  listener->SetExpectedNumL0FilesPostCompaction(0 /* num */);
+
   MoveFilesToLevel(1);
   ASSERT_EQ(FilesPerLevel(), "0,2,4");
 
+  listener->ResetExpectedNumL0Files();
+
   CompactRangeOptions comp_opts;
   comp_opts.max_subcompactions = 4;
+
+  listener->SetExpectedNumL0FilesPreCompaction(0 /* num */);
   Status s = dbfull()->CompactRange(comp_opts, nullptr, nullptr);
   ASSERT_OK(s);
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
@@ -5994,6 +6304,8 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
   ASSERT_EQ(listener->GetRunningCompactionCount(), 0);
   // and sub compaction is triggered
   ASSERT_GT(listener->GetTotalSubcompactionCount(), 0);
+
+  listener->ResetExpectedNumL0Files();
 }
 
 TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) {
@@ -6561,7 +6873,11 @@ INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstPressureToken,
                         RoundRobinSubcompactionsAgainstPressureToken,
                         testing::Bool());
 
-TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
+// FIXME: the test is flaky and failing the assertion
+// ASSERT_EQ(actual_reserved_threads, expected_reserved_threads);
+// It's likely a test set up issue, fix if we are to use RoubdRobin compaction.
+TEST_P(RoundRobinSubcompactionsAgainstResources,
+       DISABLED_SubcompactionsUsingResources) {
   const int kKeysPerBuffer = 200;
   Options options = CurrentOptions();
   options.num_levels = 4;
@@ -6576,7 +6892,7 @@ TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
   // compaction is enough to make post-compaction L1 size less than
   // the maximum size (this test assumes only one round-robin compaction
   // is triggered by kLevelMaxLevelSize)
-  options.max_compaction_bytes = 100000000;
+  options.max_compaction_bytes = std::numeric_limits<uint64_t>::max();
 
   DestroyAndReopen(options);
   env_->SetBackgroundThreads(total_low_pri_threads_, Env::LOW);
@@ -6609,41 +6925,33 @@ TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
         // More than 10 files are selected for round-robin under auto
         // compaction. The number of planned subcompaction is restricted by
         // the minimum number between available threads and compaction limits
-        ASSERT_EQ(num_planned_subcompactions - options.max_subcompactions,
-                  std::min(total_low_pri_threads_, max_compaction_limits_) - 1);
+        auto actual_reserved_threads =
+            num_planned_subcompactions - options.max_subcompactions;
+        auto expected_reserved_threads =
+            std::min(total_low_pri_threads_, max_compaction_limits_) - 1;
+        ASSERT_EQ(actual_reserved_threads, expected_reserved_threads);
         num_planned_subcompactions_verified = true;
       });
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"RoundRobinSubcompactionsAgainstResources:0",
-        "BackgroundCallCompaction:0"},
-       {"CompactionJob::AcquireSubcompactionResources:0",
-        "RoundRobinSubcompactionsAgainstResources:1"},
-       {"RoundRobinSubcompactionsAgainstResources:2",
-        "CompactionJob::AcquireSubcompactionResources:1"},
-       {"CompactionJob::ReleaseSubcompactionResources:0",
-        "RoundRobinSubcompactionsAgainstResources:3"},
-       {"RoundRobinSubcompactionsAgainstResources:4",
-        "CompactionJob::ReleaseSubcompactionResources:1"}});
+
+  int acquire_count = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::AcquireSubcompactionResources:0",
+      [&](void* /*arg*/) { acquire_count++; });
+  int release_count = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::ReleaseSubcompactionResources",
+      [&](void* /*arg*/) { release_count++; });
+
   SyncPoint::GetInstance()->EnableProcessing();
 
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
-  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0");
-  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1");
   auto pressure_token =
       dbfull()->TEST_write_controler().GetCompactionPressureToken();
-
-  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:2");
-  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:3");
-  // We can reserve more threads now except one is being used
-  ASSERT_EQ(total_low_pri_threads_ - 1,
-            env_->ReserveThreads(total_low_pri_threads_, Env::Priority::LOW));
-  ASSERT_EQ(
-      total_low_pri_threads_ - 1,
-      env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW));
-  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4");
+  ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
   ASSERT_TRUE(num_planned_subcompactions_verified);
+  ASSERT_EQ(acquire_count, release_count);
+
   SyncPoint::GetInstance()->DisableProcessing();
   SyncPoint::GetInstance()->ClearAllCallBacks();
 }
@@ -6825,6 +7133,70 @@ TEST_F(DBCompactionTest, PartialManualCompaction) {
   ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
 }
 
+TEST_F(DBCompactionTest, ConcurrentFIFOPickingSameFileBug) {
+  Options opts = CurrentOptions();
+  opts.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  opts.num_levels = 3;
+  opts.disable_auto_compactions = true;
+  opts.max_background_jobs = 3;
+
+  DestroyAndReopen(opts);
+
+  ASSERT_OK(Put("k1", "v1"));
+  ASSERT_OK(Flush());
+
+  // Create a non-L0 SST file for multi-level FIFO size-based compaction later
+  MoveFilesToLevel(2);
+
+  Options opts_new(opts);
+  opts_new.compaction_style = CompactionStyle::kCompactionStyleFIFO;
+  opts_new.max_open_files = -1;
+  // Set a low threshold to trigger multi-level size-based compaction
+  opts_new.compaction_options_fifo.max_table_files_size = 1;
+
+  Reopen(opts_new);
+
+  const CompactRangeOptions cro;
+  const Slice begin_key("k1");
+  const Slice end_key("k2");
+
+  std::unique_ptr<port::Thread> concurrent_compaction;
+
+  bool within_first_compaction = true;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifestStart", [&](void* /*arg*/) {
+        if (!within_first_compaction) {
+          return;
+        }
+        within_first_compaction = false;
+
+        // To allow the second/concurrent compaction to still see the non-L0
+        // SST file and coerce the bug of picking that file
+        SyncPoint::GetInstance()->LoadDependency({
+            {"DBImpl::BackgroundCompaction:BeforeCompaction",
+             "VersionSet::LogAndApply:WriteManifest"},
+        });
+
+        concurrent_compaction.reset(new port::Thread([&]() {
+          // Before the fix, the second CompactRange() will either fail the
+          // assertion of double file picking `being_compacted !=
+          // inputs_[i][j]->being_compacted` in debug mode or cause LSM shape
+          // corruption "Cannot delete table file XXX from level 2 since it is
+          // not in the LSM tree" in release mode
+          Status s = db_->CompactRange(cro, &begin_key, &end_key);
+          ASSERT_OK(s);
+        }));
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  Status s = db_->CompactRange(cro, &begin_key, &end_key);
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_OK(s);
+
+  concurrent_compaction->join();
+}
+
 TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) {
   // Regression test for bug where manual compaction hangs forever when the DB
   // is in read-only mode. Verify it now at least returns, despite failing.
@@ -7472,7 +7844,7 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest {
       options_.level0_file_num_compaction_trigger = 3;
 
       CompactionOptionsFIFO fifo_options;
-      if (compaction_path_to_test == "FindIntraL0Compaction" ||
+      if (compaction_path_to_test == "PickCostBasedIntraL0Compaction" ||
           compaction_path_to_test == "CompactRange") {
         fifo_options.allow_compaction = true;
       } else if (compaction_path_to_test == "CompactFile") {
@@ -7572,7 +7944,7 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest {
 
   void SetupSyncPoints(const std::string& compaction_path_to_test) {
     compaction_path_sync_point_called_.store(false);
-    if (compaction_path_to_test == "FindIntraL0Compaction" &&
+    if (compaction_path_to_test == "PickCostBasedIntraL0Compaction" &&
         options_.compaction_style == CompactionStyle::kCompactionStyleLevel) {
       ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
           "PostPickFileToCompact", [&](void* arg) {
@@ -7582,7 +7954,7 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest {
             *picked_file_to_compact = false;
           });
       ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-          "FindIntraL0Compaction", [&](void* /*arg*/) {
+          "PickCostBasedIntraL0Compaction", [&](void* /*arg*/) {
             compaction_path_sync_point_called_.store(true);
           });
 
@@ -7618,12 +7990,12 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest {
           "PickDeleteTriggeredCompactionReturnNonnullptr", [&](void* /*arg*/) {
             compaction_path_sync_point_called_.store(true);
           });
-    } else if ((compaction_path_to_test == "FindIntraL0Compaction" ||
+    } else if ((compaction_path_to_test == "PickCostBasedIntraL0Compaction" ||
                 compaction_path_to_test == "CompactRange") &&
                options_.compaction_style ==
                    CompactionStyle::kCompactionStyleFIFO) {
       ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-          "FindIntraL0Compaction", [&](void* /*arg*/) {
+          "PickCostBasedIntraL0Compaction", [&](void* /*arg*/) {
             compaction_path_sync_point_called_.store(true);
           });
     }
@@ -7695,7 +8067,7 @@ TEST_F(DBCompactionTest, CompactFilesSupportKeyPlacementRangeConflict) {
   ASSERT_OK(Flush());
   ASSERT_OK(Put("k4", "v"));
   ASSERT_OK(Flush());
-  ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 1));
+  ASSERT_OK(experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily(), 1));
   ASSERT_EQ("0,2,1", FilesPerLevel());
 
   ASSERT_OK(Put("k2", "v"));
@@ -7783,7 +8155,7 @@ TEST_F(DBCompactionTestL0FilesMisorderCorruption,
     IngestOneKeyValue(dbfull(), Key(i), "new", options_);
   }
 
-  SetupSyncPoints("FindIntraL0Compaction");
+  SetupSyncPoints("PickCostBasedIntraL0Compaction");
   ResumeCompactionThread();
 
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
@@ -7916,7 +8288,8 @@ TEST_F(DBCompactionTestL0FilesMisorderCorruption,
 
 TEST_F(DBCompactionTestL0FilesMisorderCorruption,
        FlushAfterIntraL0FIFOCompactionWithIngestedFile) {
-  for (const std::string compaction_path_to_test : {"FindIntraL0Compaction"}) {
+  for (const std::string compaction_path_to_test :
+       {"PickCostBasedIntraL0Compaction"}) {
     SetupOptions(CompactionStyle::kCompactionStyleFIFO,
                  compaction_path_to_test);
     DestroyAndReopen(options_);
@@ -9376,105 +9749,393 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) {
 }
 
 TEST_F(DBCompactionTest, FIFOChangeTemperature) {
-  for (bool write_time_default : {false, true}) {
-    SCOPED_TRACE("write time default? " + std::to_string(write_time_default));
+  for (bool should_allow_trivial_copy : {false, true}) {
+    for (bool write_time_default : {false, true}) {
+      int32_t before_compaction_calls = 0;
+      int32_t after_compaction_calls = 0;
+      if (should_allow_trivial_copy) {
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+            "DBImpl::BackgroundCompaction:TriviaCopyBeforeCompaction",
+            [&](void*) { ++before_compaction_calls; });
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+            "DBImpl::BackgroundCompaction:TriviaCopyAfterCompaction",
+            [&](void*) { ++after_compaction_calls; });
+      } else {
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+            "DBImpl::BackgroundCompaction:BeforeCompaction",
+            [&](void*) { ++before_compaction_calls; });
 
-    Options options = CurrentOptions();
-    options.compaction_style = kCompactionStyleFIFO;
-    options.num_levels = 1;
-    options.max_open_files = -1;
-    options.level0_file_num_compaction_trigger = 2;
-    options.create_if_missing = true;
-    CompactionOptionsFIFO fifo_options;
-    fifo_options.file_temperature_age_thresholds = {{Temperature::kCold, 1000}};
-    fifo_options.max_table_files_size = 100000000;
-    options.compaction_options_fifo = fifo_options;
-    env_->SetMockSleep();
-    if (write_time_default) {
-      options.default_write_temperature = Temperature::kWarm;
-    }
-    // Should be ignored (TODO: fail?)
-    options.last_level_temperature = Temperature::kHot;
-    Reopen(options);
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+            "DBImpl::BackgroundCompaction:AfterCompaction",
+            [&](void*) { ++after_compaction_calls; });
+      }
 
-    int total_cold = 0;
-    int total_warm = 0;
-    int total_hot = 0;
-    int total_unknown = 0;
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-        "NewWritableFile::FileOptions.temperature", [&](void* arg) {
-          Temperature temperature = *(static_cast<Temperature*>(arg));
-          if (temperature == Temperature::kCold) {
-            total_cold++;
-          } else if (temperature == Temperature::kWarm) {
-            total_warm++;
-          } else if (temperature == Temperature::kHot) {
+      SCOPED_TRACE("write time default? " + std::to_string(write_time_default));
+
+      Options options = CurrentOptions();
+      options.compaction_style = kCompactionStyleFIFO;
+      options.num_levels = 1;
+      options.max_open_files = -1;
+      options.level0_file_num_compaction_trigger = 2;
+      options.create_if_missing = true;
+      CompactionOptionsFIFO fifo_options;
+      fifo_options.file_temperature_age_thresholds = {
+          {Temperature::kCold, 1000}};
+      fifo_options.max_table_files_size = 100000000;
+      fifo_options.allow_trivial_copy_when_change_temperature =
+          should_allow_trivial_copy;
+      fifo_options.trivial_copy_buffer_size = 4096;
+      options.compaction_options_fifo = fifo_options;
+      env_->SetMockSleep();
+      if (write_time_default) {
+        options.default_write_temperature = Temperature::kWarm;
+      }
+      // Should be ignored (TODO: fail?)
+      options.last_level_temperature = Temperature::kHot;
+      Reopen(options);
+
+      int total_cold = 0;
+      int total_warm = 0;
+      int total_hot = 0;
+      int total_ice = 0;
+      int total_unknown = 0;
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "NewWritableFile::FileOptions.temperature", [&](void* arg) {
+            Temperature temperature = *(static_cast<Temperature*>(arg));
+            if (temperature == Temperature::kCold) {
+              total_cold++;
+            } else if (temperature == Temperature::kWarm) {
+              total_warm++;
+            } else if (temperature == Temperature::kHot) {
+              total_hot++;
+            } else if (temperature == Temperature::kIce) {
+              total_ice++;
+            } else {
+              assert(temperature == Temperature::kUnknown);
+              total_unknown++;
+            }
+          });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+      // The file system does not support checksum handoff. The check
+      // will be ignored.
+      ASSERT_OK(Put(Key(0), "value1"));
+      env_->MockSleepForSeconds(800);
+      ASSERT_OK(Put(Key(2), "value2"));
+      ASSERT_OK(Flush());
+
+      ASSERT_OK(Put(Key(0), "value1"));
+      ASSERT_OK(Put(Key(2), "value2"));
+      ASSERT_OK(Flush());
+
+      // First two L0 files both become eligible for temperature change
+      // compaction They should be compacted one-by-one.
+      ASSERT_OK(Put(Key(0), "value1"));
+      env_->MockSleepForSeconds(1200);
+      ASSERT_OK(Put(Key(2), "value2"));
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+      if (write_time_default) {
+        // Also test dynamic option change
+        ASSERT_OK(db_->SetOptions({{"default_write_temperature", "kHot"}}));
+      }
+
+      ASSERT_OK(Put(Key(0), "value1"));
+      env_->MockSleepForSeconds(800);
+      ASSERT_OK(Put(Key(2), "value2"));
+      ASSERT_OK(Flush());
+
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+      ColumnFamilyMetaData metadata;
+      db_->GetColumnFamilyMetaData(&metadata);
+      ASSERT_EQ(4, metadata.file_count);
+      if (write_time_default) {
+        ASSERT_EQ(Temperature::kHot, metadata.levels[0].files[0].temperature);
+        ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[1].temperature);
+        // Includes obsolete/deleted files moved to cold
+        ASSERT_EQ(total_warm, 3);
+        ASSERT_EQ(total_hot, 1);
+        // Includes non-SST DB files
+        ASSERT_GT(total_unknown, 0);
+      } else {
+        ASSERT_EQ(Temperature::kUnknown,
+                  metadata.levels[0].files[0].temperature);
+        ASSERT_EQ(Temperature::kUnknown,
+                  metadata.levels[0].files[1].temperature);
+        ASSERT_EQ(total_warm, 0);
+        ASSERT_EQ(total_hot, 0);
+        // Includes non-SST DB files
+        ASSERT_GT(total_unknown, 4);
+      }
+      ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[2].temperature);
+      ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[3].temperature);
+      ASSERT_EQ(2, total_cold);
+
+      ASSERT_EQ(2, before_compaction_calls);
+      ASSERT_EQ(2, after_compaction_calls);
+
+      Destroy(options);
+    }
+  }
+}
+
+using TemperatureSet = SmallEnumSet<Temperature, Temperature::kLastTemperature>;
+static void VerifyTemperatureFileReadStats(const Statistics& st,
+                                           TemperatureSet temps) {
+  SCOPED_TRACE("Temp set size = " + std::to_string(temps.count()));
+  constexpr uint64_t min_bytes = 100;
+  constexpr uint64_t min_count = 1;
+
+  IOStatsContext* iostats = get_iostats_context();
+  if (temps.Contains(Temperature::kHot)) {
+    EXPECT_GE(st.getTickerCount(HOT_FILE_READ_BYTES), min_bytes);
+    EXPECT_GE(st.getTickerCount(HOT_FILE_READ_COUNT), min_count);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.hot_file_bytes_read,
+              min_bytes);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.hot_file_read_count,
+              min_count);
+
+  } else {
+    EXPECT_EQ(st.getTickerCount(HOT_FILE_READ_BYTES), 0);
+    EXPECT_EQ(st.getTickerCount(HOT_FILE_READ_COUNT), 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  }
+
+  if (temps.Contains(Temperature::kWarm)) {
+    EXPECT_GE(st.getTickerCount(WARM_FILE_READ_BYTES), min_bytes);
+    EXPECT_GE(st.getTickerCount(WARM_FILE_READ_COUNT), min_count);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.warm_file_bytes_read,
+              min_bytes);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.warm_file_read_count,
+              min_count);
+  } else {
+    EXPECT_EQ(st.getTickerCount(WARM_FILE_READ_BYTES), 0);
+    EXPECT_EQ(st.getTickerCount(WARM_FILE_READ_COUNT), 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+  }
+
+  if (temps.Contains(Temperature::kCool)) {
+    EXPECT_GE(st.getTickerCount(COOL_FILE_READ_BYTES), min_bytes);
+    EXPECT_GE(st.getTickerCount(COOL_FILE_READ_COUNT), min_count);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.cool_file_bytes_read,
+              min_bytes);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.cool_file_read_count,
+              min_count);
+  } else {
+    EXPECT_EQ(st.getTickerCount(COOL_FILE_READ_BYTES), 0);
+    EXPECT_EQ(st.getTickerCount(COOL_FILE_READ_COUNT), 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.cool_file_bytes_read, 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.cool_file_read_count, 0);
+  }
+
+  if (temps.Contains(Temperature::kCold)) {
+    EXPECT_GE(st.getTickerCount(COLD_FILE_READ_BYTES), min_bytes);
+    EXPECT_GE(st.getTickerCount(COLD_FILE_READ_COUNT), min_count);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
+              min_bytes);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.cold_file_read_count,
+              min_count);
+  } else {
+    EXPECT_EQ(st.getTickerCount(COLD_FILE_READ_BYTES), 0);
+    EXPECT_EQ(st.getTickerCount(COLD_FILE_READ_COUNT), 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+  }
+
+  if (temps.Contains(Temperature::kIce)) {
+    EXPECT_GE(st.getTickerCount(ICE_FILE_READ_BYTES), min_bytes);
+    EXPECT_GE(st.getTickerCount(ICE_FILE_READ_COUNT), min_count);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.ice_file_bytes_read,
+              min_bytes);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.ice_file_read_count,
+              min_count);
+  } else {
+    EXPECT_EQ(st.getTickerCount(ICE_FILE_READ_BYTES), 0);
+    EXPECT_EQ(st.getTickerCount(ICE_FILE_READ_COUNT), 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.ice_file_bytes_read, 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.ice_file_read_count, 0);
+  }
+}
+
+TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) {
+  // Test multi-tier aging: Hot -> Warm -> Cool -> Cold -> Ice
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleFIFO;
+  options.num_levels = 1;
+  options.max_open_files = -1;
+  options.level0_file_num_compaction_trigger = 2;
+  options.create_if_missing = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.no_block_cache = true;  // Simplify statistics
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  CompactionOptionsFIFO fifo_options;
+  // Multi-tier aging: files age through multiple temperatures
+  fifo_options.file_temperature_age_thresholds = {
+      {Temperature::kWarm, 500},   // Hot -> Warm after 500s
+      {Temperature::kCool, 1000},  // Warm -> Cool
+      {Temperature::kCold, 1500},  // Cool -> Cold
+      {Temperature::kIce, 2000}    // Cold -> Ice
+  };
+  fifo_options.max_table_files_size = 100000000;
+  fifo_options.allow_trivial_copy_when_change_temperature = true;
+  options.compaction_options_fifo = fifo_options;
+  options.default_write_temperature = Temperature::kHot;
+
+  Reopen(options);
+  env_->SetMockSleep();
+
+  // Track all temperature file creations
+  int total_hot = 0, total_warm = 0, total_cool = 0, total_cold = 0,
+      total_ice = 0, total_unknown = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewWritableFile::FileOptions.temperature", [&](void* arg) {
+        Temperature temperature = *(static_cast<Temperature*>(arg));
+        switch (temperature) {
+          case Temperature::kHot:
             total_hot++;
-          } else {
-            assert(temperature == Temperature::kUnknown);
+            break;
+          case Temperature::kWarm:
+            total_warm++;
+            break;
+          case Temperature::kCool:
+            total_cool++;
+            break;
+          case Temperature::kCold:
+            total_cold++;
+            break;
+          case Temperature::kIce:
+            total_ice++;
+            break;
+          case Temperature::kUnknown:
             total_unknown++;
-          }
-        });
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+            break;
+          default:
+            break;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
 
-    // The file system does not support checksum handoff. The check
-    // will be ignored.
-    ASSERT_OK(Put(Key(0), "value1"));
-    env_->MockSleepForSeconds(800);
-    ASSERT_OK(Put(Key(2), "value2"));
+  // Create initial three files (will start as Hot), enough to ensure key
+  // range filtering will be applied in FilePicker::GetNextFile() with one
+  // more file
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put(Key(0), Random::GetTLSInstance()->RandomBinaryString(100)));
     ASSERT_OK(Flush());
+  }
 
-    ASSERT_OK(Put(Key(0), "value1"));
-    ASSERT_OK(Put(Key(2), "value2"));
-    ASSERT_OK(Flush());
+  // Test reading from Hot temperature file
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
 
-    // First two L0 files both become eligible for temperature change compaction
-    // They should be compacted one-by-one.
-    ASSERT_OK(Put(Key(0), "value1"));
-    env_->MockSleepForSeconds(1200);
-    ASSERT_OK(Put(Key(2), "value2"));
-    ASSERT_OK(Flush());
-    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(100U, Get(Key(0)).size());
 
-    if (write_time_default) {
-      // Also test dynamic option change
-      ASSERT_OK(db_->SetOptions({{"default_write_temperature", "kHot"}}));
-    }
+  VerifyTemperatureFileReadStats(*options.statistics, Temperature::kHot);
 
-    ASSERT_OK(Put(Key(0), "value1"));
-    env_->MockSleepForSeconds(800);
-    ASSERT_OK(Put(Key(2), "value2"));
-    ASSERT_OK(Flush());
+  // Land well into each time interval
+  env_->MockSleepForSeconds(100);
 
-    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Age initial files to warm
+  env_->MockSleepForSeconds(500);
+  ASSERT_OK(Put(Key(1), Random::GetTLSInstance()->RandomBinaryString(101)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  // Test reading from Warm temperature file (the aged file)
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
 
-    ColumnFamilyMetaData metadata;
-    db_->GetColumnFamilyMetaData(&metadata);
-    ASSERT_EQ(4, metadata.file_count);
-    if (write_time_default) {
-      ASSERT_EQ(Temperature::kHot, metadata.levels[0].files[0].temperature);
-      ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[1].temperature);
-      // Includes obsolete/deleted files moved to cold
-      ASSERT_EQ(total_warm, 3);
-      ASSERT_EQ(total_hot, 1);
-      // Includes non-SST DB files
-      ASSERT_GT(total_unknown, 0);
-    } else {
-      ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
-      ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature);
-      ASSERT_EQ(total_warm, 0);
-      ASSERT_EQ(total_hot, 0);
-      // Includes non-SST DB files
-      ASSERT_GT(total_unknown, 4);
-    }
-    ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[2].temperature);
-    ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[3].temperature);
-    ASSERT_EQ(2, total_cold);
+  ASSERT_EQ(100U, Get(Key(0)).size());
 
-    Destroy(options);
+  // Verify Warm file statistics
+  VerifyTemperatureFileReadStats(*options.statistics, Temperature::kWarm);
+
+  // Age initial files to cool
+  env_->MockSleepForSeconds(500);
+  ASSERT_OK(Put(Key(2), Random::GetTLSInstance()->RandomBinaryString(102)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Test reading from Cool temperature file (the aged file)
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
+
+  ASSERT_EQ(100U, Get(Key(0)).size());
+
+  VerifyTemperatureFileReadStats(*options.statistics, Temperature::kCool);
+
+  // Age initial files to cold
+  env_->MockSleepForSeconds(500);
+  ASSERT_OK(Put(Key(3), Random::GetTLSInstance()->RandomBinaryString(103)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Test reading from Cold temperature file (the aged file)
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
+
+  ASSERT_EQ(100U, Get(Key(0)).size());
+
+  VerifyTemperatureFileReadStats(*options.statistics, Temperature::kCold);
+
+  // Age initial files to ice
+  env_->MockSleepForSeconds(500);
+  ASSERT_OK(Put(Key(4), Random::GetTLSInstance()->RandomBinaryString(104)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Test reading from Ice temperature file (the aged file)
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
+
+  ASSERT_EQ(100U, Get(Key(0)).size());
+
+  VerifyTemperatureFileReadStats(*options.statistics, Temperature::kIce);
+
+  // Verify temperature progression in metadata
+  ColumnFamilyMetaData metadata;
+  db_->GetColumnFamilyMetaData(&metadata);
+
+  // Should have files at different temperatures
+  std::map<Temperature, int> temp_counts;
+  for (const auto& file : metadata.levels[0].files) {
+    temp_counts[file.temperature]++;
   }
+
+  // Verify current files temperatures
+  EXPECT_EQ(temp_counts[Temperature::kHot], 1);
+  EXPECT_EQ(temp_counts[Temperature::kWarm], 1);
+  EXPECT_EQ(temp_counts[Temperature::kCool], 1);
+  EXPECT_EQ(temp_counts[Temperature::kCold], 1);
+  EXPECT_EQ(temp_counts[Temperature::kIce], 3);
+
+  // Verify historical (and current) file temperatures
+  EXPECT_EQ(total_hot, 7);
+  EXPECT_EQ(total_warm, 6);
+  EXPECT_EQ(total_cool, 5);
+  EXPECT_EQ(total_cold, 4);
+  EXPECT_EQ(total_ice, 3);
+
+  // Final comprehensive test: read from all temperature files
+  Reopen(options);
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
+
+  // Read from all files to verify cumulative statistics
+  for (int i = 0; i < 5; i++) {
+    ASSERT_EQ(static_cast<unsigned>(100 + i), Get(Key(i)).size());
+  }
+
+  VerifyTemperatureFileReadStats(*options.statistics, TemperatureSet::All());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 
 TEST_F(DBCompactionTest, DisableMultiManualCompaction) {
@@ -9918,55 +10579,60 @@ TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) {
 
   env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
 
-  Options options = CurrentOptions();
-  options.level0_file_num_compaction_trigger = kNumL0Files;
-  options.num_levels = kNumLevels;
-  DestroyAndReopen(options);
+  for (bool universal_reduce_file_locking : {false, true}) {
+    Options options = CurrentOptions();
+    options.level0_file_num_compaction_trigger = kNumL0Files;
+    options.num_levels = kNumLevels;
+    options.compaction_style = kCompactionStyleUniversal;
+    options.compaction_options_universal.reduce_file_locking =
+        universal_reduce_file_locking;
+    DestroyAndReopen(options);
 
-  // Setup last level to be non-empty since it's a bit unclear whether
-  // compaction to an empty level would be considered "bottommost".
-  ASSERT_OK(Put(Key(0), "val"));
-  ASSERT_OK(Flush());
-  MoveFilesToLevel(kNumLevels - 1);
+    // Setup last level to be non-empty since it's a bit unclear whether
+    // compaction to an empty level would be considered "bottommost".
+    ASSERT_OK(Put(Key(0), "val"));
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(kNumLevels - 1);
 
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::BGWorkBottomCompaction",
-        "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
-        "PreTriggerCompaction"},
-       {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
-        "PostTriggerCompaction",
-        "BackgroundCallCompaction:0"}});
-  SyncPoint::GetInstance()->EnableProcessing();
+    SyncPoint::GetInstance()->LoadDependency(
+        {{"DBImpl::BGWorkBottomCompaction",
+          "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+          "PreTriggerCompaction"},
+         {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+          "PostTriggerCompaction",
+          "BackgroundCallCompaction:0"}});
+    SyncPoint::GetInstance()->EnableProcessing();
 
-  port::Thread compact_range_thread([&] {
-    CompactRangeOptions cro;
-    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
-    cro.exclusive_manual_compaction = false;
-    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
-  });
+    port::Thread compact_range_thread([&] {
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      cro.exclusive_manual_compaction = false;
+      ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+    });
 
-  // Sleep in the low-pri thread so any newly scheduled compaction will be
-  // queued. Otherwise it might finish before we check its existence.
-  test::SleepingBackgroundTask sleeping_task_low;
-  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
-                 Env::Priority::LOW);
-  sleeping_task_low.WaitUntilSleeping();
+    // Sleep in the low-pri thread so any newly scheduled compaction will be
+    // queued. Otherwise it might finish before we check its existence.
+    test::SleepingBackgroundTask sleeping_task_low;
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                   &sleeping_task_low, Env::Priority::LOW);
+    sleeping_task_low.WaitUntilSleeping();
 
-  TEST_SYNC_POINT(
-      "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
-      "PreTriggerCompaction");
-  for (int i = 0; i < kNumL0Files; ++i) {
-    ASSERT_OK(Put(Key(0), "val"));
-    ASSERT_OK(Flush());
-  }
-  ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
-  TEST_SYNC_POINT(
-      "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
-      "PostTriggerCompaction");
+    TEST_SYNC_POINT(
+        "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+        "PreTriggerCompaction");
+    for (int i = 0; i < kNumL0Files; ++i) {
+      ASSERT_OK(Put(Key(0), "val"));
+      ASSERT_OK(Flush());
+    }
+    ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+    TEST_SYNC_POINT(
+        "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+        "PostTriggerCompaction");
 
-  sleeping_task_low.WakeUp();
-  sleeping_task_low.WaitUntilDone();
-  compact_range_thread.join();
+    sleeping_task_low.WakeUp();
+    sleeping_task_low.WaitUntilDone();
+    compact_range_thread.join();
+  }
 }
 
 TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) {
@@ -10472,7 +11138,7 @@ TEST_F(DBCompactionTest, NumberOfSubcompactions) {
   }
 }
 
-TEST_F(DBCompactionTest, VerifyRecordCount) {
+TEST_F(DBCompactionTest, VerifyInputRecordCount) {
   Options options = CurrentOptions();
   options.compaction_style = kCompactionStyleLevel;
   options.level0_file_num_compaction_trigger = 3;
@@ -10510,6 +11176,103 @@ TEST_F(DBCompactionTest, VerifyRecordCount) {
   ASSERT_TRUE(std::strstr(s.getState(), expect));
 }
 
+TEST_F(DBCompactionTest, VerifyOutputRecordCountBlockBasedTable) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+  options.compaction_verify_record_count = true;
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Create 2 overlapping L0 files
+  for (int i = 1; i < 20; i += 2) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), Key(10), Key(15)));
+
+  for (int i = 0; i < 20; i += 2) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+
+  // Skip adding every 7th key in the output table
+  int num_iter = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::Add::skip", [&](void* skip) {
+        num_iter++;
+        if (num_iter % 7 == 0) {
+          *(bool*)skip = true;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_TRUE(s.IsCorruption());
+  const char* expect =
+      "Number of keys in compaction output SST files does not match number of "
+      "keys added.";
+  ASSERT_TRUE(std::strstr(s.getState(), expect));
+}
+
+TEST_F(DBCompactionTest, VerifyOutputRecordCountPlainTable) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+  options.compaction_verify_record_count = true;
+
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 0;
+  plain_table_options.bloom_bits_per_key = 2;
+  plain_table_options.hash_table_ratio = 0.8;
+  plain_table_options.index_sparseness = 3;
+  plain_table_options.huge_page_tlb_size = 0;
+  plain_table_options.encoding_type = kPrefix;
+  plain_table_options.full_scan_mode = false;
+  plain_table_options.store_index_in_file = false;
+
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+  options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+
+  options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+  options.allow_mmap_reads = false;
+  options.allow_concurrent_memtable_write = false;
+  options.unordered_write = false;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Create 2 overlapping L0 files
+  for (int i = 1; i < 20; i += 2) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 0; i < 20; i += 2) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+
+  // Skip adding every 7th key in the output table
+  int num_iter = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PlainTableBuilder::Add::skip", [&](void* skip) {
+        num_iter++;
+        if (num_iter % 7 == 0) {
+          *(bool*)skip = true;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_TRUE(s.IsCorruption());
+  const char* expect =
+      "Number of keys in compaction output SST files does not match number of "
+      "keys added.";
+  ASSERT_TRUE(std::strstr(s.getState(), expect));
+}
+
 TEST_F(DBCompactionTest, ErrorWhenReadFileHead) {
   // This is to test a bug that is fixed in
   // https://github.com/facebook/rocksdb/pull/11782.
@@ -10782,6 +11545,124 @@ TEST_F(DBCompactionTest, RecordNewestKeyTimeForTtlCompaction) {
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_EQ(NumTableFilesAtLevel(0), 0);
 }
+
+// Test verifies compaction file cutting logic when using tail size estimation
+// maintains output files at or below the target file size.
+TEST_F(DBCompactionTest, CompactionRespectsTargetSizeWithTailEstimation) {
+  const int kInitialKeyCount = 10000;  // 10k keys
+  const int kValueSize = 100;          // 100 bytes per key
+  const int kSeed = 301;
+
+  Options options = CurrentOptions();
+  options.target_file_size_is_upper_bound = true;
+  options.target_file_size_base = 256 * 1024;
+  options.write_buffer_size = 2 * 1024 * 1024;
+  options.level0_file_num_compaction_trigger = 100;  // Never trigger L0->L1
+  options.compression = kNoCompression;
+
+  BlockBasedTableOptions table_options;
+  table_options.partition_filters = true;
+  table_options.metadata_block_size = 4 * 1024;
+  table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+
+  // Generate 2 L0 files
+  // Generate first file with 10k keys (each ~100 bytes) approx 1.2MB total
+  Random rnd(kSeed);
+  for (int i = 0; i < kInitialKeyCount; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+  }
+  ASSERT_OK(Flush());
+
+  // Generate second file with overlapping keys to force compaction (prevent
+  // trivial move)
+  for (int i = kInitialKeyCount / 2; i < kInitialKeyCount * 1.5; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+  }
+  ASSERT_OK(Flush());
+
+  // Capture file metadata and assert two L0 files
+  std::vector<LiveFileMetaData> file_metadata;
+  db_->GetLiveFilesMetaData(&file_metadata);
+  ASSERT_EQ(file_metadata.size(), 2);
+  for (const auto& file : file_metadata) {
+    ASSERT_EQ(file.level, 0);
+  };
+
+  // Manually compact LO files to L1
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 1;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify that compacted output files are under target file size
+  for (const auto& file : file_metadata) {
+    if (file.level > 0) {
+      EXPECT_LE(file.size, options.target_file_size_base)
+          << "Output file size exceeds target size: " << " File: " << file.name
+          << " level: " << file.level << " File size: " << file.size
+          << " Target size: " << options.target_file_size_base;
+    }
+  }
+}
+
+class PeriodicCompactionListener : public EventListener {
+ public:
+  explicit PeriodicCompactionListener() {}
+  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+    if (ci.compaction_reason == CompactionReason::kPeriodicCompaction) {
+      ++num_periodic_compactions;
+    }
+  }
+
+  std::atomic<int> num_periodic_compactions = 0;
+};
+
+TEST_F(DBCompactionTest, PeriodicTask) {
+  // Tests that when no trigger event is fired (flush/compaction/setoptions),
+  // periodic compaction is still triggered by a scheduled periodic function.
+  auto mock_clock = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+  mock_clock->SetCurrentTime(100);
+  mock_clock->InstallTimedWaitFixCallback();
+  auto mock_env = std::make_unique<CompositeEnvWrapper>(env_, mock_clock);
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+        auto periodic_task_scheduler_ptr =
+            static_cast<PeriodicTaskScheduler*>(arg);
+        periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock.get());
+      });
+
+  Options options;
+  options.env = mock_env.get();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.statistics = CreateDBStatistics();
+  int kPeriodicCompactionSeconds = 7 * 24 * 60 * 60;  // 1 week
+  options.periodic_compaction_seconds = kPeriodicCompactionSeconds;
+  options.num_levels = 50;
+  auto listener = std::make_shared<PeriodicCompactionListener>();
+  options.listeners.push_back(listener);
+  ASSERT_OK(TryReopen(options));
+
+  Random* rnd = Random::GetTLSInstance();
+  for (int k = 0; k < 10; ++k) {
+    ASSERT_OK(Put(Key(k), rnd->RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(49));
+
+  dbfull()->TEST_WaitForPeriodicTaskRun(
+      [&] { mock_clock->MockSleepForSeconds(kPeriodicCompactionSeconds + 1); });
+  ASSERT_OK(db_->WaitForCompact({}));
+
+  ASSERT_EQ(listener->num_periodic_compactions, 1);
+  Close();
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc
index 1d17e5d9bbd1..7967719888bb 100644
--- a/db/db_encryption_test.cc
+++ b/db/db_encryption_test.cc
@@ -17,9 +17,10 @@ class DBEncryptionTest : public DBTestBase {
  public:
   DBEncryptionTest()
       : DBTestBase("db_encryption_test", /*env_do_fsync=*/true) {}
-  Env* GetTargetEnv() {
+  Env* GetNonEncryptedEnv() {
     if (encrypted_env_ != nullptr) {
-      return (static_cast<EnvWrapper*>(encrypted_env_))->target();
+      return (static_cast_with_check<CompositeEnvWrapper>(encrypted_env_))
+          ->env_target();
     } else {
       return env_;
     }
@@ -38,7 +39,7 @@ TEST_F(DBEncryptionTest, CheckEncrypted) {
   auto status = env_->GetChildren(dbname_, &fileNames);
   ASSERT_OK(status);
 
-  Env* target = GetTargetEnv();
+  Env* target = GetNonEncryptedEnv();
   int hits = 0;
   for (auto it = fileNames.begin(); it != fileNames.end(); ++it) {
     if (*it == "LOCK") {
@@ -89,7 +90,7 @@ TEST_F(DBEncryptionTest, CheckEncrypted) {
 }
 
 TEST_F(DBEncryptionTest, ReadEmptyFile) {
-  auto defaultEnv = GetTargetEnv();
+  auto defaultEnv = GetNonEncryptedEnv();
 
   // create empty file for reading it back in later
   auto envOptions = EnvOptions(CurrentOptions());
@@ -116,6 +117,40 @@ TEST_F(DBEncryptionTest, ReadEmptyFile) {
   ASSERT_TRUE(data.empty());
 }
 
+TEST_F(DBEncryptionTest, NotSupportedGetFileSize) {
+  // Validate envrypted env does not support GetFileSize.
+  // The goal of the test is to validate the encrypted env/fs does not support
+  // GetFileSize API on FSRandomAccessFile interface.
+  // This test combined with the rest of the integration tests validate that
+  // the new API GetFileSize on FSRandomAccessFile interface is not required to
+  // be supported for database to work properly.
+  // The GetFileSize API is used in ReadFooterFromFile() API to get the file
+  // size. When GetFileSize API is not supported, the ReadFooterFromFile() API
+  // will use FileSystem GetFileSize API as fallback. Refer to the
+  // EncryptedRandomAccessFile class definition for more details.
+  if (!encrypted_env_) {
+    return;
+  }
+
+  auto fs = encrypted_env_->GetFileSystem();
+
+  // create empty file for reading it back in later
+  auto filePath = dbname_ + "/empty.empty";
+
+  // Create empty file
+  CreateFile(fs.get(), filePath, "", false);
+
+  // Open it for reading footer
+  std::unique_ptr<FSRandomAccessFile> randomAccessFile;
+  auto status = fs->NewRandomAccessFile(filePath, FileOptions(),
+                                        &randomAccessFile, nullptr);
+  ASSERT_OK(status);
+
+  uint64_t fileSize;
+  status = randomAccessFile->GetFileSize(&fileSize);
+  ASSERT_TRUE(status.IsNotSupported());
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_etc3_test.cc b/db/db_etc3_test.cc
new file mode 100644
index 000000000000..e5152fcd58d2
--- /dev/null
+++ b/db/db_etc3_test.cc
@@ -0,0 +1,161 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBEtc3Test : public DBTestBase {
+ public:
+  DBEtc3Test() : DBTestBase("db_etc3_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBEtc3Test, ManifestRollOver) {
+  do {
+    Options options;
+    // Force new manifest on each manifest write
+    options.max_manifest_file_size = 0;
+    options.max_manifest_space_amp_pct = 0;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    {
+      ASSERT_OK(Put(1, "key1", std::string(1000, '1')));
+      ASSERT_OK(Put(1, "key2", std::string(1000, '2')));
+      ASSERT_OK(Put(1, "key3", std::string(1000, '3')));
+      uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_OK(Flush(1));  // This should trigger LogAndApply.
+      uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_GT(manifest_after_flush, manifest_before_flush);
+      // Re-open should always re-create manifest file
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+      ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
+      ASSERT_EQ(std::string(1000, '1'), Get(1, "key1"));
+      ASSERT_EQ(std::string(1000, '2'), Get(1, "key2"));
+      ASSERT_EQ(std::string(1000, '3'), Get(1, "key3"));
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBEtc3Test, AutoTuneManifestSize) {
+  // Ensure we have auto-tuning beyond max_manifest_file_size by default
+  ASSERT_EQ(DBOptions{}.max_manifest_space_amp_pct, 500);
+
+  Options options = CurrentOptions();
+  ASSERT_OK(db_->SetOptions({{"level0_file_num_compaction_trigger", "20"}}));
+
+  // Use large column family names to essentially control the amount of payload
+  // data needed for the manifest file. Drop manifest entries don't include the
+  // CF name so are small.
+  uint64_t prev_manifest_num = 0, cur_manifest_num = 0;
+  std::deque<ColumnFamilyHandle*> handles;
+  int counter = 5;
+  auto AddCfFn = [&]() {
+    std::string name = "cf" + std::to_string(counter++);
+    name.resize(1000, 'a');
+    ASSERT_OK(db_->CreateColumnFamily(options, name, &handles.emplace_back()));
+    prev_manifest_num = cur_manifest_num;
+    cur_manifest_num = dbfull()->TEST_Current_Manifest_FileNo();
+  };
+  auto DropCfFn = [&]() {
+    ASSERT_OK(db_->DropColumnFamily(handles.front()));
+    ASSERT_OK(db_->DestroyColumnFamilyHandle(handles.front()));
+    handles.pop_front();
+    prev_manifest_num = cur_manifest_num;
+    cur_manifest_num = dbfull()->TEST_Current_Manifest_FileNo();
+  };
+  auto TrivialManifestWriteFn = [&]() {
+    ASSERT_OK(Put("x", std::to_string(counter++)));
+    ASSERT_OK(Flush());
+    prev_manifest_num = cur_manifest_num;
+    cur_manifest_num = dbfull()->TEST_Current_Manifest_FileNo();
+  };
+
+  options.max_manifest_file_size = 1000000;
+  options.max_manifest_space_amp_pct = 0;  // no auto-tuning yet
+  DestroyAndReopen(options);
+
+  // With the generous (minimum) maximum manifest size, should not be rotated
+  AddCfFn();
+  AddCfFn();
+  AddCfFn();
+  ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+
+  // Change options for small max and (still) no auto-tuning
+  ASSERT_OK(db_->SetDBOptions({{"max_manifest_file_size", "3000"}}));
+
+  // Takes effect on the next manifest write
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // Now we have to rewrite the whole manifest on each write because the
+  // compacted size exceeds the "max" size.
+  AddCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  DropCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  AddCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // Enabling auto-tuning should fix this, immediately for next manifest writes.
+  // This will allow up to double-ish the size of the compacted manifest,
+  // which last should have been 4000 + some bytes.
+  ASSERT_EQ(handles.size(), 4U);
+  ASSERT_OK(db_->SetDBOptions({{"max_manifest_space_amp_pct", "105"}}));
+
+  // After 9 CF names should be enough to rotate the manifest
+  for (int i = 1; i <= 5; ++i) {
+    if ((i % 2) == 1) {
+      DropCfFn();
+    }
+    AddCfFn();
+    ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+  }
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // We now have a different last compacted manifest size, should be
+  // able to go beyond 9 CFs named in manifest this time.
+  ASSERT_EQ(handles.size(), 6U);
+
+  DropCfFn();
+  DropCfFn();
+  for (int i = 1; i <= 4; ++i) {
+    DropCfFn();
+    AddCfFn();
+    ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+  }
+  // We've written 10 named CFs to the manifest. We should be able to
+  // dynamically change the auto-tuning still based on the last "compacted"
+  // manifest size of 7000 + some bytes.
+  ASSERT_OK(db_->SetDBOptions({{"max_manifest_space_amp_pct", "51"}}));
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  // And the "compacted" manifest size has reset again, so should be changed
+  // again sooner.
+  ASSERT_EQ(handles.size(), 4U);
+  for (int i = 1; i <= 2; ++i) {
+    AddCfFn();
+    ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+  }
+  // Enough for manifest change
+  AddCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // Wrap up
+  while (!handles.empty()) {
+    DropCfFn();
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc
index e9ae7981ae2c..7bf821170031 100644
--- a/db/db_filesnapshot.cc
+++ b/db/db_filesnapshot.cc
@@ -75,11 +75,9 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
 
   ret.emplace_back(CurrentFileName(""));
   ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));
-  // The OPTIONS file number is zero in read-write mode when OPTIONS file
-  // writing failed and the DB was configured with
-  // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
-  // number is zero when no OPTIONS file exist at all. In those cases we do not
-  // record any OPTIONS file in the live file list.
+  // In read-only mode the OPTIONS file number is zero when no OPTIONS file
+  // exist at all. In this cases we do not record any OPTIONS file in the live
+  // file list.
   if (versions_->options_file_number() != 0) {
     ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
   }
@@ -111,6 +109,7 @@ Status DBImpl::GetSortedWalFilesImpl(VectorWalPtr& files, bool need_seqnos) {
   {
     InstrumentedMutexLock l(&mutex_);
     while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
+      TEST_SYNC_POINT("DBImpl::GetSortedWalFilesImpl:WaitPurge");
       bg_cv_.Wait();
     }
 
@@ -185,14 +184,14 @@ Status DBImpl::GetSortedWalFilesImpl(VectorWalPtr& files, bool need_seqnos) {
   return s;
 }
 
-Status DBImpl::GetCurrentWalFile(std::unique_ptr<WalFile>* current_log_file) {
+Status DBImpl::GetCurrentWalFile(std::unique_ptr<WalFile>* current_wal_file) {
   uint64_t current_logfile_number;
   {
     InstrumentedMutexLock l(&mutex_);
-    current_logfile_number = logfile_number_;
+    current_logfile_number = cur_wal_number_;
   }
 
-  return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file);
+  return wal_manager_.GetLiveWalFile(current_logfile_number, current_wal_file);
 }
 
 Status DBImpl::GetLiveFilesStorageInfo(
@@ -332,7 +331,7 @@ Status DBImpl::GetLiveFilesStorageInfo(
   const uint64_t options_size = versions_->options_file_size_;
   const uint64_t min_log_num = MinLogNumberToKeep();
   // Ensure consistency with manifest for track_and_verify_wals_in_manifest
-  const uint64_t max_log_num = logfile_number_;
+  const uint64_t max_log_num = cur_wal_number_;
 
   mutex_.Unlock();
 
@@ -369,11 +368,9 @@ Status DBImpl::GetLiveFilesStorageInfo(
     }
   }
 
-  // The OPTIONS file number is zero in read-write mode when OPTIONS file
-  // writing failed and the DB was configured with
-  // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
-  // number is zero when no OPTIONS file exist at all. In those cases we do not
-  // record any OPTIONS file in the live file list.
+  // In read-only mode the OPTIONS file number is zero when no OPTIONS file
+  // exist at all. In this cases we do not record any OPTIONS file in the live
+  // file list.
   if (options_number != 0) {
     results.emplace_back();
     LiveFileStorageInfo& info = results.back();
diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc
index b72de9a6886e..e1000c576fd2 100644
--- a/db/db_flush_test.cc
+++ b/db/db_flush_test.cc
@@ -101,7 +101,7 @@ TEST_F(DBFlushTest, SyncFail) {
   TEST_SYNC_POINT("DBFlushTest::SyncFail:2");
   fault_injection_env->SetFilesystemActive(true);
   // Now the background job will do the flush; wait for it.
-  // Returns the IO error happend during flush.
+  // Returns the IO error happened during flush.
   ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
   ASSERT_EQ("", FilesPerLevel());  // flush failed.
   Destroy(options);
@@ -518,11 +518,11 @@ TEST_F(DBFlushTest, StatisticsGarbageInsertAndDeletes) {
   // Note : one set of delete for KEY1, KEY2, KEY3 is written to
   // SSTable to propagate the delete operations to K-V pairs
   // that could have been inserted into the database during past Flush
-  // opeartions.
+  // operations.
   EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
       KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
 
-  // Additional useful paylaod.
+  // Additional useful payload.
   ASSERT_OK(Delete(KEY4));
   ASSERT_OK(Delete(KEY5));
   ASSERT_OK(Delete(KEY6));
@@ -614,7 +614,7 @@ TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) {
 
   // Note : one set of deleteRange for (KEY1, KEY2) and (KEY2, KEY3) is written
   // to SSTable to propagate the deleteRange operations to K-V pairs that could
-  // have been inserted into the database during past Flush opeartions.
+  // have been inserted into the database during past Flush operations.
   EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
       (KEY1.size() + KEY2.size() + sizeof(uint64_t)) +
       (KEY2.size() + KEY3.size() + sizeof(uint64_t));
@@ -709,7 +709,7 @@ class TestFlushListener : public EventListener {
     // that assumption does not hold (see the test case MultiDBMultiListeners
     // below).
     ASSERT_TRUE(test_);
-    if (db == test_->db_) {
+    if (db == test_->db_.get()) {
       std::vector<std::vector<FileMetaData>> files_by_level;
       test_->dbfull()->TEST_GetFilesMetaData(db->DefaultColumnFamily(),
                                              &files_by_level);
@@ -842,7 +842,7 @@ TEST_F(DBFlushTest, FixFlushReasonRaceFromConcurrentFlushes) {
     ASSERT_OK(Put(1, Key(idx), std::string(1, 'v')));
   }
 
-  // To coerce a manual flush happenning in the middle of GetLiveFiles's flush,
+  // To coerce a manual flush happening in the middle of GetLiveFiles's flush,
   // we need to pause background flush thread and enable it later.
   std::shared_ptr<test::SleepingBackgroundTask> sleeping_task =
       std::make_shared<test::SleepingBackgroundTask>();
@@ -851,7 +851,7 @@ TEST_F(DBFlushTest, FixFlushReasonRaceFromConcurrentFlushes) {
                  sleeping_task.get(), Env::Priority::HIGH);
   sleeping_task->WaitUntilSleeping();
 
-  // Coerce a manual flush happenning in the middle of GetLiveFiles's flush
+  // Coerce a manual flush happening in the middle of GetLiveFiles's flush
   bool get_live_files_paused_at_sync_point = false;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
       "DBImpl::AtomicFlushMemTables:AfterScheduleFlush", [&](void* /* arg */) {
@@ -1428,7 +1428,7 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
   Close();
 }
 
-// Create a Compaction Fitler that will be invoked
+// Create a Compaction Filter that will be invoked
 // at flush time and will update the value of a KV pair
 // if the key string is "lower" than the filter_key_ string.
 class ConditionalUpdateFilter : public CompactionFilter {
@@ -2533,7 +2533,7 @@ TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) {
 
   ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0"));
 
-  ManagedSnapshot snapshot_guard(db_);
+  ManagedSnapshot snapshot_guard(db_.get());
 
   ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
   ASSERT_OK(db_->Flush(FlushOptions(), default_cf));
@@ -2574,7 +2574,7 @@ TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
   txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
   ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
   ASSERT_NE(txn_db, nullptr);
-  db_ = txn_db;
+  db_.reset(txn_db);
 
   // Create two more columns other than default CF.
   std::vector<std::string> cfs = {"puppy", "kitty"};
@@ -2638,9 +2638,8 @@ TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
   // it means atomic flush didn't write the min_log_number_to_keep to MANIFEST.
   cfs.push_back(kDefaultColumnFamilyName);
   ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
-  DBImpl* db_impl = static_cast<DBImpl*>(db_);
-  ASSERT_TRUE(db_impl->allow_2pc());
-  ASSERT_NE(db_impl->MinLogNumberToKeep(), 0);
+  ASSERT_TRUE(dbfull()->allow_2pc());
+  ASSERT_NE(dbfull()->MinLogNumberToKeep(), 0);
 }
 
 TEST_P(DBAtomicFlushTest, ManualAtomicFlush) {
@@ -3504,6 +3503,209 @@ TEST_F(DBFlushTest, DBStuckAfterAtomicFlushError) {
   ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
   ASSERT_EQ(1, NumTableFilesAtLevel(0));
 }
+
+TEST_F(DBFlushTest, VerifyOutputRecordCount) {
+  for (bool use_plain_table : {false, true}) {
+    Options options = CurrentOptions();
+    options.flush_verify_memtable_count = true;
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    DestroyAndReopen(options);
+    // Verify flush output record count verification in different table
+    // formats
+    if (use_plain_table) {
+      options.table_factory.reset(NewPlainTableFactory());
+    }
+
+    // Verify that flush output record count verification does not produce false
+    // positives.
+    ASSERT_OK(Merge("k0", "v1"));
+    ASSERT_OK(Put("k1", "v1"));
+    ASSERT_OK(Put("k2", "v1"));
+    ASSERT_OK(SingleDelete("k2"));
+    ASSERT_OK(Delete("k2"));
+    ASSERT_OK(Delete("k3"));
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), "k1", "k3"));
+    ASSERT_OK(Flush());
+
+    // Verify that flush output record count verification catch corruption
+    DestroyAndReopen(options);
+    if (use_plain_table) {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "PlainTableBuilder::Add::skip",
+          [&](void* skip) { *(bool*)skip = true; });
+
+    } else {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "BlockBasedTableBuilder::Add::skip",
+          [&](void* skip) { *(bool*)skip = true; });
+    }
+    SyncPoint::GetInstance()->EnableProcessing();
+    const char* expect =
+        "Number of keys in flush output SST files does not match";
+
+    // 1. During DB open flush
+    ASSERT_OK(Put("k1", "v1"));
+    ASSERT_OK(Put("k2", "v1"));
+    Status s = TryReopen(options);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), expect));
+
+    // 2. During regular flush
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("k1", "v1"));
+    ASSERT_OK(Put("k2", "v1"));
+    s = Flush();
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), expect));
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+class DBFlushSuperBlockTest
+    : public DBFlushTest,
+      public ::testing::WithParamInterface<std::tuple<bool, size_t, size_t>> {
+ public:
+  DBFlushSuperBlockTest() : DBFlushTest() {}
+
+  std::string formatKey(int i) {
+    int desired_length = 10;
+    char buffer[64];
+    snprintf(buffer, 64, "%0*d", desired_length, i);
+    return buffer;
+  }
+
+  void VerifyReadWithGet(int key_count) {
+    for (int i = 0; i < key_count; ++i) {
+      PinnableSlice value;
+      ASSERT_OK(Get(formatKey(i), &value));
+      ASSERT_EQ(value.ToString(), added_data[formatKey(i)]);
+    }
+  }
+
+  void VerifyReadWithIterator(int key_count) {
+    {
+      std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+      int i = 0;
+      for (it->SeekToFirst(); it->Valid(); it->Next()) {
+        ASSERT_OK(it->status());
+        ASSERT_EQ((it->key()).ToString(), formatKey(i));
+        ASSERT_EQ((it->value()).ToString(), added_data[formatKey(i)]);
+        i++;
+      }
+      ASSERT_OK(it->status());
+      ASSERT_EQ(i, key_count);
+    }
+  }
+
+ protected:
+  Random rnd{123};
+  std::unordered_map<std::string, std::string> added_data;
+};
+
+constexpr size_t kLowSpaceOverheadRatio = 256;
+
+TEST_P(DBFlushSuperBlockTest, SuperBlock) {
+  constexpr int key_count = 12345;
+  Options options;
+  options.env = env_;
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  options.paranoid_file_checks = true;
+  options.write_buffer_size = 1024 * 1024;
+  BlockBasedTableOptions block_options;
+  block_options.block_align = get<0>(GetParam());
+  block_options.index_block_restart_interval = 3;
+  block_options.super_block_alignment_size = get<1>(GetParam());
+  block_options.super_block_alignment_space_overhead_ratio = get<2>(GetParam());
+  options.table_factory.reset(NewBlockBasedTableFactory(block_options));
+  if (block_options.block_align) {
+    // When block align is enabled, disable compression
+    options.compression = kNoCompression;
+  }
+
+  ASSERT_OK(options.table_factory->ValidateOptions(
+      DBOptions(options), ColumnFamilyOptions(options)));
+
+  Reopen(options);
+
+  int super_block_pad_count = 0;
+  int super_block_pad_exceed_limit_count = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteMaybeCompressedBlock:"
+      "SuperBlockAlignment",
+      [&super_block_pad_count](void* /*arg*/) { super_block_pad_count++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteMaybeCompressedBlock:"
+      "SuperBlockAlignmentPaddingBytesExceedLimit",
+      [&super_block_pad_exceed_limit_count](void* /*arg*/) {
+        super_block_pad_exceed_limit_count++;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Add lots of keys
+  for (int i = 0; i < key_count; ++i) {
+    added_data[formatKey(i)] = std::string(rnd.RandomString(rnd.Next() % 1000));
+    ASSERT_OK(Put(formatKey(i), added_data[formatKey(i)]));
+  }
+
+  // flush the data in memory to disk to verify with super block alignment, the
+  // data could be read back properly
+  Reopen(options);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // When block_align is enabled, super block is always aligned, so there should
+  // be 0 padding for super block alignment
+  if (block_options.super_block_alignment_size != 0 &&
+      !block_options.block_align) {
+    ASSERT_GT(super_block_pad_count, 0);
+  } else {
+    ASSERT_EQ(super_block_pad_count, 0);
+  }
+
+  if (!block_options.block_align &&
+      block_options.super_block_alignment_size != 0 &&
+      block_options.super_block_alignment_space_overhead_ratio ==
+          kLowSpaceOverheadRatio) {
+    ASSERT_GT(super_block_pad_exceed_limit_count, 0);
+  }
+
+  // verify the values are correct
+  VerifyReadWithGet(key_count);
+  Reopen(options);
+  VerifyReadWithIterator(key_count);
+
+  // verify checksum
+  ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+
+  // Reopen options and flip the option of super block configuration, read still
+  // works. This verifies the forward/backward compatibility
+  if (block_options.super_block_alignment_size == 0) {
+    block_options.super_block_alignment_size = 16 * 1024;
+  } else {
+    block_options.super_block_alignment_size = 0;
+  }
+  options.table_factory.reset(NewBlockBasedTableFactory(block_options));
+
+  Reopen(options);
+
+  // verify the values are correct
+  VerifyReadWithGet(key_count);
+  Reopen(options);
+  VerifyReadWithIterator(key_count);
+
+  // verify checksum
+  ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    SuperBlockTests, DBFlushSuperBlockTest,
+    testing::Combine(testing::Bool(), testing::Values(0, 32 * 1024, 16 * 1024),
+                     // Use very low space overhead ratio to test
+                     // the case where required padded bytes is
+                     // larger than the max allowed padding size
+                     testing::Values(4, kLowSpaceOverheadRatio)));
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_follower_test.cc b/db/db_follower_test.cc
index a0f35a46b619..c032464052c2 100644
--- a/db/db_follower_test.cc
+++ b/db/db_follower_test.cc
@@ -370,10 +370,10 @@ TEST_F(DBFollowerTest, RetryCatchupManifestRollover) {
 
 // This test creates 4 L0 files and compacts them. The follower, during catchup,
 // successfully instantiates 4 Versions corresponding to the 4 files (but
-// donesn't install them yet), followed by deleting those 4 and adding a new
+// doesn't install them yet), followed by deleting those 4 and adding a new
 // file from compaction. The test verifies that the 4 L0 files are deleted
 // correctly by the follower.
-// We use teh Barrier* functions to ensure that the follower first sees the 4
+// We use the Barrier* functions to ensure that the follower first sees the 4
 // L0 files and is able to link them, and then sees the compaction that
 // obsoletes those L0 files (so those L0 files are intermediates that it has
 // to explicitly delete). Suppose we don't have any barriers, its possible
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 96613dfad050..fea401477cc5 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -74,12 +74,14 @@
 #include "options/cf_options.h"
 #include "options/options_helper.h"
 #include "options/options_parser.h"
+#include "util/udt_util.h"
 #ifdef ROCKSDB_JEMALLOC
 #include "port/jemalloc_helper.h"
 #endif
 #include "port/port.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
@@ -168,7 +170,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
                bool read_only)
     : dbname_(dbname),
       own_info_log_(options.info_log == nullptr),
-      init_logger_creation_s_(),
       initial_db_options_(SanitizeOptions(dbname, options, read_only,
                                           &init_logger_creation_s_)),
       env_(initial_db_options_.env),
@@ -184,7 +185,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
       mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS,
              immutable_db_options_.use_adaptive_mutex),
 #endif  // COERCE_CONTEXT_SWITCH
-      default_cf_handle_(nullptr),
       error_handler_(this, immutable_db_options_, &mutex_),
       event_logger_(immutable_db_options_.info_log.get()),
       max_total_in_memory_state_(0),
@@ -193,45 +193,15 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
           file_options_, immutable_db_options_)),
       seq_per_batch_(seq_per_batch),
       batch_per_txn_(batch_per_txn),
-      next_job_id_(1),
-      shutting_down_(false),
-      reject_new_background_jobs_(false),
-      db_lock_(nullptr),
-      manual_compaction_paused_(false),
       bg_cv_(&mutex_),
-      logfile_number_(0),
-      log_dir_synced_(false),
-      log_empty_(true),
-      persist_stats_cf_handle_(nullptr),
-      log_sync_cv_(&log_write_mutex_),
-      total_log_size_(0),
-      is_snapshot_supported_(true),
+      wal_sync_cv_(&wal_write_mutex_),
       write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
       write_thread_(immutable_db_options_),
       nonmem_write_thread_(immutable_db_options_),
       write_controller_(mutable_db_options_.delayed_write_rate),
-      last_batch_group_size_(0),
-      unscheduled_flushes_(0),
-      unscheduled_compactions_(0),
-      bg_bottom_compaction_scheduled_(0),
-      bg_compaction_scheduled_(0),
-      num_running_compactions_(0),
-      bg_flush_scheduled_(0),
-      num_running_flushes_(0),
-      bg_purge_scheduled_(0),
-      disable_delete_obsolete_files_(0),
-      pending_purge_obsolete_files_(0),
       delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()),
-      has_unpersisted_data_(false),
-      unable_to_release_oldest_log_(false),
-      num_running_ingest_file_(0),
       wal_manager_(immutable_db_options_, file_options_, io_tracer_,
                    seq_per_batch),
-      bg_work_paused_(0),
-      bg_compaction_paused_(0),
-      refitting_level_(false),
-      opened_successfully_(false),
-      periodic_task_scheduler_(),
       two_write_queues_(options.two_write_queues),
       manual_wal_flush_(options.manual_wal_flush),
       // last_sequencee_ is always maintained by the main queue that also writes
@@ -249,14 +219,11 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
       // requires a custom gc for compaction, we use that to set use_custom_gc_
       // as well.
       use_custom_gc_(seq_per_batch),
-      shutdown_initiated_(false),
       own_sfm_(options.sst_file_manager == nullptr),
-      closed_(false),
       atomic_flush_install_cv_(&mutex_),
       blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_,
                      &error_handler_, &event_logger_,
-                     immutable_db_options_.listeners, dbname_),
-      lock_wal_count_(0) {
+                     immutable_db_options_.listeners, dbname_) {
   // !batch_per_trx_ implies seq_per_batch_ because it is only unset for
   // WriteUnprepared, which should use seq_per_batch_.
   assert(batch_per_txn_ || seq_per_batch_);
@@ -284,15 +251,17 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
   periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog,
                                    [this]() { this->FlushInfoLog(); });
   periodic_task_functions_.emplace(
-      PeriodicTaskType::kRecordSeqnoTime, [this]() {
-        this->RecordSeqnoToTimeMapping(/*populate_historical_seconds=*/0);
-      });
+      PeriodicTaskType::kRecordSeqnoTime,
+      [this]() { this->RecordSeqnoToTimeMapping(); });
+  periodic_task_functions_.emplace(
+      PeriodicTaskType::kTriggerCompaction,
+      [this]() { this->TriggerPeriodicCompaction(); });
 
   versions_.reset(new VersionSet(
-      dbname_, &immutable_db_options_, file_options_, table_cache_.get(),
-      write_buffer_manager_, &write_controller_, &block_cache_tracer_,
-      io_tracer_, db_id_, db_session_id_, options.daily_offpeak_time_utc,
-      &error_handler_, read_only));
+      dbname_, &immutable_db_options_, mutable_db_options_, file_options_,
+      table_cache_.get(), write_buffer_manager_, &write_controller_,
+      &block_cache_tracer_, io_tracer_, db_id_, db_session_id_,
+      options.daily_offpeak_time_utc, &error_handler_, read_only));
   column_family_memtables_.reset(
       new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
 
@@ -351,6 +320,22 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
 
   WaitForBackgroundWork();
 
+  TEST_SYNC_POINT("DBImpl::ResumeImpl:Start");
+
+  // With two_write_queues=true, sequence numbers are allocated via
+  // FetchAddLastAllocatedSequence() before writes complete, but only
+  // published via SetLastSequence() after success. If we're recovering from
+  // an error, there may be allocated-but-not-published sequence numbers.
+  // We must sync last_sequence_ with last_allocated_sequence_ before creating
+  // any new memtables/WALs, otherwise the new WAL could start with a sequence
+  // number lower than what was already written, causing "sequence number
+  // going backwards" corruption on subsequent recovery.
+  if (immutable_db_options_.two_write_queues) {
+    versions_->SyncLastSequenceWithAllocated();
+  }
+
+  TEST_SYNC_POINT("DBImpl::ResumeImpl:AfterSyncSeq");
+
   Status s;
   if (shutdown_initiated_) {
     // Returning shutdown status to SFM during auto recovery will cause it
@@ -636,8 +621,8 @@ Status DBImpl::CloseHelper() {
     mutex_.Lock();
   }
   {
-    InstrumentedMutexLock lock(&log_write_mutex_);
-    for (auto l : logs_to_free_) {
+    InstrumentedMutexLock lock(&wal_write_mutex_);
+    for (auto l : wals_to_free_) {
       delete l;
     }
     for (auto& log : logs_) {
@@ -821,7 +806,8 @@ Status DBImpl::StartPeriodicTaskScheduler() {
     Status s = periodic_task_scheduler_.Register(
         PeriodicTaskType::kDumpStats,
         periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
-        mutable_db_options_.stats_dump_period_sec);
+        mutable_db_options_.stats_dump_period_sec,
+        /*run_immediately=*/true);
     if (!s.ok()) {
       return s;
     }
@@ -830,7 +816,8 @@ Status DBImpl::StartPeriodicTaskScheduler() {
     Status s = periodic_task_scheduler_.Register(
         PeriodicTaskType::kPersistStats,
         periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
-        mutable_db_options_.stats_persist_period_sec);
+        mutable_db_options_.stats_persist_period_sec,
+        /*run_immediately=*/true);
     if (!s.ok()) {
       return s;
     }
@@ -838,64 +825,55 @@ Status DBImpl::StartPeriodicTaskScheduler() {
 
   Status s = periodic_task_scheduler_.Register(
       PeriodicTaskType::kFlushInfoLog,
-      periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog));
+      periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog),
+      /*run_immediately=*/true);
+
+  if (s.ok()) {
+    s = periodic_task_scheduler_.Register(
+        PeriodicTaskType::kTriggerCompaction,
+        periodic_task_functions_.at(PeriodicTaskType::kTriggerCompaction),
+        /*run_immediately=*/false);
+  }
 
   return s;
 }
 
-Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
-                                             const WriteOptions& write_options,
-                                             bool is_new_db) {
+Status DBImpl::RegisterRecordSeqnoTimeWorker() {
   options_mutex_.AssertHeld();
 
-  uint64_t min_preserve_seconds = std::numeric_limits<uint64_t>::max();
-  uint64_t max_preserve_seconds = std::numeric_limits<uint64_t>::min();
-  std::vector<SuperVersionContext> sv_contexts;
+  // We assume InstallSuperVersionForConfigChange has already ensured suitable
+  // mappings are present for each relevant CF. We just need to be sure the DB's
+  // seqno_to_time_mapping_ and worker scheduler are appropriate for the
+  // combination of CF settings.
+
+  MinAndMaxPreserveSeconds preserve_info;
+  uint64_t seqno_time_cadence;
   {
     InstrumentedMutexLock l(&mutex_);
 
     for (auto cfd : *versions_->GetColumnFamilySet()) {
       auto& mopts = cfd->GetLatestMutableCFOptions();
-      // preserve time is the max of 2 options.
-      uint64_t preserve_seconds =
-          std::max(mopts.preserve_internal_time_seconds,
-                   mopts.preclude_last_level_data_seconds);
-      if (!cfd->IsDropped() && preserve_seconds > 0) {
-        min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds);
-        max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds);
+      if (!cfd->IsDropped()) {
+        preserve_info.Combine(mopts);
       }
     }
-    size_t old_mapping_size = seqno_to_time_mapping_.Size();
-    if (min_preserve_seconds == std::numeric_limits<uint64_t>::max()) {
-      // Don't track
+    seqno_time_cadence = preserve_info.GetRecodingCadence();
+    if (seqno_time_cadence == 0) {
+      // To return as much as possible to the feature being disabled,
+      // clear the existing mapping
       seqno_to_time_mapping_.SetCapacity(0);
       seqno_to_time_mapping_.SetMaxTimeSpan(UINT64_MAX);
+      assert(seqno_to_time_mapping_.Empty());
     } else {
       uint64_t cap = std::min(kMaxSeqnoToTimeEntries,
-                              max_preserve_seconds * kMaxSeqnoTimePairsPerCF /
-                                  min_preserve_seconds);
+                              preserve_info.max_preserve_seconds *
+                                  kMaxSeqnoTimePairsPerCF /
+                                  preserve_info.min_preserve_seconds);
       seqno_to_time_mapping_.SetCapacity(cap);
-      seqno_to_time_mapping_.SetMaxTimeSpan(max_preserve_seconds);
-    }
-    if (old_mapping_size != seqno_to_time_mapping_.Size()) {
-      InstallSeqnoToTimeMappingInSV(&sv_contexts);
+      seqno_to_time_mapping_.SetMaxTimeSpan(preserve_info.max_preserve_seconds);
     }
   }
 
-  // clean up outside db mutex
-  for (SuperVersionContext& sv_context : sv_contexts) {
-    sv_context.Clean();
-  }
-  sv_contexts.clear();
-
-  uint64_t seqno_time_cadence = 0;
-  if (min_preserve_seconds != std::numeric_limits<uint64_t>::max()) {
-    // round up to 1 when the time_duration is smaller than
-    // kMaxSeqnoTimePairsPerCF
-    seqno_time_cadence = (min_preserve_seconds + kMaxSeqnoTimePairsPerCF - 1) /
-                         kMaxSeqnoTimePairsPerCF;
-  }
-
   TEST_SYNC_POINT_CALLBACK(
       "DBImpl::RegisterRecordSeqnoTimeWorker:BeforePeriodicTaskType", nullptr);
 
@@ -903,68 +881,10 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
   if (seqno_time_cadence == 0) {
     s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kRecordSeqnoTime);
   } else {
-    // Before registering the periodic task, we need to be sure to fulfill two
-    // promises:
-    // 1) Any DB created with preserve/preclude options set from the beginning
-    // will get pre-allocated seqnos with pre-populated time mappings back to
-    // the times we are interested in. (This will enable future import of data
-    // while preserving rough write time. We can only do this reliably from
-    // DB::Open, as otherwise there could be a race between CreateColumnFamily
-    // and the first Write to the DB, and seqno-to-time mappings need to be
-    // monotonic.
-    // 2) In any DB, any data written after setting preserve/preclude options
-    // must have a reasonable time estimate (so that we can accurately place
-    // the data), which means at least one entry in seqno_to_time_mapping_.
-    //
-    // FIXME: We don't currently guarantee that if the first column family with
-    // that setting is added or configured after initial DB::Open but before
-    // the first user Write. Fixing this causes complications with the crash
-    // test because if DB starts without preserve/preclude option, does some
-    // user writes but all those writes are lost in crash, then re-opens with
-    // preserve/preclude option, it sees seqno==1 which looks like one of the
-    // user writes was recovered, when actually it was not.
-    bool last_seqno_zero = GetLatestSequenceNumber() == 0;
-    assert(!is_new_db || last_seqno_zero);
-    if (is_new_db && last_seqno_zero) {
-      // Pre-allocate seqnos and pre-populate historical mapping
-      // We can simply modify these, before writes are allowed
-      constexpr uint64_t kMax = kMaxSeqnoTimePairsPerSST;
-      versions_->SetLastAllocatedSequence(kMax);
-      versions_->SetLastPublishedSequence(kMax);
-      versions_->SetLastSequence(kMax);
-
-      // And record in manifest, to avoid going backwards in seqno on re-open
-      // (potentially with different options). Concurrency is simple because we
-      // are in DB::Open
-      {
-        InstrumentedMutexLock l(&mutex_);
-        VersionEdit edit;
-        edit.SetLastSequence(kMax);
-        s = versions_->LogAndApplyToDefaultColumnFamily(
-            read_options, write_options, &edit, &mutex_,
-            directories_.GetDbDir());
-        if (!s.ok() && versions_->io_status().IsIOError()) {
-          error_handler_.SetBGError(versions_->io_status(),
-                                    BackgroundErrorReason::kManifestWrite);
-        }
-      }
-
-      // Pre-populate mappings for reserved sequence numbers.
-      RecordSeqnoToTimeMapping(max_preserve_seconds);
-    } else {
-      if (!last_seqno_zero) {
-        // Ensure at least one mapping (or log a warning), and
-        // an updated entry whenever relevant SetOptions is called
-        RecordSeqnoToTimeMapping(/*populate_historical_seconds=*/0);
-      } else {
-        // FIXME (see limitation described above)
-      }
-    }
-
     s = periodic_task_scheduler_.Register(
         PeriodicTaskType::kRecordSeqnoTime,
         periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime),
-        seqno_time_cadence);
+        seqno_time_cadence, /*run_immediately=*/true);
   }
 
   return s;
@@ -1165,7 +1085,7 @@ void DBImpl::DumpStats() {
   {
     InstrumentedMutexLock l(&mutex_);
     for (auto cfd : versions_->GetRefedColumnFamilySet()) {
-      if (!cfd->initialized()) {
+      if (!cfd->initialized() || cfd->IsDropped()) {
         continue;
       }
 
@@ -1255,11 +1175,11 @@ Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
 
 void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
   mutex_.AssertHeld();
-  if (!job_context->logs_to_free.empty()) {
-    for (auto l : job_context->logs_to_free) {
+  if (!job_context->wals_to_free.empty()) {
+    for (auto l : job_context->wals_to_free) {
       AddToLogsToFreeQueue(l);
     }
-    job_context->logs_to_free.clear();
+    job_context->wals_to_free.clear();
   }
 }
 
@@ -1273,23 +1193,38 @@ FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const {
 }
 
 Status DBImpl::SetOptions(
-    ColumnFamilyHandle* column_family,
-    const std::unordered_map<std::string, std::string>& options_map) {
+    const std::unordered_map<ColumnFamilyHandle*,
+                             std::unordered_map<std::string, std::string>>&
+        column_families_opts_map) {
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
   const WriteOptions write_options;
 
-  auto* cfd =
-      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
-  if (options_map.empty()) {
-    ROCKS_LOG_WARN(immutable_db_options_.info_log,
-                   "SetOptions() on column family [%s], empty input",
-                   cfd->GetName().c_str());
-    return Status::InvalidArgument("empty input");
+  if (column_families_opts_map.empty()) {
+    return Status::OK();
+  }
+
+  for (const auto& cf_opts : column_families_opts_map) {
+    if (cf_opts.second.empty()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "SetOptions() on column family [%s], empty input",
+                     cf_opts.first->GetName().c_str());
+      return Status::InvalidArgument("empty input");
+    }
+  }
+
+  autovector<std::pair<ColumnFamilyData*,
+                       const std::unordered_map<std::string, std::string>*>>
+      column_family_datas;
+  for (const auto& cf_opts : column_families_opts_map) {
+    column_family_datas.push_back(
+        {static_cast_with_check<ColumnFamilyHandleImpl>(cf_opts.first)->cfd(),
+         &cf_opts.second});
   }
 
   InstrumentedMutexLock ol(&options_mutex_);
-  MutableCFOptions new_options_copy;  // For logging outside of DB mutex
+  autovector<MutableCFOptions>
+      new_options_copy;  // For logging outside of DB mutex
   Status s;
   Status persist_options_status;
   SuperVersionContext sv_context(/* create_superversion */ true);
@@ -1309,73 +1244,107 @@ Status DBImpl::SetOptions(
     //
     // (b) Append a new Version without manifest write nor DB mutex release
     //
-    // Thus aren't releasing the DB mutex again until the end of this block,
-    // after installing the new SuperVersion.
-    auto pre_cb = [&]() -> Status {
-      Status cb_s = cfd->SetOptions(db_options, options_map);
-      if (cb_s.ok()) {
-        new_options_copy = cfd->GetLatestMutableCFOptions();
-      }
-      return cb_s;
-    };
+    // Thus aren't releasing the DB mutex from LogAndApply calling pre_cb,
+    // through installing the new Version until the end of this block, after
+    // installing the new SuperVersion.
     VersionEdit dummy_edit;
     dummy_edit.MarkNoManifestWriteDummy();
     TEST_SYNC_POINT_CALLBACK("DBImpl::SetOptions:dummy_edit", &dummy_edit);
-    s = versions_->LogAndApply(
-        cfd, read_options, write_options, &dummy_edit, &mutex_,
-        directories_.GetDbDir(), false /*new_descriptor_log=*/,
-        nullptr /*new_opts*/, {} /*manifest_wcb*/, pre_cb);
-    if (!versions_->io_status().ok()) {
-      assert(!s.ok());
-      error_handler_.SetBGError(versions_->io_status(),
-                                BackgroundErrorReason::kManifestWrite);
+    for (const auto& cfd_opts : column_family_datas) {
+      auto* cfd = cfd_opts.first;
+      const auto* options_map_ptr = cfd_opts.second;
+      auto pre_cb = [&]() -> Status {
+        Status cb_s = cfd->SetOptions(db_options, *options_map_ptr);
+        if (cb_s.ok()) {
+          new_options_copy.emplace_back(cfd->GetLatestMutableCFOptions());
+        }
+        return cb_s;
+      };
+
+      s = versions_->LogAndApply(
+          cfd, read_options, write_options, &dummy_edit, &mutex_,
+          directories_.GetDbDir(), false /*new_descriptor_log=*/,
+          nullptr /*new_opts*/, {} /*manifest_wcb*/, pre_cb);
+      if (!versions_->io_status().ok()) {
+        assert(!s.ok());
+        error_handler_.SetBGError(versions_->io_status(),
+                                  BackgroundErrorReason::kManifestWrite);
+      }
+      if (!s.ok()) {
+        break;
+      }
     }
 
     if (s.ok()) {
       // Trigger possible flush/compactions. This has to be before we persist
       // options to file, otherwise there will be a deadlock with writer
       // thread.
-      InstallSuperVersionAndScheduleWork(cfd, &sv_context);
+      for (const auto& cfd_opts : column_family_datas) {
+        InstallSuperVersionForConfigChange(cfd_opts.first, &sv_context);
+      }
       persist_options_status =
           WriteOptionsFile(write_options, true /*db_mutex_already_held*/);
       bg_cv_.SignalAll();
 
-#if __cplusplus >= 202002L
-      assert(new_options_copy == cfd->GetLatestMutableCFOptions());
-      assert(cfd->GetLatestMutableCFOptions() ==
-             cfd->GetCurrentMutableCFOptions());
-      assert(cfd->GetCurrentMutableCFOptions() ==
-             cfd->current()->GetMutableCFOptions());
+#ifndef NDEBUG
+      for (size_t i = 0; i < column_family_datas.size(); ++i) {
+        auto* cfd = column_family_datas[i].first;
+        assert(new_options_copy[i] == cfd->GetLatestMutableCFOptions());
+        assert(cfd->GetLatestMutableCFOptions() ==
+               cfd->GetCurrentMutableCFOptions());
+        assert(cfd->GetCurrentMutableCFOptions() ==
+               cfd->current()->GetMutableCFOptions());
+      }
 #endif
     }
   }
   sv_context.Clean();
 
-  if (s.ok() && (options_map.count("preserve_internal_time_seconds") > 0 ||
-                 options_map.count("preclude_last_level_data_seconds") > 0)) {
-    s = RegisterRecordSeqnoTimeWorker(read_options, write_options,
-                                      false /* is_new_db*/);
+  if (s.ok()) {
+    bool needs_seqno_worker = false;
+    for (const auto& cf_opts : column_families_opts_map) {
+      if (cf_opts.second.count("preserve_internal_time_seconds") > 0 ||
+          cf_opts.second.count("preclude_last_level_data_seconds") > 0) {
+        needs_seqno_worker = true;
+        break;
+      }
+    }
+    if (needs_seqno_worker) {
+      s = RegisterRecordSeqnoTimeWorker();
+    }
   }
 
-  ROCKS_LOG_INFO(
-      immutable_db_options_.info_log,
-      "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str());
-  for (const auto& o : options_map) {
-    ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
-                   o.second.c_str());
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "SetOptions() on [%zu] column families, inputs:",
+                 column_family_datas.size());
+  for (size_t i = 0; i < column_family_datas.size(); ++i) {
+    const auto* cfd = column_family_datas[i].first;
+    const auto* options_map_ptr = column_family_datas[i].second;
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Set options on column family [%s] (%zu/%zu), inputs:",
+                   cfd->GetName().c_str(), i, column_family_datas.size());
+    for (const auto& o : *options_map_ptr) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n",
+                     o.first.c_str(), o.second.c_str());
+    }
   }
   if (s.ok()) {
-    ROCKS_LOG_INFO(immutable_db_options_.info_log,
-                   "[%s] SetOptions() succeeded", cfd->GetName().c_str());
-    new_options_copy.Dump(immutable_db_options_.info_log.get());
+    for (size_t i = 0; i < column_family_datas.size(); ++i) {
+      const auto* cfd = column_family_datas[i].first;
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Set options on column family [%s] (%zu/%zu) succeeded, "
+                     "updated CF options:",
+                     cfd->GetName().c_str(), i, column_family_datas.size());
+      new_options_copy[i].Dump(immutable_db_options_.info_log.get());
+    }
     if (!persist_options_status.ok()) {
       // NOTE: WriteOptionsFile already logs on failure
       s = persist_options_status;
     }
   } else {
     persist_options_status.PermitUncheckedError();  // less important
-    ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
-                   cfd->GetName().c_str());
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetOptions() failed: %s",
+                   s.ToString().c_str());
   }
   LogFlush(immutable_db_options_.info_log);
   return s;
@@ -1474,7 +1443,7 @@ Status DBImpl::SetDBOptions(
         s = periodic_task_scheduler_.Register(
             PeriodicTaskType::kDumpStats,
             periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
-            new_options.stats_dump_period_sec);
+            new_options.stats_dump_period_sec, /*run_immediately=*/true);
       }
       if (new_options.max_total_wal_size !=
           mutable_db_options_.max_total_wal_size) {
@@ -1489,7 +1458,7 @@ Status DBImpl::SetDBOptions(
           s = periodic_task_scheduler_.Register(
               PeriodicTaskType::kPersistStats,
               periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
-              new_options.stats_persist_period_sec);
+              new_options.stats_persist_period_sec, /*run_immediately=*/true);
         }
       }
       mutex_.Lock();
@@ -1510,7 +1479,7 @@ Status DBImpl::SetDBOptions(
       file_options_for_compaction_ = FileOptions(new_db_options);
       file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
           file_options_for_compaction_, immutable_db_options_);
-      versions_->ChangeFileOptions(mutable_db_options_);
+      versions_->UpdatedMutableDbOptions(mutable_db_options_, &mutex_);
       // TODO(xiez): clarify why apply optimize for read to write options
       file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
           file_options_for_compaction_, immutable_db_options_);
@@ -1518,7 +1487,7 @@ Status DBImpl::SetDBOptions(
         WriteThread::Writer w;
         write_thread_.EnterUnbatched(&w, &mutex_);
         if (wal_other_option_changed ||
-            total_log_size_ > GetMaxTotalWalSize()) {
+            wals_total_size_.LoadRelaxed() > GetMaxTotalWalSize()) {
           Status purge_wal_status = SwitchWAL(&write_context);
           if (!purge_wal_status.ok()) {
             ROCKS_LOG_WARN(immutable_db_options_.info_log,
@@ -1545,14 +1514,9 @@ Status DBImpl::SetDBOptions(
     ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
     new_options.Dump(immutable_db_options_.info_log.get());
     if (!persist_options_status.ok()) {
-      if (immutable_db_options_.fail_if_options_file_error) {
-        s = Status::IOError(
-            "SetDBOptions() succeeded, but unable to persist options",
-            persist_options_status.ToString());
-      }
-      ROCKS_LOG_WARN(immutable_db_options_.info_log,
-                     "Unable to persist options in SetDBOptions() -- %s",
-                     persist_options_status.ToString().c_str());
+      s = Status::IOError(
+          "SetDBOptions() succeeded, but unable to persist options",
+          persist_options_status.ToString());
     }
   } else {
     ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
@@ -1583,12 +1547,18 @@ int DBImpl::FindMinimumEmptyLevelFitting(
   return minimum_level;
 }
 
+Status DBImpl::FlushWAL(const FlushWALOptions& options) {
+  WriteOptions write_options;
+  write_options.rate_limiter_priority = options.rate_limiter_priority;
+  return FlushWAL(write_options, options.sync);
+}
+
 Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
   if (manual_wal_flush_) {
     IOStatus io_s;
     {
-      // We need to lock log_write_mutex_ since logs_ might change concurrently
-      InstrumentedMutexLock wl(&log_write_mutex_);
+      // We need to lock wal_write_mutex_ since logs_ might change concurrently
+      InstrumentedMutexLock wl(&wal_write_mutex_);
       log::Writer* cur_log_writer = logs_.back().writer;
       io_s = cur_log_writer->WriteBuffer(write_options);
     }
@@ -1615,7 +1585,7 @@ Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
 }
 
 bool DBImpl::WALBufferIsEmpty() {
-  InstrumentedMutexLock l(&log_write_mutex_);
+  InstrumentedMutexLock l(&wal_write_mutex_);
   log::Writer* cur_log_writer = logs_.back().writer;
   auto res = cur_log_writer->BufferIsEmpty();
   return res;
@@ -1623,7 +1593,7 @@ bool DBImpl::WALBufferIsEmpty() {
 
 Status DBImpl::GetOpenWalSizes(std::map<uint64_t, uint64_t>& number_to_size) {
   assert(number_to_size.empty());
-  InstrumentedMutexLock l(&log_write_mutex_);
+  InstrumentedMutexLock l(&wal_write_mutex_);
   for (auto& log : logs_) {
     auto* open_file = log.writer->file();
     if (open_file) {
@@ -1665,15 +1635,15 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
   uint64_t up_to_number;
 
   {
-    InstrumentedMutexLock l(&log_write_mutex_);
+    InstrumentedMutexLock l(&wal_write_mutex_);
     assert(!logs_.empty());
 
-    maybe_active_number = logfile_number_;
+    maybe_active_number = cur_wal_number_;
     up_to_number =
         include_current_wal ? maybe_active_number : maybe_active_number - 1;
 
     while (logs_.front().number <= up_to_number && logs_.front().IsSyncing()) {
-      log_sync_cv_.Wait();
+      wal_sync_cv_.Wait();
     }
     // First check that logs are safe to sync in background.
     if (include_current_wal &&
@@ -1697,7 +1667,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
       }
     }
 
-    need_wal_dir_sync = !log_dir_synced_;
+    need_wal_dir_sync = !wal_dir_synced_;
   }
 
   if (include_current_wal) {
@@ -1770,7 +1740,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
                              /*arg=*/nullptr);
   }
   {
-    InstrumentedMutexLock l(&log_write_mutex_);
+    InstrumentedMutexLock l(&wal_write_mutex_);
     for (auto* wal : wals_internally_closed) {
       // We can only modify the state of log::Writer under the mutex
       bool was_closed = wal->PublishIfClosed();
@@ -1887,9 +1857,9 @@ Status DBImpl::UnlockWAL() {
 
 void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
                             VersionEdit* synced_wals) {
-  log_write_mutex_.AssertHeld();
-  if (synced_dir && logfile_number_ == up_to) {
-    log_dir_synced_ = true;
+  wal_write_mutex_.AssertHeld();
+  if (synced_dir && cur_wal_number_ == up_to) {
+    wal_dir_synced_ = true;
   }
   for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
     auto& wal = *it;
@@ -1911,7 +1881,7 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
           (immutable_db_options_.background_close_inactive_wals &&
            wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize())) {
         // Fully synced
-        logs_to_free_.push_back(wal.ReleaseWriter());
+        wals_to_free_.push_back(wal.ReleaseWriter());
         it = logs_.erase(it);
       } else {
         wal.FinishSync();
@@ -1924,17 +1894,17 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
       ++it;
     }
   }
-  log_sync_cv_.SignalAll();
+  wal_sync_cv_.SignalAll();
 }
 
 void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
-  log_write_mutex_.AssertHeld();
+  wal_write_mutex_.AssertHeld();
   for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
        ++it) {
     auto& wal = *it;
     wal.FinishSync();
   }
-  log_sync_cv_.SignalAll();
+  wal_sync_cv_.SignalAll();
 }
 
 SequenceNumber DBImpl::GetLatestSequenceNumber() const {
@@ -1970,6 +1940,69 @@ Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
   return Status::OK();
 }
 
+Status DBImpl::GetNewestUserDefinedTimestamp(ColumnFamilyHandle* column_family,
+                                             std::string* newest_timestamp) {
+  if (newest_timestamp == nullptr) {
+    return Status::InvalidArgument("newest_timestamp is nullptr");
+  }
+  ColumnFamilyData* cfd = nullptr;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    assert(cfh != nullptr);
+    cfd = cfh->cfd();
+  }
+  assert(cfd != nullptr && cfd->user_comparator() != nullptr);
+  if (cfd->user_comparator()->timestamp_size() == 0) {
+    return Status::InvalidArgument(
+        "Timestamp is not enabled in this column family");
+  }
+  if (cfd->ioptions().persist_user_defined_timestamps) {
+    return Status::NotSupported(
+        "GetNewestUserDefinedTimestamp doesn't support the case when user"
+        "defined timestamps are persisted.");
+  }
+
+  Status status;
+  // Acquire SuperVersion
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    bool enter_write_thread = sv->mem == cfd->mem();
+    WriteThread::Writer w;
+    // Enter write thread to read the mutable memtable to avoid racing access
+    // with concurrent writes. No need to enter nonmem_write_thread_ since this
+    // call only care about memtable writes, not WAL writes.
+    if (enter_write_thread) {
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      WaitForPendingWrites();
+    }
+    *newest_timestamp = sv->mem->GetNewestUDT().ToString();
+    assert(!newest_timestamp->empty() || sv->mem->IsEmpty());
+    if (enter_write_thread) {
+      write_thread_.ExitUnbatched(&w);
+    }
+  }
+  // Read from immutable memtables if nothing found in mutable memtable.
+  if (newest_timestamp->empty()) {
+    *newest_timestamp = sv->imm->GetNewestUDT().ToString();
+  }
+  // Read from SST files if no result can be found in memtables.
+  if (newest_timestamp->empty() && sv->current->GetSstFilesSize() != 0) {
+    // full_history_ts_low is used to track the exclusive upperbound of
+    // flushed user defined timestamp. So we can use it to deduce the newest
+    // timestamp in the SST files that the column family has seen.
+    Slice full_history_ts_low = sv->full_history_ts_low;
+    if (!full_history_ts_low.empty()) {
+      GetU64CutoffTsFromFullHistoryTsLow(&full_history_ts_low,
+                                         newest_timestamp);
+    }
+  }
+  ReturnAndCleanupSuperVersion(cfd, sv);
+  return status;
+}
+
 InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
                                               Arena* arena,
                                               SequenceNumber sequence,
@@ -2003,10 +2036,10 @@ void DBImpl::BackgroundCallPurge() {
   TEST_SYNC_POINT("DBImpl::BackgroundCallPurge:beforeMutexLock");
   mutex_.Lock();
 
-  while (!logs_to_free_queue_.empty()) {
-    assert(!logs_to_free_queue_.empty());
-    log::Writer* log_writer = *(logs_to_free_queue_.begin());
-    logs_to_free_queue_.pop_front();
+  while (!wals_to_free_queue_.empty()) {
+    assert(!wals_to_free_queue_.empty());
+    log::Writer* log_writer = *(wals_to_free_queue_.begin());
+    wals_to_free_queue_.pop_front();
     mutex_.Unlock();
     delete log_writer;
     mutex_.Lock();
@@ -2704,7 +2737,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
     }
   };
 
-  bool last_try = false;
+  bool acquire_mutex = false;
   if (cf_list->size() == 1) {
     // Fast path for a single column family. We can simply get the thread local
     // super version
@@ -2753,29 +2786,32 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
     // sure.
     constexpr int num_retries = 3;
     for (int i = 0; i < num_retries; ++i) {
-      last_try = (i == num_retries - 1);
+      // When reading from kPersistedTier, we want a consistent view into CFs.
+      // So we take mutex to prevent any SV change in any CF.
+      acquire_mutex = ((i == num_retries - 1) && !read_options.snapshot) ||
+                      read_options.read_tier == kPersistedTier;
       bool retry = false;
 
       if (i > 0) {
         sv_cleanup_func();
       }
       if (read_options.snapshot == nullptr) {
-        if (last_try) {
-          TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::LastTry");
-          // We're close to max number of retries. For the last retry,
-          // acquire the lock so we're sure to succeed
-          mutex_.Lock();
-        }
         *snapshot = GetLastPublishedSequence();
       } else {
         *snapshot =
             static_cast_with_check<const SnapshotImpl>(read_options.snapshot)
                 ->number_;
       }
+      if (acquire_mutex) {
+        TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::LastTry");
+        // We're close to max number of retries. For the last retry,
+        // acquire the lock so we're sure to succeed
+        mutex_.Lock();
+      }
       for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
            ++cf_iter) {
         auto node = iter_deref_func(cf_iter);
-        if (!last_try) {
+        if (!acquire_mutex) {
           if (extra_sv_ref) {
             node->super_version = node->cfd->GetReferencedSuperVersion(this);
           } else {
@@ -2799,7 +2835,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
           }
         }
         TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::BeforeCheckingSnapshot");
-        if (read_options.snapshot != nullptr || last_try) {
+        if (read_options.snapshot != nullptr || acquire_mutex) {
           // If user passed a snapshot, then we don't care if a memtable is
           // sealed or compaction happens because the snapshot would ensure
           // that older key versions are kept around. If this is the last
@@ -2810,7 +2846,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
         // memtables, which will include immutable memtables as well, but that
         // might be tricky to maintain in case we decide, in future, to do
         // memtable compaction.
-        if (!last_try) {
+        if (!acquire_mutex) {
           SequenceNumber seq =
               node->super_version->mem->GetEarliestSequenceNumber();
           if (seq > *snapshot) {
@@ -2820,19 +2856,20 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
         }
       }
       if (!retry) {
-        if (last_try) {
+        if (acquire_mutex) {
           mutex_.Unlock();
           TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::AfterLastTryRefSV");
         }
         break;
       }
+      assert(!acquire_mutex);
     }
   }
 
   TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum1");
   TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum2");
   PERF_TIMER_STOP(get_snapshot_time);
-  *sv_from_thread_local = !last_try;
+  *sv_from_thread_local = !acquire_mutex;
   if (!s.ok()) {
     sv_cleanup_func();
   }
@@ -3538,7 +3575,7 @@ void DBImpl::MultiGetEntityWithCallback(
 }
 
 Status DBImpl::WrapUpCreateColumnFamilies(
-    const ReadOptions& read_options, const WriteOptions& write_options,
+    const WriteOptions& write_options,
     const std::vector<const ColumnFamilyOptions*>& cf_options) {
   options_mutex_.AssertHeld();
 
@@ -3555,8 +3592,7 @@ Status DBImpl::WrapUpCreateColumnFamilies(
   // Attempt both follow-up actions even if one fails
   Status s = WriteOptionsFile(write_options, false /*db_mutex_already_held*/);
   if (register_worker) {
-    s.UpdateIfOk(RegisterRecordSeqnoTimeWorker(read_options, write_options,
-                                               /* is_new_db */ false));
+    s.UpdateIfOk(RegisterRecordSeqnoTimeWorker());
   }
   return s;
 }
@@ -3571,8 +3607,7 @@ Status DBImpl::CreateColumnFamily(const ReadOptions& read_options,
   Status s = CreateColumnFamilyImpl(read_options, write_options, cf_options,
                                     column_family, handle);
   if (s.ok()) {
-    s.UpdateIfOk(
-        WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options}));
+    s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, {&cf_options}));
   }
   return s;
 }
@@ -3599,8 +3634,7 @@ Status DBImpl::CreateColumnFamilies(
     success_once = true;
   }
   if (success_once) {
-    s.UpdateIfOk(
-        WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options}));
+    s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, {&cf_options}));
   }
   return s;
 }
@@ -3630,8 +3664,7 @@ Status DBImpl::CreateColumnFamilies(
     cf_opts.push_back(&column_families[i].options);
   }
   if (success_once) {
-    s.UpdateIfOk(
-        WrapUpCreateColumnFamilies(read_options, write_options, cf_opts));
+    s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, cf_opts));
   }
   return s;
 }
@@ -3672,7 +3705,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options,
     edit.AddColumnFamily(column_family_name);
     uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
     edit.SetColumnFamily(new_id);
-    edit.SetLogNumber(logfile_number_);
+    edit.SetLogNumber(cur_wal_number_);
     edit.SetComparatorName(cf_options.comparator->Name());
     edit.SetPersistUserDefinedTimestamps(
         cf_options.persist_user_defined_timestamps);
@@ -3700,7 +3733,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options,
       auto* cfd =
           versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
       assert(cfd != nullptr);
-      InstallSuperVersionAndScheduleWork(cfd, &sv_context);
+      InstallSuperVersionForConfigChange(cfd, &sv_context);
 
       if (!cfd->mem()->IsSnapshotSupported()) {
         is_snapshot_supported_ = false;
@@ -3784,7 +3817,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
   Status s;
   // Save re-aquiring lock for RegisterRecordSeqnoTimeWorker when not
   // applicable
-  bool used_preserve_preclude = false;
+  MinAndMaxPreserveSeconds preserve_info;
   {
     InstrumentedMutexLock l(&mutex_);
     if (cfd->IsDropped()) {
@@ -3802,8 +3835,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
       auto& moptions = cfd->GetLatestMutableCFOptions();
       max_total_in_memory_state_ -=
           moptions.write_buffer_size * moptions.max_write_buffer_number;
-      used_preserve_preclude = moptions.preserve_internal_time_seconds > 0 ||
-                               moptions.preclude_last_level_data_seconds > 0;
+      preserve_info.Combine(moptions);
     }
 
     if (!cf_support_snapshot) {
@@ -3821,9 +3853,8 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
     bg_cv_.SignalAll();
   }
 
-  if (used_preserve_preclude) {
-    s = RegisterRecordSeqnoTimeWorker(read_options, write_options,
-                                      /* is_new_db */ false);
+  if (preserve_info.IsEnabled()) {
+    s = RegisterRecordSeqnoTimeWorker();
   }
 
   if (s.ok()) {
@@ -3873,6 +3904,14 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options,
   return s.ok() || s.IsIncomplete();
 }
 
+std::unique_ptr<MultiScan> DBImpl::NewMultiScan(
+    const ReadOptions& _read_options, ColumnFamilyHandle* column_family,
+    const MultiScanArgs& scan_opts) {
+  std::unique_ptr<MultiScan> ms_iter = std::make_unique<MultiScan>(
+      _read_options, scan_opts, this, column_family);
+  return ms_iter;
+}
+
 Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
                               ColumnFamilyHandle* column_family) {
   if (_read_options.io_activity != Env::IOActivity::kUnknown &&
@@ -3886,10 +3925,6 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
     read_options.io_activity = Env::IOActivity::kDBIterator;
   }
 
-  if (read_options.managed) {
-    return NewErrorIterator(
-        Status::NotSupported("Managed iterator is not supported anymore."));
-  }
   Iterator* result = nullptr;
   if (read_options.read_tier == kPersistedTier) {
     return NewErrorIterator(Status::NotSupported(
@@ -3929,11 +3964,14 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
 
     auto iter = new ForwardIterator(this, read_options, cfd, sv,
                                     /* allow_unprepared_value */ true);
-    result = NewDBIterator(
-        env_, read_options, cfd->ioptions(), sv->mutable_cf_options,
-        cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber,
-        sv->mutable_cf_options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */, cfh);
+    // TODO(cbi): Add support for `memtable_op_scan_flush_trigger` for tailing
+    // iterator. This requires refreshing DBIter's pointer to active_mem when
+    // tailing iterator refreshes to new memtable internally.
+    result = DBIter::NewIter(env_, read_options, cfd->ioptions(),
+                             sv->mutable_cf_options, cfd->user_comparator(),
+                             iter, sv->current, kMaxSequenceNumber,
+                             /*read_callback=*/nullptr, /*active_mem=*/nullptr,
+                             cfh, /*expose_blob_index=*/false);
   } else {
     // Note: no need to consider the special case of
     // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
@@ -4011,18 +4049,9 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(
   // Laying out the iterators in the order of being accessed makes it more
   // likely that any iterator pointer is close to the iterator it points to so
   // that they are likely to be in the same cache line and/or page.
-  ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
-      env_, read_options, cfh->cfd()->ioptions(), sv->mutable_cf_options,
-      sv->current, snapshot,
-      sv->mutable_cf_options.max_sequential_skip_in_iterations,
-      sv->version_number, read_callback, cfh, expose_blob_index, allow_refresh);
-
-  InternalIterator* internal_iter = NewInternalIterator(
-      db_iter->GetReadOptions(), cfh->cfd(), sv, db_iter->GetArena(), snapshot,
-      /* allow_unprepared_value */ true, db_iter);
-  db_iter->SetIterUnderDBIter(internal_iter);
-
-  return db_iter;
+  return NewArenaWrappedDbIterator(
+      env_, read_options, cfh, sv, snapshot, read_callback, this,
+      expose_blob_index, allow_refresh, /*allow_mark_memtable_for_flush=*/true);
 }
 
 std::unique_ptr<Iterator> DBImpl::NewCoalescingIterator(
@@ -4095,9 +4124,6 @@ Status DBImpl::NewIterators(
   if (read_options.io_activity == Env::IOActivity::kUnknown) {
     read_options.io_activity = Env::IOActivity::kDBIterator;
   }
-  if (read_options.managed) {
-    return Status::NotSupported("Managed iterator is not supported anymore.");
-  }
   if (read_options.read_tier == kPersistedTier) {
     return Status::NotSupported(
         "ReadTier::kPersistedData is not yet supported in iterators.");
@@ -4146,14 +4172,12 @@ Status DBImpl::NewIterators(
       auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd,
                                       cf_sv_pair.super_version,
                                       /* allow_unprepared_value */ true);
-      iterators->push_back(
-          NewDBIterator(env_, read_options, cf_sv_pair.cfd->ioptions(),
-                        cf_sv_pair.super_version->mutable_cf_options,
-                        cf_sv_pair.cfd->user_comparator(), iter,
-                        cf_sv_pair.super_version->current, kMaxSequenceNumber,
-                        cf_sv_pair.super_version->mutable_cf_options
-                            .max_sequential_skip_in_iterations,
-                        nullptr /*read_callback*/, cf_sv_pair.cfh));
+      iterators->push_back(DBIter::NewIter(
+          env_, read_options, cf_sv_pair.cfd->ioptions(),
+          cf_sv_pair.super_version->mutable_cf_options,
+          cf_sv_pair.cfd->user_comparator(), iter,
+          cf_sv_pair.super_version->current, kMaxSequenceNumber,
+          nullptr /*read_callback*/, /*active_mem=*/nullptr, cf_sv_pair.cfh));
     }
   } else {
     for (const auto& cf_sv_pair : cf_sv_pairs) {
@@ -4385,9 +4409,10 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
     CfdList cf_scheduled;
     if (oldest_snapshot > bottommost_files_mark_threshold_) {
       for (auto* cfd : *versions_->GetColumnFamilySet()) {
-        if (!cfd->ioptions().allow_ingest_behind) {
+        if (!cfd->AllowIngestBehind()) {
           cfd->current()->storage_info()->UpdateOldestSnapshot(
-              oldest_snapshot, /*allow_ingest_behind=*/false);
+              oldest_snapshot, /*allow_ingest_behind=*/false,
+              cfd->ioptions().user_comparator, cfd->GetFullHistoryTsLow());
           if (!cfd->current()
                    ->storage_info()
                    ->BottommostFilesMarkedForCompaction()
@@ -4405,8 +4430,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
       // inaccurate.
       SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
       for (auto* cfd : *versions_->GetColumnFamilySet()) {
-        if (CfdListContains(cf_scheduled, cfd) ||
-            cfd->ioptions().allow_ingest_behind) {
+        if (CfdListContains(cf_scheduled, cfd) || cfd->AllowIngestBehind()) {
           continue;
         }
         new_bottommost_files_mark_threshold = std::min(
@@ -4485,7 +4509,7 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
   // Add timestamp if needed
   for (size_t i = 0; i < n; i++) {
     auto [start, limit] = MaybeAddTimestampsToRange(
-        &range[i].start, &range[i].limit, ts_sz, &keys.emplace_back(),
+        range[i].start, range[i].limit, ts_sz, &keys.emplace_back(),
         &keys.emplace_back(), /*exclusive_end=*/false);
     assert(start.has_value());
     assert(limit.has_value());
@@ -4502,6 +4526,29 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
   return s;
 }
 
+Status DBImpl::GetPropertiesOfTablesByLevel(
+    ColumnFamilyHandle* column_family,
+    std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+
+  // Increment the ref count
+  mutex_.Lock();
+  auto version = cfd->current();
+  version->Ref();
+  mutex_.Unlock();
+
+  const ReadOptions read_options;
+  auto s = version->GetPropertiesOfTablesByLevel(read_options, props_by_level);
+
+  // Decrement the ref count
+  mutex_.Lock();
+  version->Unref();
+  mutex_.Unlock();
+
+  return s;
+}
+
 const std::string& DBImpl::GetName() const { return dbname_; }
 
 Env* DBImpl::GetEnv() const { return env_; }
@@ -4802,7 +4849,7 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
   // Add timestamp if needed
   std::string start_with_ts, limit_with_ts;
   auto [start, limit] = MaybeAddTimestampsToRange(
-      &range.start, &range.limit, ts_sz, &start_with_ts, &limit_with_ts);
+      range.start, range.limit, ts_sz, &start_with_ts, &limit_with_ts);
   assert(start.has_value());
   assert(limit.has_value());
   // Convert user_key into a corresponding internal key.
@@ -4840,9 +4887,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
   for (int i = 0; i < n; i++) {
     // Add timestamp if needed
     std::string start_with_ts, limit_with_ts;
-    auto [start, limit] =
-        MaybeAddTimestampsToRange(&range[i].start, &range[i].limit, ts_sz,
-                                  &start_with_ts, &limit_with_ts);
+    auto [start, limit] = MaybeAddTimestampsToRange(
+        range[i].start, range[i].limit, ts_sz, &start_with_ts, &limit_with_ts);
     assert(start.has_value());
     assert(limit.has_value());
     // Convert user_key into a corresponding internal key.
@@ -4918,7 +4964,7 @@ Status DBImpl::GetUpdatesSince(
 }
 
 Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
-                                   const RangePtr* ranges, size_t n,
+                                   const RangeOpt* ranges, size_t n,
                                    bool include_end) {
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
@@ -4930,7 +4976,7 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
   const Comparator* ucmp = cfd->user_comparator();
   assert(ucmp);
   const size_t ts_sz = ucmp->timestamp_size();
-  autovector<UserKeyRangePtr> ukey_ranges;
+  autovector<UserKeyRangeOpt> ukey_ranges;
   std::vector<std::string> keys;
   std::vector<Slice> key_slices;
   ukey_ranges.reserve(n);
@@ -4940,8 +4986,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
     auto [start, limit] = MaybeAddTimestampsToRange(
         ranges[i].start, ranges[i].limit, ts_sz, &keys.emplace_back(),
         &keys.emplace_back(), !include_end);
-    assert((ranges[i].start != nullptr) == start.has_value());
-    assert((ranges[i].limit != nullptr) == limit.has_value());
+    assert(ranges[i].start.has_value() == start.has_value());
+    assert(ranges[i].limit.has_value() == limit.has_value());
     ukey_ranges.emplace_back(start, limit);
   }
 
@@ -5002,7 +5048,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
     }
     if (!deleted_files.empty()) {
       vstorage->ComputeCompactionScore(cfd->ioptions(),
-                                       cfd->GetLatestMutableCFOptions());
+                                       cfd->GetLatestMutableCFOptions(),
+                                       cfd->GetFullHistoryTsLow());
     }
     if (edit.GetDeletedFiles().empty()) {
       job_context.Clean();
@@ -5047,7 +5094,6 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
   assert(column_family);
   auto* cfd =
       static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
-  auto* sv = GetAndRefSuperVersion(cfd);
   {
     // Without mutex, Version::GetColumnFamilyMetaData will have data race
     // with Compaction::MarkFilesBeingCompacted. One solution is to use mutex,
@@ -5059,9 +5105,21 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
     // DB::GetColumnFamilyMetaData is not called frequently, the regression
     // should not be big. We still need to keep an eye on it.
     InstrumentedMutexLock l(&mutex_);
-    sv->current->GetColumnFamilyMetaData(cf_meta);
+    cfd->current()->GetColumnFamilyMetaData(cf_meta);
+  }
+}
+
+void DBImpl::GetColumnFamilyMetaData(
+    ColumnFamilyHandle* column_family,
+    const GetColumnFamilyMetaDataOptions& options,
+    ColumnFamilyMetaData* metadata) {
+  assert(column_family);
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  {
+    InstrumentedMutexLock l(&mutex_);
+    cfd->current()->GetColumnFamilyMetaData(options, metadata);
   }
-  ReturnAndCleanupSuperVersion(cfd, sv);
 }
 
 void DBImpl::GetAllColumnFamilyMetaData(
@@ -5075,85 +5133,6 @@ void DBImpl::GetAllColumnFamilyMetaData(
   }
 }
 
-Status DBImpl::CheckConsistency() {
-  mutex_.AssertHeld();
-  std::vector<LiveFileMetaData> metadata;
-  versions_->GetLiveFilesMetaData(&metadata);
-  TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
-
-  std::string corruption_messages;
-
-  if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
-    // Instead of calling GetFileSize() for each expected file, call
-    // GetChildren() for the DB directory and check that all expected files
-    // are listed, without checking their sizes.
-    // Since sst files might be in different directories, do it for each
-    // directory separately.
-    std::map<std::string, std::vector<std::string>> files_by_directory;
-    for (const auto& md : metadata) {
-      // md.name has a leading "/". Remove it.
-      std::string fname = md.name;
-      if (!fname.empty() && fname[0] == '/') {
-        fname = fname.substr(1);
-      }
-      files_by_directory[md.db_path].push_back(fname);
-    }
-
-    IOOptions io_opts;
-    io_opts.do_not_recurse = true;
-    for (const auto& dir_files : files_by_directory) {
-      std::string directory = dir_files.first;
-      std::vector<std::string> existing_files;
-      Status s = fs_->GetChildren(directory, io_opts, &existing_files,
-                                  /*IODebugContext*=*/nullptr);
-      if (!s.ok()) {
-        corruption_messages +=
-            "Can't list files in " + directory + ": " + s.ToString() + "\n";
-        continue;
-      }
-      std::sort(existing_files.begin(), existing_files.end());
-
-      for (const std::string& fname : dir_files.second) {
-        if (!std::binary_search(existing_files.begin(), existing_files.end(),
-                                fname) &&
-            !std::binary_search(existing_files.begin(), existing_files.end(),
-                                Rocks2LevelTableFileName(fname))) {
-          corruption_messages +=
-              "Missing sst file " + fname + " in " + directory + "\n";
-        }
-      }
-    }
-  } else {
-    for (const auto& md : metadata) {
-      // md.name has a leading "/".
-      std::string file_path = md.db_path + md.name;
-
-      uint64_t fsize = 0;
-      TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
-      Status s = env_->GetFileSize(file_path, &fsize);
-      if (!s.ok() &&
-          env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
-        s = Status::OK();
-      }
-      if (!s.ok()) {
-        corruption_messages +=
-            "Can't access " + md.name + ": " + s.ToString() + "\n";
-      } else if (fsize != md.size) {
-        corruption_messages += "Sst file size mismatch: " + file_path +
-                               ". Size recorded in manifest " +
-                               std::to_string(md.size) + ", actual size " +
-                               std::to_string(fsize) + "\n";
-      }
-    }
-  }
-
-  if (corruption_messages.size() == 0) {
-    return Status::OK();
-  } else {
-    return Status::Corruption(corruption_messages);
-  }
-}
-
 Status DBImpl::GetDbIdentity(std::string& identity) const {
   identity.assign(db_id_);
   return Status::OK();
@@ -5490,12 +5469,7 @@ Status DBImpl::WriteOptionsFile(const WriteOptions& write_options,
   if (!s.ok()) {
     ROCKS_LOG_WARN(immutable_db_options_.info_log,
                    "Unnable to persist options -- %s", s.ToString().c_str());
-    if (immutable_db_options_.fail_if_options_file_error) {
-      s = Status::IOError("Unable to persist options.", s.ToString().c_str());
-    } else {
-      // Ignore error
-      s = Status::OK();
-    }
+    s = Status::IOError("Unable to persist options.", s.ToString().c_str());
   }
 
   // Restore lock if appropriate
@@ -5607,7 +5581,7 @@ Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name,
   return s;
 }
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 
 void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
   if (immutable_db_options_.enable_thread_tracking) {
@@ -5634,7 +5608,7 @@ void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
 void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
 
 void DBImpl::EraseThreadStatusDbInfo() const {}
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 
 //
 // A global method that can dump out the build version
@@ -5865,10 +5839,6 @@ Status DBImpl::IngestExternalFiles(
   for (const auto& arg : args) {
     const IngestExternalFileOptions& ingest_opts = arg.options;
     if (ingest_opts.ingest_behind) {
-      if (!immutable_db_options_.allow_ingest_behind) {
-        return Status::InvalidArgument(
-            "can't ingest_behind file in DB with allow_ingest_behind=false");
-      }
       auto ucmp = arg.column_family->GetComparator();
       assert(ucmp);
       if (ucmp->timestamp_size() > 0) {
@@ -5876,7 +5846,36 @@ Status DBImpl::IngestExternalFiles(
             "Column family with user-defined "
             "timestamps enabled doesn't support ingest behind.");
       }
+
+      if (!static_cast<ColumnFamilyHandleImpl*>(arg.column_family)
+               ->cfd()
+               ->AllowIngestBehind()) {
+        return Status::InvalidArgument(
+            "Can't ingest_behind file in ColumnFamily %s with "
+            "cf_allow_ingest_behind=false");
+      }
+    }
+    if (arg.atomic_replace_range.has_value()) {
+      if (ingest_opts.ingest_behind) {
+        return Status::InvalidArgument(
+            "Can't combine atomic_replace_range with ingest_behind.");
+      }
+      if (ingest_opts.snapshot_consistency) {
+        // TODO: support generating and ingesting a big tombstone file, which
+        // might depend on non-nullptr start and limit
+        return Status::NotSupported(
+            "atomic_replace_range not yet supported with "
+            "snapshot_consistency.");
+      } else {
+        if (arg.atomic_replace_range->start.has_value() ^
+            arg.atomic_replace_range->limit.has_value()) {
+          return Status::NotSupported(
+              "Only one of atomic_replace_range.{start,limit}.has_value() is "
+              "not supported.");
+        }
+      }
     }
+
     if (ingest_opts.allow_db_generated_files) {
       if (ingest_opts.write_global_seqno) {
         return Status::NotSupported(
@@ -5925,8 +5924,8 @@ Status DBImpl::IngestExternalFiles(
             this);
     Status es = ingestion_jobs[i].Prepare(
         args[i].external_files, args[i].files_checksums,
-        args[i].files_checksum_func_names, args[i].file_temperature,
-        start_file_number, super_version);
+        args[i].files_checksum_func_names, args[i].atomic_replace_range,
+        args[i].file_temperature, start_file_number, super_version);
     // capture first error only
     if (!es.ok() && status.ok()) {
       status = es;
@@ -5941,8 +5940,8 @@ Status DBImpl::IngestExternalFiles(
             this);
     Status es = ingestion_jobs[0].Prepare(
         args[0].external_files, args[0].files_checksums,
-        args[0].files_checksum_func_names, args[0].file_temperature,
-        next_file_number, super_version);
+        args[0].files_checksum_func_names, args[0].atomic_replace_range,
+        args[0].file_temperature, next_file_number, super_version);
     if (!es.ok()) {
       status = es;
     }
@@ -6089,18 +6088,19 @@ Status DBImpl::IngestExternalFiles(
       // mutex when persisting MANIFEST file, and the snapshots taken during
       // that period will not be stable if VersionSet last seqno is updated
       // before LogAndApply.
-      int consumed_seqno_count =
-          ingestion_jobs[0].ConsumedSequenceNumbersCount();
+      SequenceNumber max_assigned_seqno =
+          ingestion_jobs[0].MaxAssignedSequenceNumber();
       for (size_t i = 1; i != num_cfs; ++i) {
-        consumed_seqno_count =
-            std::max(consumed_seqno_count,
-                     ingestion_jobs[i].ConsumedSequenceNumbersCount());
+        max_assigned_seqno = std::max(
+            max_assigned_seqno, ingestion_jobs[i].MaxAssignedSequenceNumber());
       }
-      if (consumed_seqno_count > 0) {
+      if (max_assigned_seqno > 0) {
         const SequenceNumber last_seqno = versions_->LastSequence();
-        versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count);
-        versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count);
-        versions_->SetLastSequence(last_seqno + consumed_seqno_count);
+        if (max_assigned_seqno > last_seqno) {
+          versions_->SetLastAllocatedSequence(max_assigned_seqno);
+          versions_->SetLastPublishedSequence(max_assigned_seqno);
+          versions_->SetLastSequence(max_assigned_seqno);
+        }
       }
     }
 
@@ -6235,7 +6235,7 @@ Status DBImpl::CreateColumnFamilyWithImport(
           versions_->LogAndApply(cfd, read_options, write_options, &dummy_edit,
                                  &mutex_, directories_.GetDbDir());
       if (status.ok()) {
-        InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx);
+        InstallSuperVersionForConfigChange(cfd, &dummy_sv_ctx);
       }
     }
   }
@@ -6272,7 +6272,7 @@ Status DBImpl::CreateColumnFamilyWithImport(
                                         import_job.edit(), &mutex_,
                                         directories_.GetDbDir());
         if (status.ok()) {
-          InstallSuperVersionAndScheduleWork(cfd, &sv_context);
+          InstallSuperVersionForConfigChange(cfd, &sv_context);
         }
       }
 
@@ -6333,9 +6333,9 @@ Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family,
 
   if (status.ok()) {
     // DeleteFilesInRanges non-overlap files except L0
-    std::vector<RangePtr> ranges;
-    ranges.emplace_back(nullptr, &begin_key);
-    ranges.emplace_back(&end_key, nullptr);
+    std::vector<RangeOpt> ranges;
+    ranges.emplace_back(OptSlice{}, begin_key);
+    ranges.emplace_back(end_key, OptSlice{});
     status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size());
   }
 
@@ -6480,8 +6480,11 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
                                      fmeta->file_checksum_func_name, fname,
                                      read_options);
         } else {
+          FileOptions fopts = file_options_;
+          fopts.file_checksum = fmeta->file_checksum;
+          fopts.file_checksum_func_name = fmeta->file_checksum_func_name;
           s = ROCKSDB_NAMESPACE::VerifySstFileChecksumInternal(
-              opts, file_options_, read_options, fname, fd.largest_seqno);
+              opts, fopts, read_options, fname, fd.largest_seqno);
         }
         RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
                    IOSTATS(bytes_read) - prev_bytes_read);
@@ -6549,12 +6552,15 @@ Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected,
   }
   std::string file_checksum;
   std::string func_name;
+  FileOptions fopts;
+  fopts.file_checksum = file_checksum_expected;
+  fopts.file_checksum_func_name = func_name_expected;
   s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum(
       fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(),
       func_name_expected, &file_checksum, &func_name,
       read_options.readahead_size, immutable_db_options_.allow_mmap_reads,
       io_tracer_, immutable_db_options_.rate_limiter.get(), read_options,
-      immutable_db_options_.stats, immutable_db_options_.clock);
+      immutable_db_options_.stats, immutable_db_options_.clock, fopts);
   if (s.ok()) {
     assert(func_name_expected == func_name);
     if (file_checksum != file_checksum_expected) {
@@ -6732,7 +6738,7 @@ Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
   }
 }
 
-void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
+std::pair<SequenceNumber, uint64_t> DBImpl::GetSeqnoToTimeSample() const {
   // TECHNICALITY: Sample last sequence number *before* time, as prescribed
   // for SeqnoToTimeMapping. We don't know how long it has been since the last
   // sequence number was written, so we at least have a one-sided bound by
@@ -6741,62 +6747,191 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
   // while holding the DB mutex. (This is really to make testing happy because
   // it's fine to throw out extra close-but-not-quite-consistent mappings in
   // production.)
-  std::vector<SuperVersionContext> sv_contexts;
-  bool success = true;
-  SequenceNumber seqno;
-  uint64_t unix_time;
+  mutex_.AssertHeld();
+  SequenceNumber seqno = GetLatestSequenceNumber();
+  // HACK/TODO: seqno might be zero but we can't record a mapping for that.
+  // Start with 1, which should be close enough.
+  seqno = std::max(seqno, SequenceNumber{1});
+  int64_t unix_time_signed = 0;
+  immutable_db_options_.clock->GetCurrentTime(&unix_time_signed)
+      .PermitUncheckedError();  // Ignore error
+  return {seqno, static_cast<uint64_t>(unix_time_signed)};
+}
+
+void DBImpl::EnsureSeqnoToTimeMapping(
+    const MinAndMaxPreserveSeconds& preserve_info) {
+  mutex_.AssertHeld();
+  assert(preserve_info.IsEnabled());
+
+  // Atomically with CF creation or mutable option change (see
+  // InstallSuperVersionForConfigChange()), we need to be sure any data written
+  // after setting preserve/preclude options must have a reasonable time
+  // estimate (so that we can accurately place the data), which means at least
+  // one entry in seqno_to_time_mapping_. It's not critical that `preserve_info`
+  // take into account all CFs, as that's mostly relevant to how we add
+  // recurring entries and purge old ones.
+
+  auto [seqno, unix_time_now] = GetSeqnoToTimeSample();
+  // Ensure at least one sample that is sufficiently recent
+  uint64_t unix_time_last_sample = 0;
+  if (seqno_to_time_mapping_.Empty()) {
+    // The exact best settings will be found and applied in
+    // RegisterRecordSeqnoTimeWorker()
+    seqno_to_time_mapping_.SetCapacity(kMaxSeqnoToTimeEntries);
+  } else {
+    unix_time_last_sample =
+        seqno_to_time_mapping_.GetProximalTimeBeforeSeqno(kMaxSequenceNumber);
+  }
+  uint64_t cadence = preserve_info.GetRecodingCadence();
+  // Extend cadence so as to avoid stepping on toes of recorder job, which
+  // could lag a bit.
+  cadence += 3 + cadence / 100;
+  if (unix_time_now >= cadence &&
+      unix_time_last_sample <= unix_time_now - cadence) {
+    assert(seqno > 0);  // See GetSeqnoToTimeSample()
+    // Always successful assuming seqno never go backwards
+    seqno_to_time_mapping_.Append(seqno, unix_time_now);
+  }
+}
+
+void DBImpl::PrepopulateSeqnoToTimeMapping(
+    const MinAndMaxPreserveSeconds& preserve_info) {
+  // Only for opening a new DB, with preserve/preclude options set
+  if (!preserve_info.IsEnabled()) {
+    assert(false);
+    return;
+  }
+  if (GetLatestSequenceNumber() != 0) {
+    assert(false);
+    return;
+  }
+
+  // Here we fulfill the following promise:
+  //
+  // Any DB/CF created with preserve/preclude options set from the beginning
+  // will get pre-allocated seqnos with pre-populated time mappings back to
+  // the times we are interested in. (This will enable future import of data
+  // while preserving rough write time. We can only do this reliably from
+  // DB::Open, as otherwise there could be a race between CreateColumnFamily
+  // and the first Write to the DB, and seqno-to-time mappings need to be
+  // monotonic.
+  //
+  // FIXME: We don't currently guarantee that if the first column family with
+  // that setting is added or configured after initial DB::Open but before
+  // the first user Write. Fixing this causes complications with the crash
+  // test because if DB starts without preserve/preclude option, does some
+  // user writes but all those writes are lost in crash, then re-opens with
+  // preserve/preclude option, it sees seqno==1 which looks like one of the
+  // user writes was recovered, when actually it was not.
+
+  // Pre-allocate seqnos and pre-populate historical mapping
+  // We can simply modify these, before writes are allowed
+  constexpr uint64_t kMax = kMaxSeqnoTimePairsPerSST;
+  versions_->SetLastAllocatedSequence(kMax);
+  versions_->SetLastPublishedSequence(kMax);
+  versions_->SetLastSequence(kMax);
+
+  // And record in manifest, to avoid going backwards in seqno on re-open
+  // (potentially with different options). Concurrency is simple because we
+  // are in DB::Open
+  const WriteOptions write_options(Env::IOActivity::kDBOpen);
+  const ReadOptions read_options(Env::IOActivity::kDBOpen);
+  VersionEdit edit;
+  edit.SetLastSequence(kMax);
+  Status s = versions_->LogAndApplyToDefaultColumnFamily(
+      read_options, write_options, &edit, &mutex_, directories_.GetDbDir());
+  if (!s.ok() && versions_->io_status().IsIOError()) {
+    error_handler_.SetBGError(versions_->io_status(),
+                              BackgroundErrorReason::kManifestWrite);
+  }
+
+  auto [seqno, unix_time_now] = GetSeqnoToTimeSample();
+  uint64_t populate_historical_seconds = preserve_info.max_preserve_seconds;
+  if (seqno > 1 && unix_time_now > populate_historical_seconds) {
+    // seqno=0 is reserved
+    SequenceNumber from_seqno = 1;
+    seqno_to_time_mapping_.PrePopulate(
+        from_seqno, seqno, unix_time_now - populate_historical_seconds,
+        unix_time_now);
+  } else {
+    // One of these will fail
+    assert(seqno > 1);
+    assert(unix_time_now > populate_historical_seconds);
+  }
+}
+
+void DBImpl::InstallSuperVersionForConfigChange(
+    ColumnFamilyData* cfd, SuperVersionContext* sv_context) {
+  MinAndMaxPreserveSeconds preserve_info{cfd->GetLatestCFOptions()};
+  std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping;
+  if (preserve_info.IsEnabled()) {
+    // TODO: detect & optimize if mapping hasn't changed from previous
+    // SuperVersion
+    EnsureSeqnoToTimeMapping(preserve_info);
+    new_seqno_to_time_mapping = std::make_shared<SeqnoToTimeMapping>();
+    new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
+  }
+  InstallSuperVersionAndScheduleWork(cfd, sv_context,
+                                     std::move(new_seqno_to_time_mapping));
+}
+
+void DBImpl::RecordSeqnoToTimeMapping() {
+  SuperVersionContext sv_context;
   {
     InstrumentedMutexLock l(&mutex_);
-
-    seqno = GetLatestSequenceNumber();
-    int64_t unix_time_signed = 0;
-    immutable_db_options_.clock->GetCurrentTime(&unix_time_signed)
-        .PermitUncheckedError();  // Ignore error
-    unix_time = static_cast<uint64_t>(unix_time_signed);
-
-    if (populate_historical_seconds > 0) {
-      if (seqno > 1 && unix_time > populate_historical_seconds) {
-        // seqno=0 is reserved
-        SequenceNumber from_seqno = 1;
-        success = seqno_to_time_mapping_.PrePopulate(
-            from_seqno, seqno, unix_time - populate_historical_seconds,
-            unix_time);
-        InstallSeqnoToTimeMappingInSV(&sv_contexts);
-      } else {
-        // One of these will fail
-        assert(seqno > 1);
-        assert(unix_time > populate_historical_seconds);
-        success = false;
+    // Record next sample
+    seqno_to_time_mapping_.Append(GetSeqnoToTimeSample());
+    // Create an immutable snapshot for sharing across CFs
+    std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping =
+        std::make_shared<SeqnoToTimeMapping>();
+    new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
+
+    // Update in SV of all applicable CFs
+    for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      MinAndMaxPreserveSeconds preserve_info{cfd->GetLatestCFOptions()};
+      if (preserve_info.IsEnabled()) {
+        sv_context.NewSuperVersion();
+        cfd->InstallSuperVersion(&sv_context, &mutex_,
+                                 new_seqno_to_time_mapping);
       }
-    } else {
-      // FIXME: assert(seqno > 0);
-      // Always successful assuming seqno never go backwards
-      seqno_to_time_mapping_.Append(seqno, unix_time);
-      InstallSeqnoToTimeMappingInSV(&sv_contexts);
     }
+    bg_cv_.SignalAll();
   }
 
   // clean up & report outside db mutex
-  for (SuperVersionContext& sv_context : sv_contexts) {
-    sv_context.Clean();
-  }
+  sv_context.Clean();
+}
 
-  if (populate_historical_seconds > 0) {
-    if (success) {
-      ROCKS_LOG_INFO(
-          immutable_db_options_.info_log,
-          "Pre-populated sequence number to time entries: [1,%" PRIu64
-          "] -> [%" PRIu64 ",%" PRIu64 "]",
-          seqno, unix_time - populate_historical_seconds, unix_time);
-    } else {
-      ROCKS_LOG_WARN(
-          immutable_db_options_.info_log,
-          "Failed to pre-populate sequence number to time entries: [1,%" PRIu64
-          "] -> [%" PRIu64 ",%" PRIu64 "]",
-          seqno, unix_time - populate_historical_seconds, unix_time);
+void DBImpl::TriggerPeriodicCompaction() {
+  TEST_SYNC_POINT("DBImpl::TriggerPeriodicCompaction:StartRunning");
+  {
+    InstrumentedMutexLock l(&mutex_);
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Running the periodic task to trigger compactions.");
+
+    for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (cfd->GetLatestCFOptions().periodic_compaction_seconds &&
+          !cfd->queued_for_compaction()) {
+        cfd->current()->storage_info()->ComputeCompactionScore(
+            cfd->ioptions(), cfd->GetLatestMutableCFOptions(),
+            cfd->GetFullHistoryTsLow());
+        EnqueuePendingCompaction(cfd);
+        if (cfd->queued_for_compaction()) {
+          ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                         "Periodic task to trigger compaction queued Column "
+                         "family [%s] for compaction.",
+                         cfd->GetName().c_str());
+        }
+      }
     }
-  } else {
-    assert(success);
+    MaybeScheduleFlushOrCompaction();
+    bg_cv_.SignalAll();
   }
 }
 
@@ -6856,22 +6991,4 @@ void DBImpl::TrackOrUntrackFiles(
   }
 }
 
-void DBImpl::InstallSeqnoToTimeMappingInSV(
-    std::vector<SuperVersionContext>* sv_contexts) {
-  mutex_.AssertHeld();
-  std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping =
-      std::make_shared<SeqnoToTimeMapping>();
-  new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
-  for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
-    if (cfd->IsDropped()) {
-      continue;
-    }
-    sv_contexts->emplace_back(/*create_superversion=*/true);
-    sv_contexts->back().new_seqno_to_time_mapping = new_seqno_to_time_mapping;
-    cfd->InstallSuperVersion(&sv_contexts->back(),
-                             cfd->GetLatestMutableCFOptions());
-  }
-  bg_cv_.SignalAll();
-}
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 9c3f4dbd7cd9..c3c432bec8d8 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -173,10 +173,10 @@ struct DBOpenLogRecordReadReporter : public log::Reader::Reporter {
 
   void OldLogRecord(size_t bytes) override;
 
-  uint64_t GetCorruptedLogNumber() const { return corrupted_log_number_; }
+  uint64_t GetCorruptedLogNumber() const { return corrupted_wal_number_; }
 
  private:
-  uint64_t corrupted_log_number_ = kMaxSequenceNumber;
+  uint64_t corrupted_wal_number_ = kMaxSequenceNumber;
 };
 
 // While DB is the public interface of RocksDB, and DBImpl is the actual
@@ -256,6 +256,10 @@ class DBImpl : public DB {
   Status WriteWithCallback(const WriteOptions& options, WriteBatch* updates,
                            UserWriteCallback* user_write_cb) override;
 
+  Status IngestWriteBatchWithIndex(
+      const WriteOptions& options,
+      std::shared_ptr<WriteBatchWithIndex> wbwi) override;
+
   using DB::Get;
   Status Get(const ReadOptions& _read_options,
              ColumnFamilyHandle* column_family, const Slice& key,
@@ -379,6 +383,11 @@ class DBImpl : public DB {
                       const std::vector<ColumnFamilyHandle*>& column_families,
                       std::vector<Iterator*>* iterators) override;
 
+  using DB::NewMultiScan;
+  std::unique_ptr<MultiScan> NewMultiScan(
+      const ReadOptions& _read_options, ColumnFamilyHandle* column_family,
+      const MultiScanArgs& scan_opts) override;
+
   const Snapshot* GetSnapshot() override;
   void ReleaseSnapshot(const Snapshot* snapshot) override;
 
@@ -446,19 +455,20 @@ class DBImpl : public DB {
 
   void EnableManualCompaction() override;
   void DisableManualCompaction() override;
+  void AbortAllCompactions() override;
+  void ResumeAllCompactions() override;
 
   using DB::SetOptions;
   Status SetOptions(
-      ColumnFamilyHandle* column_family,
-      const std::unordered_map<std::string, std::string>& options_map) override;
+      const std::unordered_map<ColumnFamilyHandle*,
+                               std::unordered_map<std::string, std::string>>&
+          column_families_opts_map) override;
 
   Status SetDBOptions(
       const std::unordered_map<std::string, std::string>& options_map) override;
 
   using DB::NumberLevels;
   int NumberLevels(ColumnFamilyHandle* column_family) override;
-  using DB::MaxMemCompactionLevel;
-  int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
   using DB::Level0StopWriteTrigger;
   int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) override;
   const std::string& GetName() const override;
@@ -475,10 +485,13 @@ class DBImpl : public DB {
       const FlushOptions& options,
       const std::vector<ColumnFamilyHandle*>& column_families) override;
   Status FlushWAL(bool sync) override {
-    // TODO: plumb Env::IOActivity, Env::IOPriority
-    return FlushWAL(WriteOptions(), sync);
+    FlushWALOptions options;
+    options.sync = sync;
+    return FlushWAL(options);
   }
 
+  Status FlushWAL(const FlushWALOptions& options) override;
+
   virtual Status FlushWAL(const WriteOptions& write_options, bool sync);
   bool WALBufferIsEmpty();
   Status SyncWAL() override;
@@ -497,6 +510,9 @@ class DBImpl : public DB {
   Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
                              std::string* ts_low) override;
 
+  Status GetNewestUserDefinedTimestamp(ColumnFamilyHandle* column_family,
+                                       std::string* newest_timestamp) override;
+
   Status GetDbIdentity(std::string& identity) const override;
 
   virtual Status GetDbIdentityFromIdentityFile(const IOOptions& opts,
@@ -530,11 +546,11 @@ class DBImpl : public DB {
 
   // Get the known flushed sizes of WALs that might still be written to
   // or have pending sync.
-  // NOTE: unlike alive_log_files_, this function includes WALs that might
+  // NOTE: unlike alive_wal_files_, this function includes WALs that might
   // be obsolete (but not obsolete to a pending Checkpoint) and not yet fully
   // synced.
   Status GetOpenWalSizes(std::map<uint64_t, uint64_t>& number_to_size);
-  Status GetCurrentWalFile(std::unique_ptr<WalFile>* current_log_file) override;
+  Status GetCurrentWalFile(std::unique_ptr<WalFile>* current_wal_file) override;
   Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override;
 
   Status GetUpdatesSince(
@@ -542,7 +558,7 @@ class DBImpl : public DB {
       const TransactionLogIterator::ReadOptions& read_options =
           TransactionLogIterator::ReadOptions()) override;
   Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
-                             const RangePtr* ranges, size_t n,
+                             const RangeOpt* ranges, size_t n,
                              bool include_end = true);
 
   void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) override;
@@ -558,6 +574,11 @@ class DBImpl : public DB {
   void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
                                ColumnFamilyMetaData* metadata) override;
 
+  // Get column family metadata with filtering based on key range and level
+  void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+                               const GetColumnFamilyMetaDataOptions& options,
+                               ColumnFamilyMetaData* metadata) override;
+
   void GetAllColumnFamilyMetaData(
       std::vector<ColumnFamilyMetaData>* metadata) override;
 
@@ -651,6 +672,11 @@ class DBImpl : public DB {
       ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
       TablePropertiesCollection* props) override;
 
+  Status GetPropertiesOfTablesByLevel(
+      ColumnFamilyHandle* column_family,
+      std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level)
+      override;
+
   // ---- End of implementations of the DB interface ----
   SystemClock* GetSystemClock() const;
 
@@ -787,10 +813,6 @@ class DBImpl : public DB {
   // being detected.
   const Snapshot* GetSnapshotForWriteConflictBoundary();
 
-  // checks if all live files exist on file system and that their file sizes
-  // match to our in-memory records
-  virtual Status CheckConsistency();
-
   // max_file_num_to_ignore allows bottom level compaction to filter out newly
   // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
   // disable the filtering
@@ -1063,7 +1085,7 @@ class DBImpl : public DB {
 
   void AddToLogsToFreeQueue(log::Writer* log_writer) {
     mutex_.AssertHeld();
-    logs_to_free_queue_.push_back(log_writer);
+    wals_to_free_queue_.push_back(log_writer);
   }
 
   void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
@@ -1073,10 +1095,7 @@ class DBImpl : public DB {
   void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
 
   // Fill JobContext with snapshot information needed by flush and compaction.
-  void GetSnapshotContext(JobContext* job_context,
-                          std::vector<SequenceNumber>* snapshot_seqs,
-                          SequenceNumber* earliest_write_conflict_snapshot,
-                          SnapshotChecker** snapshot_checker);
+  void InitSnapshotContext(JobContext* job_context);
 
   // Not thread-safe.
   void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
@@ -1128,7 +1147,7 @@ class DBImpl : public DB {
   bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; }
 
   bool TEST_IsLogGettingFlushed() {
-    return alive_log_files_.begin()->getting_flushed;
+    return alive_wal_files_.begin()->getting_flushed;
   }
 
   Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
@@ -1208,7 +1227,9 @@ class DBImpl : public DB {
 
   uint64_t TEST_LogfileNumber();
 
-  uint64_t TEST_total_log_size() const { return total_log_size_; }
+  uint64_t TEST_wals_total_size() const {
+    return wals_total_size_.LoadRelaxed();
+  }
 
   void TEST_GetAllBlockCaches(std::unordered_set<const Cache*>* cache_set);
 
@@ -1267,27 +1288,24 @@ class DBImpl : public DB {
   // flush LOG out of application buffer
   void FlushInfoLog();
 
-  // record current sequence number to time mapping. If
-  // populate_historical_seconds > 0 then pre-populate all the
-  // sequence numbers from [1, last] to map to [now minus
-  // populate_historical_seconds, now].
-  void RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds);
-
-  // Everytime DB's seqno to time mapping changed (which already hold the db
-  // mutex), we install a new SuperVersion in each column family with a shared
-  // copy of the new mapping while holding the db mutex.
-  // This is done for all column families even though the column family does not
-  // explicitly enabled the
-  // `preclude_last_level_data_seconds` or `preserve_internal_time_seconds`
-  // features.
-  // This mapping supports iterators to fulfill the
-  // "rocksdb.iterator.write-time" iterator property for entries in memtables.
-  //
-  // Since this new SuperVersion doesn't involve an LSM tree shape change, we
-  // don't schedule work after installing this SuperVersion. It returns the used
-  // `SuperVersionContext` for clean up after release mutex.
-  void InstallSeqnoToTimeMappingInSV(
-      std::vector<SuperVersionContext>* sv_contexts);
+  // For the background timer job
+  void RecordSeqnoToTimeMapping();
+
+  // Compactions rely on an event triggers like flush/compaction/SetOptions.
+  // We need to trigger periodic compactions even when there is no such trigger.
+  // This function checks and schedules available compactions and will run
+  // periodically.
+  void TriggerPeriodicCompaction();
+
+  // REQUIRES: DB mutex held
+  std::pair<SequenceNumber, uint64_t> GetSeqnoToTimeSample() const;
+
+  // REQUIRES: DB mutex held or during open
+  void EnsureSeqnoToTimeMapping(const MinAndMaxPreserveSeconds& preserve_secs);
+
+  // Only called during open
+  void PrepopulateSeqnoToTimeMapping(
+      const MinAndMaxPreserveSeconds& preserve_secs);
 
   // Interface to block and signal the DB in case of stalling writes by
   // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface.
@@ -1375,16 +1393,19 @@ class DBImpl : public DB {
 
   // State below is protected by mutex_
   // With two_write_queues enabled, some of the variables that accessed during
-  // WriteToWAL need different synchronization: log_empty_, alive_log_files_,
-  // logs_, logfile_number_. Refer to the definition of each variable below for
+  // WriteToWAL need different synchronization: wal_empty_, alive_wal_files_,
+  // logs_, cur_wal_number_. Refer to the definition of each variable below for
   // more description.
   //
+  // Protects access to most ColumnFamilyData methods, see more in comment for
+  // each method.
+  //
   // `mutex_` can be a hot lock in some workloads, so it deserves dedicated
   // cachelines.
   mutable CacheAlignedInstrumentedMutex mutex_;
 
-  ColumnFamilyHandleImpl* default_cf_handle_;
-  InternalStats* default_cf_internal_stats_;
+  ColumnFamilyHandleImpl* default_cf_handle_ = nullptr;
+  InternalStats* default_cf_internal_stats_ = nullptr;
 
   // table_cache_ provides its own synchronization
   std::shared_ptr<Cache> table_cache_;
@@ -1396,7 +1417,7 @@ class DBImpl : public DB {
 
   // only used for dynamically adjusting max_total_wal_size. it is a sum of
   // [write_buffer_size * max_write_buffer_number] over all column families
-  std::atomic<uint64_t> max_total_in_memory_state_;
+  std::atomic<uint64_t> max_total_in_memory_state_ = 0;
 
   // The options to access storage files
   const FileOptions file_options_;
@@ -1423,14 +1444,14 @@ class DBImpl : public DB {
 
   // Each flush or compaction gets its own job id. this counter makes sure
   // they're unique
-  std::atomic<int> next_job_id_;
+  std::atomic<int> next_job_id_ = 1;
 
-  std::atomic<bool> shutting_down_;
+  std::atomic<bool> shutting_down_ = false;
 
   // No new background jobs can be queued if true. This is used to prevent new
   // background jobs from being queued after WaitForCompact() completes waiting
   // all background jobs then attempts to close when close_db_ option is true.
-  bool reject_new_background_jobs_;
+  bool reject_new_background_jobs_ = false;
 
   // RecoveryContext struct stores the context about version edits along
   // with corresponding column_family_data and column_family_options.
@@ -1528,11 +1549,11 @@ class DBImpl : public DB {
   // ingests `wbwi` is done.
   // @param memtable_updated Whether the same write that ingests wbwi has
   // updated memtable. This is useful for determining whether to set bg
-  // error when IngestWBWI fails.
-  Status IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
-                    const WBWIMemTable::SeqnoRange& assigned_seqno,
-                    uint64_t min_prep_log, SequenceNumber last_seqno,
-                    bool memtable_updated, bool ignore_missing_cf);
+  // error when IngestWBWIAsMemtable fails.
+  Status IngestWBWIAsMemtable(std::shared_ptr<WriteBatchWithIndex> wbwi,
+                              const WBWIMemTable::SeqnoRange& assigned_seqno,
+                              uint64_t min_prep_log, SequenceNumber last_seqno,
+                              bool memtable_updated, bool ignore_missing_cf);
 
   // If disable_memtable is set the application logic must guarantee that the
   // batch will still be skipped from memtable during the recovery. An excption
@@ -1562,18 +1583,17 @@ class DBImpl : public DB {
   Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
                    WriteCallback* callback = nullptr,
                    UserWriteCallback* user_write_cb = nullptr,
-                   uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+                   uint64_t* wal_used = nullptr, uint64_t log_ref = 0,
                    bool disable_memtable = false, uint64_t* seq_used = nullptr,
                    size_t batch_cnt = 0,
                    PreReleaseCallback* pre_release_callback = nullptr,
                    PostMemTableCallback* post_memtable_callback = nullptr,
-                   std::shared_ptr<WriteBatchWithIndex> wbwi = nullptr,
-                   uint64_t min_prep_log = 0);
+                   std::shared_ptr<WriteBatchWithIndex> wbwi = nullptr);
 
   Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
                             WriteCallback* callback = nullptr,
                             UserWriteCallback* user_write_cb = nullptr,
-                            uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+                            uint64_t* wal_used = nullptr, uint64_t log_ref = 0,
                             bool disable_memtable = false,
                             uint64_t* seq_used = nullptr);
 
@@ -1600,7 +1620,7 @@ class DBImpl : public DB {
   Status WriteImplWALOnly(
       WriteThread* write_thread, const WriteOptions& options,
       WriteBatch* updates, WriteCallback* callback,
-      UserWriteCallback* user_write_cb, uint64_t* log_used,
+      UserWriteCallback* user_write_cb, uint64_t* wal_used,
       const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
       PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
       const PublishLastSeq publish_last_seq, const bool disable_memtable);
@@ -1761,9 +1781,9 @@ class DBImpl : public DB {
     }
   };
 
-  struct LogFileNumberSize {
-    explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
-    LogFileNumberSize() {}
+  struct WalFileNumberSize {
+    explicit WalFileNumberSize(uint64_t _number) : number(_number) {}
+    WalFileNumberSize() {}
     void AddSize(uint64_t new_size) { size += new_size; }
     uint64_t number;
     uint64_t size = 0;
@@ -1785,6 +1805,13 @@ class DBImpl : public DB {
       if (writer->file()) {
         // TODO: plumb Env::IOActivity, Env::IOPriority
         s = writer->WriteBuffer(WriteOptions());
+        if (attempt_truncate_size < SIZE_MAX &&
+            attempt_truncate_size < writer->file()->GetFileSize()) {
+          Status s2 = writer->file()->writable_file()->Truncate(
+              attempt_truncate_size, IOOptions{}, nullptr);
+          // This is just a best effort attempt
+          s2.PermitUncheckedError();
+        }
       }
       delete writer;
       writer = nullptr;
@@ -1817,6 +1844,11 @@ class DBImpl : public DB {
       getting_synced = false;
     }
 
+    void SetAttemptTruncateSize(uint64_t size) {
+      assert(attempt_truncate_size == SIZE_MAX);
+      attempt_truncate_size = size;
+    }
+
     uint64_t number;
     // Visual Studio doesn't support deque's member to be noncopyable because
     // of a std::unique_ptr as a member.
@@ -1829,15 +1861,20 @@ class DBImpl : public DB {
     // to be persisted even if appends happen during sync so it can be used for
     // tracking the synced size in MANIFEST.
     uint64_t pre_sync_size = 0;
+    // When < SIZE_MAX, attempt to truncate the WAL to this size on close,
+    // because a bad entry was written to it beyond that point and it likely
+    // won't be recoverable with the bad entry.
+    uint64_t attempt_truncate_size = SIZE_MAX;
   };
 
-  struct LogContext {
-    explicit LogContext(bool need_sync = false)
-        : need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
-    bool need_log_sync = false;
-    bool need_log_dir_sync = false;
+  struct WalContext {
+    explicit WalContext(bool need_sync = false)
+        : need_wal_sync(need_sync), need_wal_dir_sync(need_sync) {}
+    bool need_wal_sync = false;
+    bool need_wal_dir_sync = false;
     log::Writer* writer = nullptr;
-    LogFileNumberSize* log_file_number_size = nullptr;
+    WalFileNumberSize* wal_file_number_size = nullptr;
+    uint64_t prev_size = SIZE_MAX;
   };
 
   // PurgeFileInfo is a structure to hold information of files to be deleted in
@@ -1929,12 +1966,19 @@ class DBImpl : public DB {
   };
   struct PrepickedCompaction {
     // background compaction takes ownership of `compaction`.
+    // TODO(hx235): consider using std::shared_ptr for easier ownership
+    // management
     Compaction* compaction;
     // caller retains ownership of `manual_compaction_state` as it is reused
     // across background compactions.
     ManualCompactionState* manual_compaction_state;  // nullptr if non-manual
     // task limiter token is requested during compaction picking.
     std::unique_ptr<TaskLimiterToken> task_token;
+    // If true, `compaction` is picked temporarily to express compaction intent
+    // and will be released before re-picking a real compaction based on the
+    // updated LSM shape when thread associated with `compaction` is ready to
+    // run
+    bool need_repick;
   };
 
   struct CompactionArg {
@@ -1979,7 +2023,7 @@ class DBImpl : public DB {
 
   // Follow-up work to user creating a column family or (families)
   Status WrapUpCreateColumnFamilies(
-      const ReadOptions& read_options, const WriteOptions& write_options,
+      const WriteOptions& write_options,
       const std::vector<const ColumnFamilyOptions*>& cf_options);
 
   Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
@@ -2025,14 +2069,13 @@ class DBImpl : public DB {
   // Flush the in-memory write buffer to storage.  Switches to a new
   // log-file/memtable and writes a new descriptor iff successful. Then
   // installs a new super version for the column family.
-  Status FlushMemTableToOutputFile(
-      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
-      bool* madeProgress, JobContext* job_context, FlushReason flush_reason,
-      SuperVersionContext* superversion_context,
-      std::vector<SequenceNumber>& snapshot_seqs,
-      SequenceNumber earliest_write_conflict_snapshot,
-      SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
-      Env::Priority thread_pri);
+  Status FlushMemTableToOutputFile(ColumnFamilyData* cfd,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   bool* madeProgress, JobContext* job_context,
+                                   FlushReason flush_reason,
+                                   SuperVersionContext* superversion_context,
+                                   LogBuffer* log_buffer,
+                                   Env::Priority thread_pri);
 
   // Flush the memtables of (multiple) column families to multiple files on
   // persistent storage.
@@ -2045,10 +2088,10 @@ class DBImpl : public DB {
       JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
 
   // REQUIRES: log_numbers are sorted in ascending order
-  // corrupted_log_found is set to true if we recover from a corrupted log file.
+  // corrupted_wal_found is set to true if we recover from a corrupted log file.
   Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
                          SequenceNumber* next_sequence, bool read_only,
-                         bool is_retry, bool* corrupted_log_found,
+                         bool is_retry, bool* corrupted_wal_found,
                          RecoveryContext* recovery_ctx);
 
   void SetupLogFilesRecovery(
@@ -2138,6 +2181,11 @@ class DBImpl : public DB {
       bool flushed, std::unordered_map<int, VersionEdit>* version_edits,
       RecoveryContext* recovery_ctx);
 
+  // Check that DB sequence number is not set back during recovery between
+  // replaying of WAL files and between replaying of WriteBatches.
+  Status CheckSeqnoNotSetBackDuringRecovery(SequenceNumber prev_next_seqno,
+                                            SequenceNumber current_next_seqno);
+
   void FinishLogFilesRecovery(int job_id, const Status& status);
   // The following two methods are used to flush a memtable to
   // storage. The first one is used at database RecoveryTime (when the
@@ -2151,12 +2199,12 @@ class DBImpl : public DB {
   // log file to its actual size, thereby freeing preallocated space.
   // Return success even if truncate fails
   Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
-                                    LogFileNumberSize* log);
+                                    WalFileNumberSize* log);
 
-  // Restore alive_log_files_ and total_log_size_ after recovery.
+  // Restore alive_wal_files_ and wals_total_size_ after recovery.
   // It needs to run only when there's no flush during recovery
   // (e.g. avoid_flush_during_recovery=true). May also trigger flush
-  // in case total_log_size > max_total_wal_size.
+  // in case wals_total_size > max_total_wal_size.
   Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
 
   // num_bytes: for slowdown case, delay time is calculated based on
@@ -2305,7 +2353,7 @@ class DBImpl : public DB {
 
   // REQUIRES: mutex locked
   Status PreprocessWrite(const WriteOptions& write_options,
-                         LogContext* log_context, WriteContext* write_context);
+                         WalContext* log_context, WriteContext* write_context);
 
   // Merge write batches in the write group into merged_batch.
   // Returns OK if merge is successful.
@@ -2316,20 +2364,21 @@ class DBImpl : public DB {
 
   IOStatus WriteToWAL(const WriteBatch& merged_batch,
                       const WriteOptions& write_options,
-                      log::Writer* log_writer, uint64_t* log_used,
+                      log::Writer* log_writer, uint64_t* wal_used,
                       uint64_t* log_size,
-                      LogFileNumberSize& log_file_number_size,
+                      WalFileNumberSize& wal_file_number_size,
                       SequenceNumber sequence);
 
-  IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
-                      log::Writer* log_writer, uint64_t* log_used,
-                      bool need_log_sync, bool need_log_dir_sync,
-                      SequenceNumber sequence,
-                      LogFileNumberSize& log_file_number_size);
+  IOStatus WriteGroupToWAL(const WriteThread::WriteGroup& write_group,
+                           log::Writer* log_writer, uint64_t* wal_used,
+                           bool need_wal_sync, bool need_wal_dir_sync,
+                           SequenceNumber sequence,
+                           WalFileNumberSize& wal_file_number_size);
 
-  IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
-                                uint64_t* log_used,
-                                SequenceNumber* last_sequence, size_t seq_inc);
+  IOStatus ConcurrentWriteGroupToWAL(const WriteThread::WriteGroup& write_group,
+                                     uint64_t* wal_used,
+                                     SequenceNumber* last_sequence,
+                                     size_t seq_inc);
 
   // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
   // Caller must hold mutex_.
@@ -2343,7 +2392,7 @@ class DBImpl : public DB {
   void WALIOStatusCheck(const IOStatus& status);
 
   // Used by WriteImpl to update bg_error_ in case of memtable insert error.
-  void MemTableInsertStatusCheck(const Status& memtable_insert_status);
+  void HandleMemTableInsertFailure(const Status& nonok_memtable_insert_status);
 
   Status CompactFilesImpl(const CompactionOptions& compact_options,
                           ColumnFamilyData* cfd, Version* version,
@@ -2353,6 +2402,14 @@ class DBImpl : public DB {
                           JobContext* job_context, LogBuffer* log_buffer,
                           CompactionJobInfo* compaction_job_info);
 
+  // Helper function to perform trivial move by updating manifest metadata
+  // without rewriting data files. This is called when IsTrivialMove() is true.
+  // REQUIRES: mutex held
+  // Returns: Status of the trivial move operation
+  Status PerformTrivialMove(Compaction& c, LogBuffer* log_buffer,
+                            bool& compaction_released, size_t& moved_files,
+                            size_t& moved_bytes);
+
   // REQUIRES: mutex unlocked
   void TrackOrUntrackFiles(const std::vector<std::string>& existing_data_files,
                            bool track);
@@ -2428,6 +2485,8 @@ class DBImpl : public DB {
                          bool* flush_rescheduled_to_retain_udt,
                          Env::Priority thread_pri);
 
+  Compaction* CreateIntendedCompactionForwardedToBottomPriorityPool(
+      Compaction* c);
   bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
                                const std::vector<CompactionInputFiles>& inputs,
                                bool* sfm_bookkeeping, LogBuffer* log_buffer);
@@ -2450,9 +2509,7 @@ class DBImpl : public DB {
   // Cancel scheduled periodic tasks
   Status CancelPeriodicTaskScheduler();
 
-  Status RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
-                                       const WriteOptions& write_options,
-                                       bool is_new_db);
+  Status RegisterRecordSeqnoTimeWorker();
 
   void PrintStatistics();
 
@@ -2518,12 +2575,21 @@ class DBImpl : public DB {
 
   // Background threads call this function, which is just a wrapper around
   // the InstallSuperVersion() function. Background threads carry
-  // sv_context which can have new_superversion already
-  // allocated.
+  // sv_context to allow allocation of SuperVersion object outside of holding
+  // the DB mutex.
   // All ColumnFamily state changes go through this function. Here we analyze
   // the new state and we schedule background work if we detect that the new
   // state needs flush or compaction.
-  void InstallSuperVersionAndScheduleWork(ColumnFamilyData* cfd,
+  // See also InstallSuperVersionForConfigChange().
+  void InstallSuperVersionAndScheduleWork(
+      ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+      std::optional<std::shared_ptr<SeqnoToTimeMapping>>
+          new_seqno_to_time_mapping = {});
+
+  // A variant of InstallSuperVersionAndScheduleWork() that must be used for
+  // new CFs or for changes to mutable_cf_options. This is so that it can
+  // update seqno_to_time_mapping cached for the new SuperVersion as relevant.
+  void InstallSuperVersionForConfigChange(ColumnFamilyData* cfd,
                                           SuperVersionContext* sv_context);
 
   bool GetIntPropertyInternal(ColumnFamilyData* cfd,
@@ -2538,7 +2604,7 @@ class DBImpl : public DB {
   bool ShouldntRunManualCompaction(ManualCompactionState* m);
   bool HaveManualCompaction(ColumnFamilyData* cfd);
   bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
-  void UpdateDeletionCompactionStats(const std::unique_ptr<Compaction>& c);
+  void UpdateFIFOCompactionStatus(const std::unique_ptr<Compaction>& c);
 
   // May open and read table files for table property.
   // Should not be called while holding mutex_.
@@ -2688,8 +2754,13 @@ class DBImpl : public DB {
       const std::vector<ColumnFamilyHandle*>& column_families,
       ErrorIteratorFuncType error_iterator_func);
 
+  bool ShouldPickCompaction(bool is_prepicked,
+                            const PrepickedCompaction* prepicked_compaction);
+
+  void ResetBottomPriCompactionIntent(ColumnFamilyData* cfd,
+                                      std::unique_ptr<Compaction>& c);
   // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
-  FileLock* db_lock_;
+  FileLock* db_lock_ = nullptr;
 
   // Guards changes to DB and CF options to ensure consistency between
   // * In-memory options objects
@@ -2703,20 +2774,28 @@ class DBImpl : public DB {
   // Guards reads and writes to in-memory stats_history_.
   InstrumentedMutex stats_history_mutex_;
 
-  // In addition to mutex_, log_write_mutex_ protects writes to logs_ and
-  // logfile_number_. With two_write_queues it also protects alive_log_files_,
-  // and log_empty_. Refer to the definition of each variable below for more
+  // In addition to mutex_, wal_write_mutex_ protects writes to logs_ and
+  // cur_wal_number_. With two_write_queues it also protects alive_wal_files_,
+  // and wal_empty_. Refer to the definition of each variable below for more
   // details.
-  // Note: to avoid deadlock, if needed to acquire both log_write_mutex_ and
-  // mutex_, the order should be first mutex_ and then log_write_mutex_.
-  InstrumentedMutex log_write_mutex_;
+  // Note: to avoid deadlock, if needed to acquire both wal_write_mutex_ and
+  // mutex_, the order should be first mutex_ and then wal_write_mutex_.
+  InstrumentedMutex wal_write_mutex_;
 
   // If zero, manual compactions are allowed to proceed. If non-zero, manual
   // compactions may still be running, but will quickly fail with
   // `Status::Incomplete`. The value indicates how many threads have paused
   // manual compactions. It is accessed in read mode outside the DB mutex in
   // compaction code paths.
-  std::atomic<int> manual_compaction_paused_;
+  std::atomic<int> manual_compaction_paused_ = false;
+
+  // If non-zero, all compaction jobs (background automatic compactions,
+  // manual compactions via CompactRange, and foreground CompactFiles calls)
+  // are being aborted. Compactions will be signaled to stop. Any new
+  // compaction job would fail immediately. The value indicates how many threads
+  // have called AbortAllCompactions(). It is accessed in read mode outside the
+  // DB mutex in compaction code paths.
+  std::atomic<int> compaction_aborted_ = 0;
 
   // This condition variable is signaled on these conditions:
   // * whenever bg_compaction_scheduled_ goes down to 0
@@ -2732,106 +2811,114 @@ class DBImpl : public DB {
   // * whenever SetOptions successfully updates options.
   // * whenever a column family is dropped.
   InstrumentedCondVar bg_cv_;
-  // Writes are protected by locking both mutex_ and log_write_mutex_, and reads
-  // must be under either mutex_ or log_write_mutex_. Since after ::Open,
-  // logfile_number_ is currently updated only in write_thread_, it can be read
+
+  ColumnFamilyHandleImpl* persist_stats_cf_handle_ = nullptr;
+
+  bool persistent_stats_cfd_exists_ = true;
+
+  // Writes are protected by locking both mutex_ and wal_write_mutex_, and reads
+  // must be under either mutex_ or wal_write_mutex_. Since after ::Open,
+  // cur_wal_number_ is currently updated only in write_thread_, it can be read
   // from the same write_thread_ without any locks.
-  uint64_t logfile_number_;
+  uint64_t cur_wal_number_ = 0;
+
   // Log files that we can recycle. Must be protected by db mutex_.
-  std::deque<uint64_t> log_recycle_files_;
+  std::deque<uint64_t> wal_recycle_files_;
+
   // The minimum log file number taht can be recycled, if log recycling is
   // enabled. This is used to ensure that log files created by previous
   // instances of the database are not recycled, as we cannot be sure they
   // were created in the recyclable format.
-  uint64_t min_log_number_to_recycle_;
-  // Protected by log_write_mutex_.
-  bool log_dir_synced_;
-  // Without two_write_queues, read and writes to log_empty_ are protected by
+  uint64_t min_wal_number_to_recycle_ = 0;
+
+  // Protected by wal_write_mutex_.
+  bool wal_dir_synced_ = false;
+
+  // Without two_write_queues, read and writes to wal_empty_ are protected by
   // mutex_. Since it is currently updated/read only in write_thread_, it can be
   // accessed from the same write_thread_ without any locks. With
   // two_write_queues writes, where it can be updated in different threads,
-  // read and writes are protected by log_write_mutex_ instead. This is to avoid
-  // expensive mutex_ lock during WAL write, which update log_empty_.
-  bool log_empty_;
-
-  ColumnFamilyHandleImpl* persist_stats_cf_handle_;
-
-  bool persistent_stats_cfd_exists_ = true;
+  // read and writes are protected by wal_write_mutex_ instead. This is to avoid
+  // expensive mutex_ lock during WAL write, which update wal_empty_.
+  bool wal_empty_ = true;
 
   // The current WAL file and those that have not been found obsolete from
   // memtable flushes. A WAL not on this list might still be pending writer
-  // flush and/or sync and close and might still be in logs_. alive_log_files_
-  // is protected by mutex_ and log_write_mutex_ with details as follows:
+  // flush and/or sync and close and might still be in logs_. alive_wal_files_
+  // is protected by mutex_ and wal_write_mutex_ with details as follows:
   // 1. read by FindObsoleteFiles() which can be called in either application
-  //    thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
+  //    thread or RocksDB bg threads, both mutex_ and wal_write_mutex_ are
   //    held.
-  // 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_
+  // 2. pop_front() by FindObsoleteFiles(), both mutex_ and wal_write_mutex_
   //    are held.
   // 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles()
   //    (actually called by Open()), only mutex_ is held because at this point,
   //    the DB::Open() call has not returned success to application, and the
   //    only other thread(s) that can conflict are bg threads calling
-  //    FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_
-  //    are held when accessing alive_log_files_.
+  //    FindObsoleteFiles() which ensure that both mutex_ and wal_write_mutex_
+  //    are held when accessing alive_wal_files_.
   // 4. read by DBImpl::Open() is protected by mutex_.
-  // 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are
+  // 5. push_back() by SwitchMemtable(). Both mutex_ and wal_write_mutex_ are
   //    held. This is done by the write group leader. Note that in the case of
   //    two-write-queues, another WAL-only write thread can be writing to the
   //    WAL concurrently. See 9.
-  // 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is
+  // 6. read by SwitchWAL() with both mutex_ and wal_write_mutex_ held. This is
   //    done by write group leader.
   // 7. read by ConcurrentWriteToWAL() by the write group leader in the case of
-  //    two-write-queues. Only log_write_mutex_ is held to protect concurrent
+  //    two-write-queues. Only wal_write_mutex_ is held to protect concurrent
   //    pop_front() by FindObsoleteFiles().
-  // 8. read by PreprocessWrite() by the write group leader. log_write_mutex_
+  // 8. read by PreprocessWrite() by the write group leader. wal_write_mutex_
   //    is held to protect the data structure from concurrent pop_front() by
   //    FindObsoleteFiles().
   // 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case
-  //    of two-write-queues. Only log_write_mutex_ is held. This suffices to
+  //    of two-write-queues. Only wal_write_mutex_ is held. This suffices to
   //    protect the data structure from concurrent push_back() by current
   //    write group leader as well as pop_front() by FindObsoleteFiles().
-  std::deque<LogFileNumberSize> alive_log_files_;
+  std::deque<WalFileNumberSize> alive_wal_files_;
+
+  // Total size of all "alive" WALs (for easy access without synchronization)
+  RelaxedAtomic<uint64_t> wals_total_size_{0};
 
   // Log files that aren't fully synced, and the current log file.
   // Synchronization:
   // 1. read by FindObsoleteFiles() which can be called either in application
-  //    thread or RocksDB bg threads. log_write_mutex_ is always held, while
+  //    thread or RocksDB bg threads. wal_write_mutex_ is always held, while
   //    some reads are performed without mutex_.
-  // 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held.
-  // 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_.
-  // 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex.
+  // 2. pop_front() by FindObsoleteFiles() with only wal_write_mutex_ held.
+  // 3. read by DBImpl::Open() with both mutex_ and wal_write_mutex_.
+  // 4. emplace_back() by DBImpl::Open() with both mutex_ and wal_write_mutex.
   //    Note that at this point, DB::Open() has not returned success to
   //    application, thus the only other thread(s) that can conflict are bg
   //    threads calling FindObsoleteFiles(). See 1.
-  // 5. iteration and clear() from CloseHelper() always hold log_write_mutex
+  // 5. iteration and clear() from CloseHelper() always hold wal_write_mutex
   //    and mutex_.
   // 6. back() called by APIs FlushWAL() and LockWAL() are protected by only
-  //    log_write_mutex_. These two can be called by application threads after
+  //    wal_write_mutex_. These two can be called by application threads after
   //    DB::Open() returns success to applications.
-  // 7. read by SyncWAL(), another API, protected by only log_write_mutex_.
+  // 7. read by SyncWAL(), another API, protected by only wal_write_mutex_.
   // 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by
-  //    log_write_mutex_.
-  // 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
-  // 10. read by SyncClosedWals() protected by only log_write_mutex_. This can
+  //    wal_write_mutex_.
+  // 9. erase() by MarkLogsSynced() protected by wal_write_mutex_.
+  // 10. read by SyncClosedWals() protected by only wal_write_mutex_. This can
   //     happen in bg flush threads after DB::Open() returns success to
   //     applications.
   // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
-  //     holds only the log_write_mutex_. This is done by the write group
+  //     holds only the wal_write_mutex_. This is done by the write group
   //     leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced()
-  //     can happen concurrently. This is fine because log_write_mutex_ is used
+  //     can happen concurrently. This is fine because wal_write_mutex_ is used
   //     by all parties. See 2, 5, 9.
   // 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and
-  //     log_write_mutex_. This happens in the write group leader.
+  //     wal_write_mutex_. This happens in the write group leader.
   // 13. emplace_back() by SwitchMemtable() hold both mutex_ and
-  //     log_write_mutex_. This happens in the write group leader. Can conflict
+  //     wal_write_mutex_. This happens in the write group leader. Can conflict
   //     with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
   //     SyncClosedWals(), etc. as well as application threads calling
   //     FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
-  //     require at least log_write_mutex_.
+  //     require at least wal_write_mutex_.
   // 14. iteration called in WriteToWAL(write_group) protected by
-  //     log_write_mutex_. This is done by write group leader when
+  //     wal_write_mutex_. This is done by write group leader when
   //     two-write-queues is disabled and write needs to sync logs.
-  // 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_.
+  // 15. back() called in ConcurrentWriteToWAL() protected by wal_write_mutex_.
   //     This can be done by the write group leader if two-write-queues is
   //     enabled. It can also be done by another WAL-only write thread.
   //
@@ -2848,23 +2935,22 @@ class DBImpl : public DB {
   std::deque<LogWriterNumber> logs_;
 
   // Signaled when getting_synced becomes false for some of the logs_.
-  InstrumentedCondVar log_sync_cv_;
+  InstrumentedCondVar wal_sync_cv_;
   // This is the app-level state that is written to the WAL but will be used
   // only during recovery. Using this feature enables not writing the state to
   // memtable on normal writes and hence improving the throughput. Each new
   // write of the state will replace the previous state entirely even if the
   // keys in the two consecutive states do not overlap.
-  // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
+  // It is protected by wal_write_mutex_ when two_write_queues_ is enabled.
   // Otherwise only the heaad of write_thread_ can access it.
   WriteBatch cached_recoverable_state_;
   std::atomic<bool> cached_recoverable_state_empty_ = {true};
-  std::atomic<uint64_t> total_log_size_;
 
   // If this is non-empty, we need to delete these log files in background
-  // threads. Protected by log_write_mutex_.
-  autovector<log::Writer*> logs_to_free_;
+  // threads. Protected by wal_write_mutex_.
+  autovector<log::Writer*> wals_to_free_;
 
-  bool is_snapshot_supported_;
+  bool is_snapshot_supported_ = true;
 
   std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
 
@@ -2888,7 +2974,7 @@ class DBImpl : public DB {
   // sleep if it uses up the quota.
   // Note: This is to protect memtable and compaction. If the batch only writes
   // to the WAL its size need not to be included in this.
-  uint64_t last_batch_group_size_;
+  uint64_t last_batch_group_size_ = 0;
 
   FlushScheduler flush_scheduler_;
 
@@ -2947,32 +3033,32 @@ class DBImpl : public DB {
   std::unordered_set<uint64_t> files_grabbed_for_purge_;
 
   // A queue to store log writers to close. Protected by db mutex_.
-  std::deque<log::Writer*> logs_to_free_queue_;
+  std::deque<log::Writer*> wals_to_free_queue_;
 
   std::deque<SuperVersion*> superversions_to_free_queue_;
 
-  int unscheduled_flushes_;
+  int unscheduled_flushes_ = 0;
 
-  int unscheduled_compactions_;
+  int unscheduled_compactions_ = 0;
 
   // count how many background compactions are running or have been scheduled in
   // the BOTTOM pool
-  int bg_bottom_compaction_scheduled_;
+  int bg_bottom_compaction_scheduled_ = 0;
 
   // count how many background compactions are running or have been scheduled
-  int bg_compaction_scheduled_;
+  int bg_compaction_scheduled_ = 0;
 
   // stores the number of compactions are currently running
-  int num_running_compactions_;
+  int num_running_compactions_ = 0;
 
   // number of background memtable flush jobs, submitted to the HIGH pool
-  int bg_flush_scheduled_;
+  int bg_flush_scheduled_ = 0;
 
   // stores the number of flushes are currently running
-  int num_running_flushes_;
+  int num_running_flushes_ = 0;
 
   // number of background obsolete file purge jobs, submitted to the HIGH pool
-  int bg_purge_scheduled_;
+  int bg_purge_scheduled_ = 0;
 
   std::deque<ManualCompactionState*> manual_compaction_dequeue_;
 
@@ -2982,11 +3068,11 @@ class DBImpl : public DB {
   // This enables two different threads to call
   // EnableFileDeletions() and DisableFileDeletions()
   // without any synchronization
-  int disable_delete_obsolete_files_;
+  int disable_delete_obsolete_files_ = 0;
 
   // Number of times FindObsoleteFiles has found deletable files and the
   // corresponding call to PurgeObsoleteFiles has not yet finished.
-  int pending_purge_obsolete_files_;
+  int pending_purge_obsolete_files_ = 0;
 
   // last time when DeleteObsoleteFiles with full scan was executed. Originally
   // initialized with startup time.
@@ -2998,12 +3084,12 @@ class DBImpl : public DB {
   // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
   std::mutex switch_mutex_;
   // Number of threads intending to write to memtable
-  std::atomic<size_t> pending_memtable_writes_ = {};
+  std::atomic<size_t> pending_memtable_writes_{0};
 
   // A flag indicating whether the current rocksdb database has any
   // data that is not yet persisted into either WAL or SST file.
   // Used when disableWAL is true.
-  std::atomic<bool> has_unpersisted_data_;
+  std::atomic<bool> has_unpersisted_data_{false};
 
   // if an attempt was made to flush all column families that
   // the oldest log depends on but uncommitted data in the oldest
@@ -3011,26 +3097,26 @@ class DBImpl : public DB {
   // We must attempt to free the dependent memtables again
   // at a later time after the transaction in the oldest
   // log is fully commited.
-  bool unable_to_release_oldest_log_;
+  bool unable_to_release_oldest_log_{false};
 
   // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
   // calls.
   // REQUIRES: mutex held
-  int num_running_ingest_file_;
+  int num_running_ingest_file_ = 0;
 
   WalManager wal_manager_;
 
   // A value of > 0 temporarily disables scheduling of background work
-  int bg_work_paused_;
+  int bg_work_paused_ = 0;
 
   // A value of > 0 temporarily disables scheduling of background compaction
-  int bg_compaction_paused_;
+  int bg_compaction_paused_ = 0;
 
   // Guard against multiple concurrent refitting
-  bool refitting_level_;
+  bool refitting_level_ = false;
 
   // Indicate DB was opened successfully
-  bool opened_successfully_;
+  bool opened_successfully_ = false;
 
   // The min threshold to triggere bottommost compaction for removing
   // garbages, among all column families.
@@ -3076,13 +3162,13 @@ class DBImpl : public DB {
   // error recovery from going on in parallel. The latter, shutting_down_,
   // is set a little later during the shutdown after scheduling memtable
   // flushes
-  std::atomic<bool> shutdown_initiated_;
+  std::atomic<bool> shutdown_initiated_{false};
   // Flag to indicate whether sst_file_manager object was allocated in
   // DB::Open() or passed to us
   bool own_sfm_;
 
   // Flag to check whether Close() has been called on this DB
-  bool closed_;
+  bool closed_ = false;
   // save the closing status, for re-calling the close()
   Status closing_status_;
   // mutex for DB::Close()
@@ -3118,7 +3204,7 @@ class DBImpl : public DB {
 
   // The number of LockWAL called without matching UnlockWAL call.
   // See also lock_wal_write_token_
-  uint32_t lock_wal_count_;
+  uint32_t lock_wal_count_ = 0;
 };
 
 class GetWithTimestampReadCallback : public ReadCallback {
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 0cbb6c79e382..ab136b57b505 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -13,12 +13,14 @@
 #include "db/db_impl/db_impl.h"
 #include "db/error_handler.h"
 #include "db/event_helpers.h"
+#include "file/file_util.h"
 #include "file/sst_file_manager_impl.h"
 #include "logging/logging.h"
 #include "monitoring/iostats_context_imp.h"
 #include "monitoring/perf_context_imp.h"
 #include "monitoring/thread_status_updater.h"
 #include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
 #include "rocksdb/file_system.h"
 #include "rocksdb/io_status.h"
 #include "rocksdb/options.h"
@@ -143,10 +145,7 @@ IOStatus DBImpl::SyncClosedWals(const WriteOptions& write_options,
 Status DBImpl::FlushMemTableToOutputFile(
     ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
     bool* made_progress, JobContext* job_context, FlushReason flush_reason,
-    SuperVersionContext* superversion_context,
-    std::vector<SequenceNumber>& snapshot_seqs,
-    SequenceNumber earliest_write_conflict_snapshot,
-    SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+    SuperVersionContext* superversion_context, LogBuffer* log_buffer,
     Env::Priority thread_pri) {
   mutex_.AssertHeld();
   assert(cfd);
@@ -168,7 +167,7 @@ Status DBImpl::FlushMemTableToOutputFile(
   // had not been committed yet. Make sure we sync them to keep the persisted
   // WAL state at least as new as the persisted SST state.
   const bool needs_to_sync_closed_wals =
-      logfile_number_ > 0 &&
+      cur_wal_number_ > 0 &&
       (versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1 ||
        allow_2pc());
 
@@ -210,7 +209,6 @@ Status DBImpl::FlushMemTableToOutputFile(
   FlushJob flush_job(
       dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id,
       file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
-      snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
       job_context, flush_reason, log_buffer, directories_.GetDbDir(),
       GetDataDir(cfd, 0U),
       GetCompressionFlush(cfd->ioptions(), mutable_cf_options), stats_,
@@ -224,7 +222,7 @@ Status DBImpl::FlushMemTableToOutputFile(
   bool need_cancel = false;
   IOStatus log_io_s = IOStatus::OK();
   if (needs_to_sync_closed_wals) {
-    // SyncClosedWals() may unlock and re-lock the log_write_mutex multiple
+    // SyncClosedWals() may unlock and re-lock the wal_write_mutex multiple
     // times.
     VersionEdit synced_wals;
     bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress();
@@ -395,11 +393,8 @@ Status DBImpl::FlushMemTablesToOutputFiles(
         bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
   }
   assert(bg_flush_args.size() == 1);
-  std::vector<SequenceNumber> snapshot_seqs;
-  SequenceNumber earliest_write_conflict_snapshot;
-  SnapshotChecker* snapshot_checker;
-  GetSnapshotContext(job_context, &snapshot_seqs,
-                     &earliest_write_conflict_snapshot, &snapshot_checker);
+  InitSnapshotContext(job_context);
+
   const auto& bg_flush_arg = bg_flush_args[0];
   ColumnFamilyData* cfd = bg_flush_arg.cfd_;
   // intentional infrequent copy for each flush
@@ -410,8 +405,7 @@ Status DBImpl::FlushMemTablesToOutputFiles(
   FlushReason flush_reason = bg_flush_arg.flush_reason_;
   Status s = FlushMemTableToOutputFile(
       cfd, mutable_cf_options_copy, made_progress, job_context, flush_reason,
-      superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
-      snapshot_checker, log_buffer, thread_pri);
+      superversion_context, log_buffer, thread_pri);
   return s;
 }
 
@@ -446,12 +440,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
   }
 #endif /* !NDEBUG */
 
-  std::vector<SequenceNumber> snapshot_seqs;
-  SequenceNumber earliest_write_conflict_snapshot;
-  SnapshotChecker* snapshot_checker;
-  GetSnapshotContext(job_context, &snapshot_seqs,
-                     &earliest_write_conflict_snapshot, &snapshot_checker);
-
+  InitSnapshotContext(job_context);
   autovector<FSDirectory*> distinct_output_dirs;
   autovector<std::string> distinct_output_dir_paths;
   std::vector<std::unique_ptr<FlushJob>> jobs;
@@ -485,8 +474,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
     jobs.emplace_back(new FlushJob(
         dbname_, cfd, immutable_db_options_, mutable_cf_options,
         max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_,
-        &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
-        snapshot_checker, job_context, flush_reason, log_buffer,
+        &shutting_down_, job_context, flush_reason, log_buffer,
         directories_.GetDbDir(), data_dir,
         GetCompressionFlush(cfd->ioptions(), mutable_cf_options), stats_,
         &event_logger_, mutable_cf_options.report_bg_io_stats,
@@ -512,7 +500,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
                        job_context->job_id, flush_reason);
   }
 
-  if (logfile_number_ > 0) {
+  if (cur_wal_number_ > 0) {
     // TODO (yanqin) investigate whether we should sync the closed logs for
     // single column family case.
     VersionEdit synced_wals;
@@ -528,7 +516,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
 
     if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
         !log_io_s.IsColumnFamilyDropped()) {
-      if (total_log_size_ > 0) {
+      if (wals_total_size_.LoadRelaxed() > 0) {
         error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
       } else {
         // If the WAL is empty, we use different error reason
@@ -967,6 +955,10 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options,
     return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
   }
 
+  if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kCompactionAborted);
+  }
+
   if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
     return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
   }
@@ -981,7 +973,8 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options,
 
   std::string begin_str, end_str;
   auto [begin, end] =
-      MaybeAddTimestampsToRange(begin_without_ts, end_without_ts, ts_sz,
+      MaybeAddTimestampsToRange(OptSlice::CopyFromPtr(begin_without_ts),
+                                OptSlice::CopyFromPtr(end_without_ts), ts_sz,
                                 &begin_str, &end_str, false /*exclusive_end*/);
 
   return CompactRangeInternal(
@@ -1122,8 +1115,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
       cfd->NumberLevels() > 1) {
     // Always compact all files together.
     final_output_level = cfd->NumberLevels() - 1;
-    // if bottom most level is reserved
-    if (immutable_db_options_.allow_ingest_behind) {
+    if (cfd->AllowIngestBehind()) {
       final_output_level--;
     }
     s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
@@ -1392,6 +1384,9 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
   TEST_SYNC_POINT_CALLBACK("TestCompactFiles:PausingManualCompaction:3",
                            static_cast<void*>(const_cast<std::atomic<int>*>(
                                &manual_compaction_paused_)));
+  TEST_SYNC_POINT_CALLBACK("TestCancelCompactFiles:SuccessfulCompaction",
+                           static_cast<void*>(const_cast<std::atomic<int>*>(
+                               &manual_compaction_paused_)));
   {
     InstrumentedMutexLock l(&mutex_);
     auto* current = cfd->current();
@@ -1433,6 +1428,57 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
   return s;
 }
 
+Status DBImpl::PerformTrivialMove(Compaction& c, LogBuffer* log_buffer,
+                                  bool& compaction_released,
+                                  size_t& moved_files, size_t& moved_bytes) {
+  mutex_.AssertHeld();
+
+  ROCKS_LOG_BUFFER(log_buffer, "[%s] Moving %d files to level-%d\n",
+                   c.column_family_data()->GetName().c_str(),
+                   static_cast<int>(c.num_input_files(0)), c.output_level());
+
+  // Move files to the output level by editing the manifest
+  for (unsigned int l = 0; l < c.num_input_levels(); l++) {
+    if (c.level(l) == c.output_level()) {
+      continue;
+    }
+    for (size_t i = 0; i < c.num_input_files(l); i++) {
+      FileMetaData* f = c.input(l, i);
+      c.edit()->DeleteFile(c.level(l), f->fd.GetNumber());
+      c.edit()->AddFile(c.output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
+                        f->fd.GetFileSize(), f->smallest, f->largest,
+                        f->fd.smallest_seqno, f->fd.largest_seqno,
+                        f->marked_for_compaction, f->temperature,
+                        f->oldest_blob_file_number, f->oldest_ancester_time,
+                        f->file_creation_time, f->epoch_number,
+                        f->file_checksum, f->file_checksum_func_name,
+                        f->unique_id, f->compensated_range_deletion_size,
+                        f->tail_size, f->user_defined_timestamps_persisted,
+                        f->min_timestamp, f->max_timestamp);
+      moved_bytes += static_cast<size_t>(c.input(l, i)->fd.GetFileSize());
+      ROCKS_LOG_BUFFER(
+          log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+          c.column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+          c.output_level(), f->fd.GetFileSize());
+    }
+    moved_files += c.num_input_files(l);
+  }
+
+  // Install the new version
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+  const WriteOptions write_options(Env::IOActivity::kCompaction);
+  Status status = versions_->LogAndApply(
+      c.column_family_data(), read_options, write_options, c.edit(), &mutex_,
+      directories_.GetDbDir(), /*new_descriptor_log=*/false,
+      /*column_family_options=*/nullptr,
+      [&c, &compaction_released](const Status& s) {
+        c.ReleaseCompactionFiles(s);
+        compaction_released = true;
+      });
+
+  return status;
+}
+
 Status DBImpl::CompactFilesImpl(
     const CompactionOptions& compact_options, ColumnFamilyData* cfd,
     Version* version, const std::vector<std::string>& input_file_names,
@@ -1444,7 +1490,17 @@ Status DBImpl::CompactFilesImpl(
   if (shutting_down_.load(std::memory_order_acquire)) {
     return Status::ShutdownInProgress();
   }
-  if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+
+  // triggered by AbortAllCompactions
+  if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kCompactionAborted);
+  }
+
+  // triggered by DisableManualCompactions or by user-set canceled flag in
+  // CompactionOptions
+  if (manual_compaction_paused_.load(std::memory_order_acquire) > 0 ||
+      (compact_options.canceled &&
+       compact_options.canceled->load(std::memory_order_acquire))) {
     return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
   }
 
@@ -1463,7 +1519,7 @@ Status DBImpl::CompactFilesImpl(
     }
   }
 
-  if (cfd->ioptions().allow_ingest_behind &&
+  if (cfd->AllowIngestBehind() &&
       output_level >= cfd->ioptions().num_levels - 1) {
     return Status::InvalidArgument(
         "Exceed the maximum output level defined by "
@@ -1503,7 +1559,7 @@ Status DBImpl::CompactFilesImpl(
 
   std::unique_ptr<Compaction> c;
   assert(cfd->compaction_picker());
-  c.reset(cfd->compaction_picker()->CompactFiles(
+  c.reset(cfd->compaction_picker()->PickCompactionForCompactFiles(
       compact_options, input_files, output_level, version->storage_info(),
       cfd->GetLatestMutableCFOptions(), mutable_db_options_, output_path_id));
   // we already sanitized the set of input files and checked for conflicts
@@ -1515,11 +1571,64 @@ Status DBImpl::CompactFilesImpl(
   // deletion compaction currently not allowed in CompactFiles.
   assert(!c->deletion_compaction());
 
-  std::vector<SequenceNumber> snapshot_seqs;
-  SequenceNumber earliest_write_conflict_snapshot;
-  SnapshotChecker* snapshot_checker;
-  GetSnapshotContext(job_context, &snapshot_seqs,
-                     &earliest_write_conflict_snapshot, &snapshot_checker);
+  // Check if this can be a trivial move (metadata-only update)
+  // Similar to the logic in DBImpl::BackgroundCompaction
+  // Note: We disable trivial move when compaction_service is present because
+  // the service expects all compactions to go through CompactionJob for
+  // tracking
+  bool is_trivial_move = compact_options.allow_trivial_move &&
+                         c->IsTrivialMove() &&
+                         immutable_db_options().compaction_service == nullptr;
+
+  if (is_trivial_move) {
+    // Perform trivial move: just update manifest without rewriting data
+    TEST_SYNC_POINT("DBImpl::CompactFilesImpl:TrivialMove");
+
+    bool compaction_released = false;
+    size_t moved_files = 0;
+    size_t moved_bytes = 0;
+    Status status = PerformTrivialMove(
+        *c.get(), log_buffer, compaction_released, moved_files, moved_bytes);
+
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(
+          c->column_family_data(), job_context->superversion_contexts.data());
+
+      // Populate output file names for trivial move
+      if (output_file_names != nullptr) {
+        for (const auto& newf : c->edit()->GetNewFiles()) {
+          output_file_names->push_back(TableFileName(
+              c->immutable_options().cf_paths, newf.second.fd.GetNumber(),
+              newf.second.fd.GetPathId()));
+        }
+      }
+
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Trivial move succeeded for %zu files, %zu bytes total\n",
+          c->column_family_data()->GetName().c_str(), moved_files, moved_bytes);
+    } else {
+      if (!compaction_released) {
+        c->ReleaseCompactionFiles(status);
+      }
+      ROCKS_LOG_BUFFER(log_buffer, "[%s] Trivial move failed: %s\n",
+                       c->column_family_data()->GetName().c_str(),
+                       status.ToString().c_str());
+      error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+    }
+
+    c.reset();
+    bg_compaction_scheduled_--;
+    if (bg_compaction_scheduled_ == 0) {
+      bg_cv_.SignalAll();
+    }
+    MaybeScheduleFlushOrCompaction();
+
+    return status;
+  }
+
+  // Not a trivial move, proceed with full compaction
+  InitSnapshotContext(job_context);
 
   std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
       new std::list<uint64_t>::iterator(
@@ -1533,22 +1642,21 @@ Status DBImpl::CompactFilesImpl(
       log_buffer, directories_.GetDbDir(),
       GetDataDir(c->column_family_data(), c->output_path_id()),
       GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_,
-      snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
       job_context, table_cache_, &event_logger_,
       c->mutable_cf_options().paranoid_file_checks,
       c->mutable_cf_options().report_bg_io_stats, dbname_,
       &compaction_job_stats, Env::Priority::USER, io_tracer_,
-      kManualCompactionCanceledFalse_, db_id_, db_session_id_,
-      c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
-      &blob_callback_, &bg_compaction_scheduled_,
+      kManualCompactionCanceledFalse_, compaction_aborted_, db_id_,
+      db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
+      c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_,
       &bg_bottom_compaction_scheduled_);
 
   // Creating a compaction influences the compaction score because the score
   // takes running compactions into account (by skipping files that are already
   // being compacted). Since we just changed compaction score, we recalculate it
   // here.
-  version->storage_info()->ComputeCompactionScore(cfd->ioptions(),
-                                                  c->mutable_cf_options());
+  version->storage_info()->ComputeCompactionScore(
+      cfd->ioptions(), c->mutable_cf_options(), cfd->GetFullHistoryTsLow());
 
   compaction_job.Prepare(std::nullopt /*subcompact to be computed*/);
 
@@ -1611,6 +1719,11 @@ Status DBImpl::CompactFilesImpl(
                    "[%s] [JOB %d] Stopping manual compaction",
                    c->column_family_data()->GetName().c_str(),
                    job_context->job_id);
+  } else if (status.IsCompactionAborted()) {
+    // Don't report aborted compaction as error
+    ROCKS_LOG_INFO(
+        immutable_db_options_.info_log, "[%s] [JOB %d] Compaction aborted",
+        c->column_family_data()->GetName().c_str(), job_context->job_id);
   } else {
     ROCKS_LOG_WARN(immutable_db_options_.info_log,
                    "[%s] [JOB %d] Compaction error: %s",
@@ -1695,11 +1808,13 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
   }
 
   c->SetNotifyOnCompactionCompleted();
+  int num_l0_files = c->input_version()->storage_info()->NumLevelFiles(0);
   // release lock while notifying events
   mutex_.Unlock();
   TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
   {
     CompactionJobInfo info{};
+    info.num_l0_files = num_l0_files;
     BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, &info);
     for (const auto& listener : immutable_db_options_.listeners) {
       listener->OnCompactionBegin(this, info);
@@ -1724,11 +1839,13 @@ void DBImpl::NotifyOnCompactionCompleted(
     return;
   }
 
+  int num_l0_files = cfd->current()->storage_info()->NumLevelFiles(0);
   // release lock while notifying events
   mutex_.Unlock();
   TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
   {
     CompactionJobInfo info{};
+    info.num_l0_files = num_l0_files;
     BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, &info);
     for (const auto& listener : immutable_db_options_.listeners) {
       listener->OnCompactionCompleted(this, info);
@@ -1848,15 +1965,13 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
         ,
         LLONG_MAX /* max compaction bytes, not applicable */,
         0 /* output path ID, not applicable */, mutable_cf_options.compression,
-        mutable_cf_options.compression_opts,
-        mutable_cf_options.default_write_temperature,
+        mutable_cf_options.compression_opts, Temperature::kUnknown,
         0 /* max_subcompactions, not applicable */,
         {} /* grandparents, not applicable */,
         std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
-        false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */,
-        false /* is deletion compaction, not applicable */,
-        false /* l0_files_might_overlap, not applicable */,
-        CompactionReason::kRefitLevel));
+        CompactionReason::kRefitLevel, "" /* trim_ts */,
+        -1 /* score, not applicable */,
+        false /* l0_files_might_overlap, not applicable */));
     cfd->compaction_picker()->RegisterCompaction(c.get());
     TEST_SYNC_POINT("DBImpl::ReFitLevel:PostRegisterCompaction");
     VersionEdit edit;
@@ -1871,7 +1986,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
           f->oldest_ancester_time, f->file_creation_time, f->epoch_number,
           f->file_checksum, f->file_checksum_func_name, f->unique_id,
           f->compensated_range_deletion_size, f->tail_size,
-          f->user_defined_timestamps_persisted);
+          f->user_defined_timestamps_persisted, f->min_timestamp,
+          f->max_timestamp);
     }
     ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
                     "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
@@ -1909,10 +2025,6 @@ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
   return cfh->cfd()->NumberLevels();
 }
 
-int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
-  return 0;
-}
-
 int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
   auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
   InstrumentedMutexLock l(&mutex_);
@@ -2068,6 +2180,17 @@ Status DBImpl::RunManualCompaction(
     return manual.status;
   }
 
+  if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+    // All compactions are being aborted. Return immediately.
+    int counter = compaction_aborted_.load(std::memory_order_acquire);
+    ROCKS_LOG_INFO(
+        immutable_db_options_.info_log,
+        "RunManualCompaction: Aborting due to compaction_aborted_=%d", counter);
+    manual.status = Status::Incomplete(Status::SubCode::kCompactionAborted);
+    manual.done = true;
+    return manual.status;
+  }
+
   // When a manual compaction arrives, temporarily disable scheduling of
   // non-manual compactions and wait until the number of scheduled compaction
   // jobs drops to zero. This used to be needed to ensure that this manual
@@ -2092,6 +2215,13 @@ Status DBImpl::RunManualCompaction(
     // and `CompactRangeOptions::canceled` might not work well together.
     while (bg_bottom_compaction_scheduled_ > 0 ||
            bg_compaction_scheduled_ > 0) {
+      if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+        // Pretend the error came from compaction so the below cleanup/error
+        // handling code can process it.
+        manual.done = true;
+        manual.status = Status::Incomplete(Status::SubCode::kCompactionAborted);
+        break;
+      }
       if (manual_compaction_paused_ > 0 || manual.canceled == true) {
         // Pretend the error came from compaction so the below cleanup/error
         // handling code can process it.
@@ -2182,6 +2312,7 @@ Status DBImpl::RunManualCompaction(
         // Don't throttle manual compaction, only count outstanding tasks.
         assert(false);
       }
+      ca->prepicked_compaction->need_repick = false;
       manual.incomplete = false;
       if (compaction->bottommost_level() &&
           env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
@@ -2209,7 +2340,12 @@ Status DBImpl::RunManualCompaction(
     if (!scheduled) {
       // There is nothing scheduled to wait on, so any cancellation can end the
       // manual now.
-      if (manual_compaction_paused_ > 0 || manual.canceled == true) {
+      if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+        // Stop waiting since it was canceled. Pretend the error came from
+        // compaction so the below cleanup/error handling code can process it.
+        manual.done = true;
+        manual.status = Status::Incomplete(Status::SubCode::kCompactionAborted);
+      } else if (manual_compaction_paused_ > 0 || manual.canceled == true) {
         // Stop waiting since it was canceled. Pretend the error came from
         // compaction so the below cleanup/error handling code can process it.
         manual.done = true;
@@ -2711,6 +2847,10 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
 // Finish waiting when ALL column families finish flushing memtables.
 // resuming_from_bg_err indicates whether the caller is trying to resume from
 // background error or in normal processing.
+// Note that the wait finishes when the flush result is installed to column
+// families' Versions and persisted in MANIFEST. It doesn't wait until
+// SuperVersion to reflect the flush result, except for the case when
+// flush_reason is `kExternalFileIngestion`.
 Status DBImpl::WaitForFlushMemTables(
     const autovector<ColumnFamilyData*>& cfds,
     const autovector<const uint64_t*>& flush_memtable_ids,
@@ -2784,16 +2924,8 @@ Status DBImpl::WaitForFlushMemTables(
 
 Status DBImpl::EnableAutoCompaction(
     const std::vector<ColumnFamilyHandle*>& column_family_handles) {
-  Status s;
-  for (auto cf_ptr : column_family_handles) {
-    Status status =
-        this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}});
-    if (!status.ok()) {
-      s = status;
-    }
-  }
-
-  return s;
+  return SetOptions(column_family_handles,
+                    {{"disable_auto_compactions", "false"}});
 }
 
 // NOTE: Calling DisableManualCompaction() may overwrite the
@@ -2831,6 +2963,61 @@ void DBImpl::EnableManualCompaction() {
   manual_compaction_paused_.fetch_sub(1, std::memory_order_release);
 }
 
+void DBImpl::AbortAllCompactions() {
+  InstrumentedMutexLock l(&mutex_);
+
+  // Increment the abort counter to signal all compactions to abort
+  compaction_aborted_.fetch_add(1, std::memory_order_release);
+
+  TEST_SYNC_POINT("DBImpl::AbortAllCompactions:FlagSet");
+
+  // Mark all manual compactions as canceled
+  for (const auto& manual_compaction : manual_compaction_dequeue_) {
+    manual_compaction->canceled = true;
+  }
+
+  // Wake up any waiting compaction threads to check the abort signal
+  bg_cv_.SignalAll();
+
+  // Wait for all running compactions (both manual and automatic) to finish
+  // or abort before returning.
+  // Note: bg_cv_.Wait() releases the mutex while waiting, so other threads
+  // can make progress and signal when compactions complete.
+  while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 ||
+         HasPendingManualCompaction()) {
+    bg_cv_.Wait();
+  }
+}
+
+void DBImpl::ResumeAllCompactions() {
+  InstrumentedMutexLock l(&mutex_);
+  int before = compaction_aborted_.load(std::memory_order_acquire);
+
+  // Guard against calling Resume without prior Abort
+  if (before <= 0) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "ResumeAllCompactions called without prior "
+                   "AbortAllCompactions (counter=%d)",
+                   before);
+    return;
+  }
+
+  // Decrement the abort counter
+  compaction_aborted_.fetch_sub(1, std::memory_order_release);
+
+  // As the operation is executed under db mutex, we could just use before value
+  // to calculate the current value.
+  int current = before - 1;
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "ResumeAllCompactions: counter %d -> %d", before, current);
+
+  // If this is the last resume call (abort counter back to 0), schedule
+  // compactions that may have been waiting
+  if (current == 0) {
+    MaybeScheduleFlushOrCompaction();
+  }
+}
+
 void DBImpl::MaybeScheduleFlushOrCompaction() {
   mutex_.AssertHeld();
   TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Start");
@@ -2895,6 +3082,9 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
   if (bg_compaction_paused_ > 0) {
     // we paused the background compaction
     return;
+  } else if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+    // we are aborting all compactions
+    return;
   } else if (error_handler_.IsBGWorkStopped()) {
     // Compaction is not part of the recovery sequence from a hard error. We
     // might get here because recovery might do a flush and install a new
@@ -3404,6 +3594,10 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
   bool made_progress = false;
   JobContext job_context(next_job_id_.fetch_add(1), true);
   TEST_SYNC_POINT("BackgroundCallCompaction:0");
+  if (bg_thread_pri == Env::Priority::BOTTOM) {
+    TEST_SYNC_POINT("BackgroundCallCompaction:0:BottomPri");
+  }
+
   LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
                        immutable_db_options_.info_log.get());
   {
@@ -3428,7 +3622,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
           10000);  // prevent hot loop
       mutex_.Lock();
     } else if (!s.ok() && !s.IsShutdownInProgress() &&
-               !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
+               !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped() &&
+               !s.IsCompactionAborted()) {
       // Wait a little bit before retrying background compaction in
       // case this is an environmental problem and we do not want to
       // chew up resources for failed compactions for the duration of
@@ -3460,6 +3655,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
     // case of a failure). Thus, we force full scan in FindObsoleteFiles()
     FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
                                         !s.IsManualCompactionPaused() &&
+                                        !s.IsCompactionAborted() &&
                                         !s.IsColumnFamilyDropped() &&
                                         !s.IsBusy());
     TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles");
@@ -3564,6 +3760,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
   if (!error_handler_.IsBGWorkStopped()) {
     if (shutting_down_.load(std::memory_order_acquire)) {
       status = Status::ShutdownInProgress();
+    } else if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+      status = Status::Incomplete(Status::SubCode::kCompactionAborted);
     } else if (is_manual &&
                manual_compaction->canceled.load(std::memory_order_acquire)) {
       status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
@@ -3639,34 +3837,54 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
                  : m->manual_end->DebugString(true).c_str()));
       }
     }
-  } else if (!is_prepicked && !compaction_queue_.empty()) {
+  } else if (ShouldPickCompaction(is_prepicked, prepicked_compaction)) {
+    bool need_repick = is_prepicked && prepicked_compaction->need_repick;
     if (HasExclusiveManualCompaction()) {
-      // Can't compact right now, but try again later
       TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict");
 
-      // Stay in the compaction queue.
-      unscheduled_compactions_++;
+      // TODO(hx235): Resolve conflict between intended
+      // bottom-priority compaction (requiring repick, i.e., need_repick = true)
+      // and exclusive manual compaction by releasing the intended
+      // bottom-priority compaction.
+      if (!need_repick) {
+        // Can't compact right now, but try again later
+        //
+        // Increase `unscheduled_compactions_` directly so we
+        // don't need to
+        // dequeue and enqueue the CFD again in the compaction queue and thus
+        // keep the CFD's position in the queue
+        unscheduled_compactions_++;
 
-      return Status::OK();
+        return Status::OK();
+      }
     }
 
-    auto cfd = PickCompactionFromQueue(&task_token, log_buffer);
-    if (cfd == nullptr) {
-      // Can't find any executable task from the compaction queue.
-      // All tasks have been throttled by compaction thread limiter.
-      ++unscheduled_compactions_;
-      return Status::Busy();
-    }
+    ColumnFamilyData* cfd = nullptr;
+
+    if (!need_repick) {
+      cfd = PickCompactionFromQueue(&task_token, log_buffer);
+      if (cfd == nullptr) {
+        // Can't find any executable task from the compaction queue.
+        // All tasks have been throttled by compaction thread limiter.
+        ++unscheduled_compactions_;
+        return Status::Busy();
+      }
 
-    // We unreference here because the following code will take a Ref() on
-    // this cfd if it is going to use it (Compaction class holds a
-    // reference).
-    // This will all happen under a mutex so we don't have to be afraid of
-    // somebody else deleting it.
-    if (cfd->UnrefAndTryDelete()) {
-      // This was the last reference of the column family, so no need to
-      // compact.
-      return Status::OK();
+      // We unreference here because the following code will take a Ref() on
+      // this cfd if it is going to use it (Compaction class holds a
+      // reference).
+      // This will all happen under a mutex so we don't have to be afraid of
+      // somebody else deleting it.
+      if (cfd->UnrefAndTryDelete()) {
+        // This was the last reference of the column family, so no need to
+        // compact.
+        return Status::OK();
+      }
+    } else {
+      cfd = c->column_family_data();
+      assert(cfd);
+      ResetBottomPriCompactionIntent(cfd, c);
+      assert(c == nullptr);
     }
 
     // Pick up latest mutable CF Options and use it throughout the
@@ -3680,21 +3898,24 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
       // compaction is not necessary. Need to make sure mutex is held
       // until we make a copy in the following code
       TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
-      SnapshotChecker* snapshot_checker = nullptr;
-      std::vector<SequenceNumber> snapshot_seqs;
       // This info is not useful for other scenarios, so save querying existing
       // snapshots for those cases.
       if (cfd->ioptions().compaction_style == kCompactionStyleUniversal &&
           cfd->user_comparator()->timestamp_size() == 0) {
-        SequenceNumber earliest_write_conflict_snapshot;
-        GetSnapshotContext(job_context, &snapshot_seqs,
-                           &earliest_write_conflict_snapshot,
-                           &snapshot_checker);
+        InitSnapshotContext(job_context);
         assert(is_snapshot_supported_ || snapshots_.empty());
       }
-      c.reset(cfd->PickCompaction(mutable_cf_options, mutable_db_options_,
-                                  snapshot_seqs, snapshot_checker, log_buffer));
-      TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
+      c.reset(cfd->PickCompaction(
+          mutable_cf_options, mutable_db_options_, job_context->snapshot_seqs,
+          job_context->snapshot_checker, log_buffer,
+          thread_pri == Env::Priority::BOTTOM /* require_max_output_level */));
+      if (thread_pri == Env::Priority::LOW) {
+        TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
+      } else if (thread_pri == Env::Priority::BOTTOM) {
+        TEST_SYNC_POINT_CALLBACK(
+            "DBImpl::BackgroundCompaction():AfterPickCompactionBottomPri",
+            c.get());
+      }
 
       if (c != nullptr) {
         bool enough_room = EnoughRoomForCompaction(
@@ -3707,8 +3928,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
               ->current()
               ->storage_info()
               ->ComputeCompactionScore(c->immutable_options(),
-                                       c->mutable_cf_options());
-          AddToCompactionQueue(cfd);
+                                       c->mutable_cf_options(),
+                                       cfd->GetFullHistoryTsLow());
+          EnqueuePendingCompaction(cfd);
 
           c.reset();
           // Don't need to sleep here, because BackgroundCallCompaction
@@ -3730,16 +3952,21 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           // options take effect.
           // 3) When we Pick a new compaction, we "remove" those files being
           // compacted from the calculation, which then influences compaction
-          // score. Here we check if we need the new compaction even without the
-          // files that are currently being compacted. If we need another
-          // compaction, we might be able to execute it in parallel, so we add
-          // it to the queue and schedule a new thread.
-          if (cfd->NeedsCompaction()) {
-            // Yes, we need more compactions!
-            AddToCompactionQueue(cfd);
-            MaybeScheduleFlushOrCompaction();
-          }
+          // score. Inside EnqueuePendingCompaction(),  we check if we need
+          // the new compaction even without the files that are currently being
+          // compacted. If we need another compaction, we might be able to
+          // execute it in parallel, so we add it to the queue and schedule a
+          // new thread.
+          EnqueuePendingCompaction(cfd);
+          MaybeScheduleFlushOrCompaction();
         }
+      } else if (is_prepicked) {
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Pre-picked compaction repicked files for compaction as "
+            "required, "
+            "but upon re-evaluation, no compaction was found necessary \n",
+            cfd->GetName().c_str());
       }
     }
   }
@@ -3781,11 +4008,253 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
                      c->column_family_data()->GetName().c_str(),
                      c->num_input_files(0));
     if (status.ok() && io_s.ok()) {
-      UpdateDeletionCompactionStats(c);
+      UpdateFIFOCompactionStatus(c);
     }
     *made_progress = true;
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
                              c->column_family_data());
+  } else if (c->is_trivial_copy_compaction()) {
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::BackgroundCompaction:TriviaCopyBeforeCompaction",
+        c->column_family_data());
+    assert(c->num_input_files(1) == 0);
+    assert(c->column_family_data()->ioptions().compaction_style ==
+           kCompactionStyleFIFO);
+    assert(c->compaction_reason() == CompactionReason::kChangeTemperature);
+
+    compaction_job_stats.num_input_files = c->num_input_files(0);
+
+    NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+                            compaction_job_stats, job_context->job_id);
+
+    std::vector<FileMetaData> out_files;
+    for (const auto& in_file : *c->inputs(0)) {
+      const uint64_t out_file_number = versions_->NewFileNumber();
+      const std::string in_fname =
+          TableFileName(c->immutable_options().cf_paths,
+                        in_file->fd.GetNumber(), in_file->fd.GetPathId());
+      const std::string out_fname =
+          TableFileName(c->immutable_options().cf_paths, out_file_number,
+                        c->output_path_id());
+
+      // TODO (mikechuang): Currently skip calling
+      // EventHelpers::NotifyTableFileCreationStarted for the trivial copy.
+      // Since it's a trivial copy we should ideally use the exact
+      // TableProperties from the input file but that will break some existing
+      // stress tests. For now skip the listener call for the FIFO
+      // kChangeTemperature trivial copy move.
+
+      int64_t tmp_current_time = 0;
+      auto get_time_status =
+          immutable_db_options_.clock->GetCurrentTime(&tmp_current_time);
+      if (!get_time_status.ok()) {
+        ROCKS_LOG_BUFFER(log_buffer,
+                         "[%s] WARNING: Failed to get current time %s "
+                         "status=%s",
+                         c->column_family_data()->GetName().c_str(),
+                         get_time_status.ToString().c_str());
+      }
+      uint64_t out_file_creation_time = static_cast<uint64_t>(tmp_current_time);
+
+      FileOptions copied_file_options = file_options_;
+      copied_file_options.temperature = c->GetOutputTemperature();
+      std::unique_ptr<WritableFileWriter> dest_writer;
+      {
+        std::unique_ptr<FSWritableFile> dest_file;
+        IOStatus writable_file_io_status =
+            immutable_db_options_.fs.get()->NewWritableFile(
+                out_fname, copied_file_options, &dest_file, nullptr /* dbg */);
+        TEST_SYNC_POINT_CALLBACK(
+            "NewWritableFile::FileOptions.temperature",
+            const_cast<Temperature*>(&copied_file_options.temperature));
+        if (!writable_file_io_status.ok()) {
+          io_s = writable_file_io_status;
+          ROCKS_LOG_BUFFER(
+              log_buffer,
+              "[%s] Error: Abort trivial copy compaction, failed to open "
+              "NewWritableFile %s\n"
+              " out_fname=%s, temperature=%s, io_status=%s",
+              c->column_family_data()->GetName().c_str(), out_fname.c_str(),
+              temperature_to_string[c->GetOutputTemperature()].c_str(),
+              io_s.ToString().c_str());
+          break;
+        }
+
+        FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
+        dest_writer.reset(new WritableFileWriter(
+            std::move(dest_file), out_fname, copied_file_options,
+            immutable_db_options_.clock, io_tracer_,
+            immutable_db_options_.stats, Histograms::SST_WRITE_MICROS,
+            c->immutable_options().listeners,
+            immutable_db_options_.file_checksum_gen_factory.get(),
+            tmp_set.Contains(FileType::kTableFile), false));
+      }
+
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Started copying from: %s\n"
+          " temperature=%s, to: %s, temperature=%s, buffer_size=%" PRIu64,
+          c->column_family_data()->GetName().c_str(), in_fname.c_str(),
+          temperature_to_string[in_file->temperature].c_str(),
+          out_fname.c_str(),
+          temperature_to_string[c->GetOutputTemperature()].c_str(),
+          c->mutable_cf_options()
+              .compaction_options_fifo.trivial_copy_buffer_size);
+      // Add IO_LOW HINT for compaction
+      IOOptions copy_files_compaction_io_options;
+      copy_files_compaction_io_options.rate_limiter_priority =
+          Env::IOPriority::IO_LOW;
+      copy_files_compaction_io_options.type = IOType::kData;
+      copy_files_compaction_io_options.io_activity =
+          Env::IOActivity::kCompaction;
+
+      IOStatus copy_file_io_status = CopyFile(
+          immutable_db_options_.fs.get() /* fileSystem */,
+          in_fname /* source */, in_file->temperature /* src_temp_hint */,
+          dest_writer /* dest_writer */, 0 /* size */, true /* use_fsync */,
+          io_tracer_ /* io_tracer*/,
+          c->mutable_cf_options()
+              .compaction_options_fifo
+              .trivial_copy_buffer_size /* max_read_buffer_size
+                                         */
+          ,
+          copy_files_compaction_io_options /* readIOOptions */,
+          copy_files_compaction_io_options /* writeIOOptions */);
+      if (dest_writer) {
+        IOOptions close_files_compaction_io_options;
+        close_files_compaction_io_options.rate_limiter_priority =
+            Env::IOPriority::IO_LOW;
+        close_files_compaction_io_options.type = IOType::kData;
+        close_files_compaction_io_options.io_activity =
+            Env::IOActivity::kCompaction;
+        // Close the dest_write
+        io_s = dest_writer->Close(close_files_compaction_io_options);
+        if (!io_s.ok()) {
+          ROCKS_LOG_BUFFER(
+              log_buffer,
+              "[%s] Failed to close the writer. Failed to copy from: %s\n"
+              " temperature=%s, to=%s, temperature=%s, io_status=%s",
+              c->column_family_data()->GetName().c_str(), in_fname.c_str(),
+              temperature_to_string[in_file->temperature].c_str(),
+              out_fname.c_str(),
+              temperature_to_string[c->GetOutputTemperature()].c_str(),
+              io_s.ToString().c_str());
+          break;
+        }
+      }
+
+      io_s = copy_file_io_status;
+
+      if (!io_s.ok()) {
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Failed to copy from: %s\n"
+            " temperature=%s, to=%s, temperature=%s, io_status=%s",
+            c->column_family_data()->GetName().c_str(), in_fname.c_str(),
+            temperature_to_string[in_file->temperature].c_str(),
+            out_fname.c_str(),
+            temperature_to_string[c->GetOutputTemperature()].c_str(),
+            io_s.ToString().c_str());
+        break;
+      }
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] Successfully copying from: %s\n"
+                       " temperature=%s, to=%s, temperature=%s, io_status=%s",
+                       c->column_family_data()->GetName().c_str(),
+                       in_fname.c_str(),
+                       temperature_to_string[in_file->temperature].c_str(),
+                       out_fname.c_str(),
+                       temperature_to_string[c->GetOutputTemperature()].c_str(),
+                       io_s.ToString().c_str());
+
+      FileMetaData out_file_metadata{
+          out_file_number,
+          c->output_path_id(),
+          in_file->fd.GetFileSize(),
+          in_file->smallest,
+          in_file->largest,
+          in_file->fd.smallest_seqno,
+          in_file->fd.largest_seqno,
+          false /* marked_for_compact */,
+          c->GetOutputTemperature() /* temperature */,
+          in_file->oldest_blob_file_number,
+          in_file->oldest_ancester_time,
+          out_file_creation_time,
+          c->MinInputFileEpochNumber(),
+          dest_writer->GetFileChecksum(),
+          dest_writer->GetFileChecksumFuncName(),
+          in_file->unique_id,
+          in_file->compensated_range_deletion_size,
+          in_file->tail_size,
+          in_file->user_defined_timestamps_persisted,
+          in_file->min_timestamp,
+          in_file->max_timestamp};
+
+      out_files.push_back(std::move(out_file_metadata));
+    }
+
+    // Update version set
+    if (status.ok() && io_s.ok()) {
+      // NOTE: ChangeTemperature should only copy one file at one file
+      // hence *c->inputs(0) == out_files.size() == 1 if copy succeeded
+      assert(c->inputs(0)->size() == 1);
+      assert(out_files.size() == 1);
+
+      auto out_file_metadata_it = out_files.begin();
+      for (const auto& in_file : *c->inputs(0)) {
+        if (out_file_metadata_it == out_files.end()) {
+          break;
+        }
+
+        c->edit()->DeleteFile(c->level(), in_file->fd.GetNumber());
+        c->edit()->AddFile(c->level(), *out_file_metadata_it);
+        ++out_file_metadata_it;
+      }
+
+      status = versions_->LogAndApply(
+          c->column_family_data(), read_options, write_options, c->edit(),
+          &mutex_, directories_.GetDbDir(),
+          /*new_descriptor_log=*/false, /*column_family_options=*/nullptr,
+          [&c, &compaction_released](const Status& s) {
+            c->ReleaseCompactionFiles(s);
+            compaction_released = true;
+          });
+    }
+
+    // TODO (mikechuang): Currently skip calling
+    // EventHelper::LogAndNotifyTableFileCreationFinished for the trivial copy.
+    // Since it's a trivial copy we should ideally use the exact TableProperties
+    // from the input file but that will break some existing stress tests. For
+    // now skip the listener call for the FIFO kChangeTemperature trivial copy
+    // move.
+
+    if (io_s.ok()) {
+      io_s = versions_->io_status();
+    }
+
+    InstallSuperVersionAndScheduleWork(
+        c->column_family_data(), job_context->superversion_contexts.data());
+    if (status.ok() && io_s.ok()) {
+      UpdateFIFOCompactionStatus(c);
+    } else {
+      for (const auto& in_file : *c->inputs(0)) {
+        const std::string in_fname =
+            TableFileName(c->immutable_options().cf_paths,
+                          in_file->fd.GetNumber(), in_file->fd.GetPathId());
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Failed to do trvial copy compaction: %s"
+            " temperature=%s, to temperature=%s, status=%s, io_status=%s",
+            c->column_family_data()->GetName().c_str(), in_fname.c_str(),
+            temperature_to_string[in_file->temperature].c_str(),
+            temperature_to_string[c->GetOutputTemperature()].c_str(),
+            status.ToString().c_str(), io_s.ToString().c_str());
+      }
+    }
+    *made_progress = true;
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::BackgroundCompaction:TriviaCopyAfterCompaction",
+        c->column_family_data());
   } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
     TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
@@ -3798,39 +4267,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     compaction_job_stats.num_input_files = c->num_input_files(0);
     // Trivial moves do not get compacted remotely
     compaction_job_stats.is_remote_compaction = false;
+    compaction_job_stats.num_input_files_trivially_moved =
+        compaction_job_stats.num_input_files;
 
     NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
                             compaction_job_stats, job_context->job_id);
 
-    // Move files to next level
-    int32_t moved_files = 0;
-    int64_t moved_bytes = 0;
-    for (unsigned int l = 0; l < c->num_input_levels(); l++) {
-      if (c->level(l) == c->output_level()) {
-        continue;
-      }
-      for (size_t i = 0; i < c->num_input_files(l); i++) {
-        FileMetaData* f = c->input(l, i);
-        c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
-        c->edit()->AddFile(
-            c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
-            f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno,
-            f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
-            f->oldest_blob_file_number, f->oldest_ancester_time,
-            f->file_creation_time, f->epoch_number, f->file_checksum,
-            f->file_checksum_func_name, f->unique_id,
-            f->compensated_range_deletion_size, f->tail_size,
-            f->user_defined_timestamps_persisted);
-
-        ROCKS_LOG_BUFFER(
-            log_buffer,
-            "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
-            c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
-            c->output_level(), f->fd.GetFileSize());
-        ++moved_files;
-        moved_bytes += f->fd.GetFileSize();
-      }
-    }
     if (c->compaction_reason() == CompactionReason::kLevelMaxLevelSize &&
         c->immutable_options().compaction_pri == kRoundRobin) {
       int start_level = c->start_level();
@@ -3841,14 +4283,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
             vstorage->GetNextCompactCursor(start_level, c->num_input_files(0)));
       }
     }
-    status = versions_->LogAndApply(
-        c->column_family_data(), read_options, write_options, c->edit(),
-        &mutex_, directories_.GetDbDir(),
-        /*new_descriptor_log=*/false, /*column_family_options=*/nullptr,
-        [&c, &compaction_released](const Status& s) {
-          c->ReleaseCompactionFiles(s);
-          compaction_released = true;
-        });
+
+    // Perform the trivial move
+    size_t moved_files = 0;
+    size_t moved_bytes = 0;
+    status = PerformTrivialMove(*c.get(), log_buffer, compaction_released,
+                                moved_files, moved_bytes);
     io_s = versions_->io_status();
     InstallSuperVersionAndScheduleWork(
         c->column_family_data(), job_context->superversion_contexts.data());
@@ -3863,8 +4303,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           << "total_files_size" << moved_bytes;
     }
     ROCKS_LOG_BUFFER(
-        log_buffer,
-        "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+        log_buffer, "[%s] Moved #%d files to level-%zu %zu bytes %s: %s\n",
         c->column_family_data()->GetName().c_str(), moved_files,
         c->output_level(), moved_bytes, status.ToString().c_str(),
         c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
@@ -3874,14 +4313,17 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     ThreadStatusUtil::ResetThreadStatus();
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
                              c->column_family_data());
-  } else if (!is_prepicked && c->output_level() > 0 &&
-             c->output_level() ==
+  } else if (!is_prepicked &&
+             Compaction::OutputToNonZeroMaxOutputLevel(
+                 c->output_level(),
                  c->column_family_data()
                      ->current()
                      ->storage_info()
                      ->MaxOutputLevel(
-                         immutable_db_options_.allow_ingest_behind) &&
+                         c->immutable_options().cf_allow_ingest_behind ||
+                         immutable_db_options_.allow_ingest_behind)) &&
              env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+    assert(thread_pri == Env::Priority::LOW);
     // Forward compactions involving last level to the bottom pool if it exists,
     // such that compactions unlikely to contribute to write stalls can be
     // delayed or deprioritized.
@@ -3890,7 +4332,23 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     ca->db = this;
     ca->compaction_pri_ = Env::Priority::BOTTOM;
     ca->prepicked_compaction = new PrepickedCompaction;
-    ca->prepicked_compaction->compaction = c.release();
+
+    // If `universal_reduce_file_locking` is true, we only lock a limited set of
+    // input files by creating an intended compaction to forward to bottom
+    // priority pool and repicking files when bottom priority thread
+    // gets to execute this intended compaction
+    const bool need_repick =
+        c->mutable_cf_options()
+            .compaction_options_universal.reduce_file_locking;
+    if (need_repick) {
+      ca->prepicked_compaction->compaction =
+          CreateIntendedCompactionForwardedToBottomPriorityPool(c.get());
+      c.reset();
+      ca->prepicked_compaction->need_repick = true;
+    } else {
+      ca->prepicked_compaction->compaction = c.release();
+      ca->prepicked_compaction->need_repick = false;
+    }
     ca->prepicked_compaction->manual_compaction_state = nullptr;
     // Transfer requested token, so it doesn't need to do it again.
     ca->prepicked_compaction->task_token = std::move(task_token);
@@ -3905,11 +4363,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     output_level = c->output_level();
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
                              &output_level);
-    std::vector<SequenceNumber> snapshot_seqs;
-    SequenceNumber earliest_write_conflict_snapshot;
-    SnapshotChecker* snapshot_checker;
-    GetSnapshotContext(job_context, &snapshot_seqs,
-                       &earliest_write_conflict_snapshot, &snapshot_checker);
+    InitSnapshotContext(job_context);
     assert(is_snapshot_supported_ || snapshots_.empty());
 
     CompactionJob compaction_job(
@@ -3918,15 +4372,15 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
         &shutting_down_, log_buffer, directories_.GetDbDir(),
         GetDataDir(c->column_family_data(), c->output_path_id()),
         GetDataDir(c->column_family_data(), 0), stats_, &mutex_,
-        &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot,
-        snapshot_checker, job_context, table_cache_, &event_logger_,
+        &error_handler_, job_context, table_cache_, &event_logger_,
         c->mutable_cf_options().paranoid_file_checks,
         c->mutable_cf_options().report_bg_io_stats, dbname_,
         &compaction_job_stats, thread_pri, io_tracer_,
         is_manual ? manual_compaction->canceled
                   : kManualCompactionCanceledFalse_,
-        db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
-        c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_,
+        compaction_aborted_, db_id_, db_session_id_,
+        c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
+        &blob_callback_, &bg_compaction_scheduled_,
         &bg_bottom_compaction_scheduled_);
     compaction_job.Prepare(std::nullopt /*subcompact to be computed*/);
 
@@ -3939,8 +4393,15 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
                             compaction_job_stats, job_context->job_id);
     mutex_.Unlock();
-    TEST_SYNC_POINT_CALLBACK(
-        "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
+    if (thread_pri == Env::Priority::LOW) {
+      TEST_SYNC_POINT_CALLBACK(
+          "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
+    } else {
+      assert(thread_pri == Env::Priority::BOTTOM);
+      TEST_SYNC_POINT(
+          "DBImpl::BackgroundCompaction:NonTrivial:BeforeRunBottomPri");
+    }
+
     // Should handle error?
     compaction_job.Run().PermitUncheckedError();
     TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
@@ -4002,7 +4463,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
   }
 
   if (status.ok() || status.IsCompactionTooLarge() ||
-      status.IsManualCompactionPaused()) {
+      status.IsManualCompactionPaused() || status.IsCompactionAborted()) {
     // Done
   } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
     // Ignore compaction errors found during shutting down
@@ -4033,10 +4494,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           ->current()
           ->storage_info()
           ->ComputeCompactionScore(c->immutable_options(),
-                                   c->mutable_cf_options());
-      if (!cfd->queued_for_compaction()) {
-        AddToCompactionQueue(cfd);
-      }
+                                   c->mutable_cf_options(),
+                                   cfd->GetFullHistoryTsLow());
+      EnqueuePendingCompaction(cfd);
     }
   }
   // this will unref its input_version and column_family_data
@@ -4081,6 +4541,72 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
   return status;
 }
 
+// Create an intended compaction to forward based on the original picked
+// compaction. It serves two purposes while it is waiting
+// for a bottom-priority thread becomes available to run:
+// - Prevent the last input file (or sorted run if non-L0) from
+// being included in compaction score calculations unnecessarily since the
+// intended compaction is already scheduled to compact it
+// - Allow other input files to be picked by low-priority compactions that can
+// run right away
+//
+// Once a bottom-priority available to run this intended compaction, it will
+// repick files to consider the LSM updates that occurred during the waiting
+// period.
+Compaction* DBImpl::CreateIntendedCompactionForwardedToBottomPriorityPool(
+    Compaction* c) {
+  auto* cfd = c->column_family_data();
+  const auto& io = c->immutable_options();
+  const auto& mo = c->mutable_cf_options();
+  auto* vstorage = c->input_version()->storage_info();
+
+  std::vector<CompactionInputFiles> inputs(1);
+
+  const std::vector<FileMetaData*>* max_intput_level_files = nullptr;
+  int max_intput_level = 0;
+
+  for (size_t i = c->num_input_levels(); i >= 1; --i) {
+    size_t level = i - 1;
+    if (c->num_input_files(level) > 0) {
+      max_intput_level = static_cast<int>(level);
+      max_intput_level_files = c->inputs(level);
+      break;
+    }
+  }
+
+  assert(max_intput_level_files);
+  assert(!max_intput_level_files->empty());
+  inputs[0].level = max_intput_level;
+
+  if (max_intput_level == 0) {
+    // The last input file
+    inputs[0].files.push_back(
+        (*max_intput_level_files)[max_intput_level_files->size() - 1]);
+  } else {
+    // The last input sorted run
+    for (FileMetaData* f : (*max_intput_level_files)) {
+      inputs[0].files.push_back(f);
+    }
+  }
+
+  c->ReleaseCompactionFiles(Status::OK());
+
+  Compaction* intended_compaction =
+      new Compaction(vstorage, io, mo, mutable_db_options_, std::move(inputs),
+                     c->output_level(), c->target_output_file_size(),
+                     c->max_compaction_bytes(), c->output_path_id(),
+                     c->output_compression(), c->output_compression_opts(),
+                     c->GetOutputTemperature(), c->max_subcompactions(),
+                     c->grandparents(), std::nullopt /* earliest_snapshot */,
+                     nullptr /* snapshot_checker */, c->compaction_reason());
+
+  cfd->compaction_picker()->RegisterCompaction(intended_compaction);
+  vstorage->ComputeCompactionScore(io, mo, cfd->GetFullHistoryTsLow());
+  intended_compaction->FinalizeInputInfo(cfd->current());
+
+  return intended_compaction;
+}
+
 bool DBImpl::HasPendingManualCompaction() {
   return (!manual_compaction_dequeue_.empty());
 }
@@ -4169,8 +4695,7 @@ bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
   return false;
 }
 
-void DBImpl::UpdateDeletionCompactionStats(
-    const std::unique_ptr<Compaction>& c) {
+void DBImpl::UpdateFIFOCompactionStatus(const std::unique_ptr<Compaction>& c) {
   if (c == nullptr) {
     return;
   }
@@ -4184,6 +4709,9 @@ void DBImpl::UpdateDeletionCompactionStats(
     case CompactionReason::kFIFOTtl:
       RecordTick(stats_, FIFO_TTL_COMPACTIONS);
       break;
+    case CompactionReason::kChangeTemperature:
+      RecordTick(stats_, FIFO_CHANGE_TEMPERATURE_COMPACTIONS);
+      break;
     default:
       assert(false);
       break;
@@ -4198,6 +4726,7 @@ void DBImpl::BuildCompactionJobInfo(
   compaction_job_info->cf_id = cfd->GetID();
   compaction_job_info->cf_name = cfd->GetName();
   compaction_job_info->status = st;
+  compaction_job_info->aborted = st.IsCompactionAborted();
   compaction_job_info->thread_id = env_->GetThreadID();
   compaction_job_info->job_id = job_id;
   compaction_job_info->base_input_level = c->start_level();
@@ -4273,9 +4802,10 @@ void DBImpl::BuildCompactionJobInfo(
 // for superversion_to_free
 
 void DBImpl::InstallSuperVersionAndScheduleWork(
-    ColumnFamilyData* cfd, SuperVersionContext* sv_context) {
+    ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+    std::optional<std::shared_ptr<SeqnoToTimeMapping>>
+        new_seqno_to_time_mapping) {
   mutex_.AssertHeld();
-  const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
 
   // Update max_total_in_memory_state_
   size_t old_memtable_size = 0;
@@ -4289,7 +4819,8 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
   if (UNLIKELY(sv_context->new_superversion == nullptr)) {
     sv_context->NewSuperVersion();
   }
-  cfd->InstallSuperVersion(sv_context, mutable_cf_options);
+  cfd->InstallSuperVersion(sv_context, &mutex_,
+                           std::move(new_seqno_to_time_mapping));
 
   // There may be a small data race here. The snapshot tricking bottommost
   // compaction may already be released here. But assuming there will always be
@@ -4298,7 +4829,7 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
   bottommost_files_mark_threshold_ = kMaxSequenceNumber;
   standalone_range_deletion_files_mark_threshold_ = kMaxSequenceNumber;
   for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
-    if (!my_cfd->ioptions().allow_ingest_behind) {
+    if (!my_cfd->AllowIngestBehind()) {
       bottommost_files_mark_threshold_ = std::min(
           bottommost_files_mark_threshold_,
           my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
@@ -4316,9 +4847,10 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
   MaybeScheduleFlushOrCompaction();
 
   // Update max_total_in_memory_state_
-  max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size +
-                               mutable_cf_options.write_buffer_size *
-                                   mutable_cf_options.max_write_buffer_number;
+  max_total_in_memory_state_ =
+      max_total_in_memory_state_ - old_memtable_size +
+      cfd->GetLatestMutableCFOptions().write_buffer_size *
+          cfd->GetLatestMutableCFOptions().max_write_buffer_number;
 }
 
 // ShouldPurge is called by FindObsoleteFiles when doing a full scan,
@@ -4347,31 +4879,33 @@ void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {
   snapshot_checker_.reset(snapshot_checker);
 }
 
-void DBImpl::GetSnapshotContext(
-    JobContext* job_context, std::vector<SequenceNumber>* snapshot_seqs,
-    SequenceNumber* earliest_write_conflict_snapshot,
-    SnapshotChecker** snapshot_checker_ptr) {
+void DBImpl::InitSnapshotContext(JobContext* job_context) {
   mutex_.AssertHeld();
   assert(job_context != nullptr);
-  assert(snapshot_seqs != nullptr);
-  assert(earliest_write_conflict_snapshot != nullptr);
-  assert(snapshot_checker_ptr != nullptr);
-
-  *snapshot_checker_ptr = snapshot_checker_.get();
-  if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) {
-    *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance();
+  if (job_context->snapshot_context_initialized) {
+    return;
+  }
+  SnapshotChecker* snapshot_checker = snapshot_checker_.get();
+  if (use_custom_gc_ && !snapshot_checker) {
+    snapshot_checker = DisableGCSnapshotChecker::Instance();
   }
-  if (*snapshot_checker_ptr != nullptr) {
+  std::unique_ptr<ManagedSnapshot> managed_snapshot = nullptr;
+  if (snapshot_checker) {
     // If snapshot_checker is used, that means the flush/compaction may
     // contain values not visible to snapshot taken after
     // flush/compaction job starts. Take a snapshot and it will appear
     // in snapshot_seqs and force compaction iterator to consider such
     // snapshots.
-    const Snapshot* job_snapshot =
-        GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/);
-    job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot));
-  }
-  *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
+    const Snapshot* snapshot =
+        GetSnapshotImpl(/*is_write_conflict_boundary=*/false, /*lock=*/false);
+    managed_snapshot.reset(new ManagedSnapshot(this, snapshot));
+  }
+  SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber;
+  std::vector<SequenceNumber> snapshot_seqs =
+      snapshots_.GetAll(&earliest_write_conflict_snapshot);
+  job_context->InitSnapshotContext(
+      snapshot_checker, std::move(managed_snapshot),
+      earliest_write_conflict_snapshot, std::move(snapshot_seqs));
 }
 
 Status DBImpl::WaitForCompact(
@@ -4430,4 +4964,19 @@ Status DBImpl::WaitForCompact(
   }
 }
 
+bool DBImpl::ShouldPickCompaction(
+    bool is_prepicked, const PrepickedCompaction* prepicked_compaction) {
+  return (!is_prepicked && !compaction_queue_.empty()) ||
+         (is_prepicked && prepicked_compaction->need_repick);
+}
+
+void DBImpl::ResetBottomPriCompactionIntent(ColumnFamilyData* cfd,
+                                            std::unique_ptr<Compaction>& c) {
+  c->ReleaseCompactionFiles(Status::OK());
+  cfd->current()->storage_info()->ComputeCompactionScore(
+      c->immutable_options(), c->mutable_cf_options(),
+      cfd->GetFullHistoryTsLow());
+  c.reset();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc
index 38873b0e3212..138527bb782e 100644
--- a/db/db_impl/db_impl_debug.cc
+++ b/db/db_impl/db_impl_debug.cc
@@ -84,6 +84,7 @@ void DBImpl::TEST_GetFilesMetaData(
 }
 
 uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+  InstrumentedMutexLock l(&mutex_);
   return versions_->manifest_file_number();
 }
 
@@ -224,13 +225,13 @@ void DBImpl::TEST_EndWrite(void* w) {
 }
 
 size_t DBImpl::TEST_LogsToFreeSize() {
-  InstrumentedMutexLock l(&log_write_mutex_);
-  return logs_to_free_.size();
+  InstrumentedMutexLock l(&wal_write_mutex_);
+  return wals_to_free_.size();
 }
 
 uint64_t DBImpl::TEST_LogfileNumber() {
   InstrumentedMutexLock l(&mutex_);
-  return logfile_number_;
+  return cur_wal_number_;
 }
 
 void DBImpl::TEST_GetAllBlockCaches(
@@ -379,10 +380,13 @@ void DBImpl::TEST_VerifyNoObsoleteFilesCached(
     uint64_t file_number;
     GetUnaligned(reinterpret_cast<const uint64_t*>(key.data()), &file_number);
     // Assert file is in live/quarantined set
-    if (live_and_quar_files.find(file_number) == live_and_quar_files.end()) {
+    bool cached_file_is_live_or_quar =
+        live_and_quar_files.find(file_number) != live_and_quar_files.end();
+    if (!cached_file_is_live_or_quar) {
+      // Fail with useful info
       std::cerr << "File " << file_number << " is not live nor quarantined"
                 << std::endl;
-      assert(false);
+      assert(cached_file_is_live_or_quar);
     }
   };
   table_cache_->ApplyToAllEntries(fn, {});
diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc
index 49d583e6623d..bb6a9a2e409c 100644
--- a/db/db_impl/db_impl_experimental.cc
+++ b/db/db_impl/db_impl_experimental.cc
@@ -46,7 +46,8 @@ Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
     // Since we have some more files to compact, we should also recompute
     // compaction score
     vstorage->ComputeCompactionScore(cfd->ioptions(),
-                                     cfd->GetLatestMutableCFOptions());
+                                     cfd->GetLatestMutableCFOptions(),
+                                     cfd->GetFullHistoryTsLow());
     EnqueuePendingCompaction(cfd);
     MaybeScheduleFlushOrCompaction();
   }
@@ -143,7 +144,8 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
                    f->file_creation_time, f->epoch_number, f->file_checksum,
                    f->file_checksum_func_name, f->unique_id,
                    f->compensated_range_deletion_size, f->tail_size,
-                   f->user_defined_timestamps_persisted);
+                   f->user_defined_timestamps_persisted, f->min_timestamp,
+                   f->max_timestamp);
     }
 
     status = versions_->LogAndApply(cfd, read_options, write_options, &edit,
diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc
index c1ef7b96b160..d9d56a1f447b 100644
--- a/db/db_impl/db_impl_files.cc
+++ b/db/db_impl/db_impl_files.cc
@@ -28,7 +28,7 @@ uint64_t DBImpl::MinLogNumberToKeep() {
   return versions_->min_log_number_to_keep();
 }
 
-uint64_t DBImpl::MinLogNumberToRecycle() { return min_log_number_to_recycle_; }
+uint64_t DBImpl::MinLogNumberToRecycle() { return min_wal_number_to_recycle_; }
 
 uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
   mutex_.AssertHeld();
@@ -267,82 +267,85 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
     if (!job_context->HaveSomethingToDelete()) {
       mutex_.AssertHeld();
       --pending_purge_obsolete_files_;
+      if (pending_purge_obsolete_files_ == 0) {
+        bg_cv_.SignalAll();
+      }
     }
   });
 
   // logs_ is empty when called during recovery, in which case there can't yet
   // be any tracked obsolete logs
-  log_write_mutex_.Lock();
+  wal_write_mutex_.Lock();
 
-  if (alive_log_files_.empty() || logs_.empty()) {
+  if (alive_wal_files_.empty() || logs_.empty()) {
     mutex_.AssertHeld();
     // We may reach here if the db is DBImplSecondary
-    log_write_mutex_.Unlock();
+    wal_write_mutex_.Unlock();
     return;
   }
 
   bool mutex_unlocked = false;
-  if (!alive_log_files_.empty() && !logs_.empty()) {
+  if (!alive_wal_files_.empty() && !logs_.empty()) {
     uint64_t min_log_number = job_context->log_number;
-    size_t num_alive_log_files = alive_log_files_.size();
+    size_t num_alive_wal_files = alive_wal_files_.size();
     // find newly obsoleted log files
-    while (alive_log_files_.begin()->number < min_log_number) {
-      auto& earliest = *alive_log_files_.begin();
+    while (alive_wal_files_.begin()->number < min_log_number) {
+      auto& earliest = *alive_wal_files_.begin();
       if (immutable_db_options_.recycle_log_file_num >
-              log_recycle_files_.size() &&
+              wal_recycle_files_.size() &&
           earliest.number >= MinLogNumberToRecycle()) {
         ROCKS_LOG_INFO(immutable_db_options_.info_log,
                        "adding log %" PRIu64 " to recycle list\n",
                        earliest.number);
-        log_recycle_files_.push_back(earliest.number);
+        wal_recycle_files_.push_back(earliest.number);
       } else {
         job_context->log_delete_files.push_back(earliest.number);
       }
       if (job_context->size_log_to_delete == 0) {
-        job_context->prev_total_log_size = total_log_size_;
-        job_context->num_alive_log_files = num_alive_log_files;
+        job_context->prev_wals_total_size = wals_total_size_.LoadRelaxed();
+        job_context->num_alive_wal_files = num_alive_wal_files;
       }
       job_context->size_log_to_delete += earliest.size;
-      total_log_size_ -= earliest.size;
-      alive_log_files_.pop_front();
+      wals_total_size_.FetchSubRelaxed(earliest.size);
+      alive_wal_files_.pop_front();
 
       // Current log should always stay alive since it can't have
       // number < MinLogNumber().
-      assert(alive_log_files_.size());
+      assert(alive_wal_files_.size());
     }
-    log_write_mutex_.Unlock();
+    wal_write_mutex_.Unlock();
     mutex_.Unlock();
     mutex_unlocked = true;
     TEST_SYNC_POINT_CALLBACK("FindObsoleteFiles::PostMutexUnlock", nullptr);
-    log_write_mutex_.Lock();
+    wal_write_mutex_.Lock();
     while (!logs_.empty() && logs_.front().number < min_log_number) {
       auto& log = logs_.front();
       if (log.IsSyncing()) {
-        log_sync_cv_.Wait();
+        wal_sync_cv_.Wait();
         // logs_ could have changed while we were waiting.
         continue;
       }
       // This WAL file is not live, so it's OK if we never sync the rest of it.
       // If it's already closed, then it's been fully synced. If
       // !background_close_inactive_wals then we need to Close it before
-      // removing from logs_ but not blocking while holding log_write_mutex_.
+      // removing from logs_ but not blocking while holding wal_write_mutex_.
       if (!immutable_db_options_.background_close_inactive_wals &&
           log.writer->file()) {
         // We are taking ownership of and pinning the front entry, so we can
         // expect it to be the same after releasing and re-acquiring the lock
         log.PrepareForSync();
-        log_write_mutex_.Unlock();
+        wal_write_mutex_.Unlock();
         // TODO: maybe check the return value of Close.
         // TODO: plumb Env::IOActivity, Env::IOPriority
         auto s = log.writer->file()->Close({});
         s.PermitUncheckedError();
-        log_write_mutex_.Lock();
+        wal_write_mutex_.Lock();
         log.writer->PublishIfClosed();
         assert(&log == &logs_.front());
         log.FinishSync();
-        log_sync_cv_.SignalAll();
+        wal_sync_cv_.SignalAll();
       }
-      logs_to_free_.push_back(log.ReleaseWriter());
+      wals_to_free_.push_back(log.ReleaseWriter());
       logs_.pop_front();
     }
     // Current log cannot be obsolete.
@@ -350,16 +353,16 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
   }
 
   // We're just cleaning up for DB::Write().
-  assert(job_context->logs_to_free.empty());
-  job_context->logs_to_free = logs_to_free_;
+  assert(job_context->wals_to_free.empty());
+  job_context->wals_to_free = wals_to_free_;
 
-  logs_to_free_.clear();
-  log_write_mutex_.Unlock();
+  wals_to_free_.clear();
+  wal_write_mutex_.Unlock();
   if (mutex_unlocked) {
     mutex_.Lock();
   }
-  job_context->log_recycle_files.assign(log_recycle_files_.begin(),
-                                        log_recycle_files_.end());
+  job_context->log_recycle_files.assign(wal_recycle_files_.begin(),
+                                        wal_recycle_files_.end());
 }
 
 // Delete obsolete files and log status and information of file deletion
@@ -368,6 +371,7 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
                                     FileType type, uint64_t number) {
   TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl::BeforeDeletion",
                            const_cast<std::string*>(&fname));
+  IGNORE_STATUS_IF_ERROR(Status::IOError());
 
   Status file_deletion_status;
   if (type == kTableFile || type == kBlobFile || type == kWalFile) {
@@ -423,12 +427,14 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
   // FindObsoleteFiles() should've populated this so nonzero
   assert(state.manifest_file_number != 0);
 
+  IGNORE_STATUS_IF_ERROR(Status::IOError());
+
   // Now, convert lists to unordered sets, WITHOUT mutex held; set is slow.
   std::unordered_set<uint64_t> sst_live_set(state.sst_live.begin(),
                                             state.sst_live.end());
   std::unordered_set<uint64_t> blob_live_set(state.blob_live.begin(),
                                              state.blob_live.end());
-  std::unordered_set<uint64_t> log_recycle_files_set(
+  std::unordered_set<uint64_t> wal_recycle_files_set(
       state.log_recycle_files.begin(), state.log_recycle_files.end());
   std::unordered_set<uint64_t> quarantine_files_set(
       state.files_to_quarantine.begin(), state.files_to_quarantine.end());
@@ -488,13 +494,13 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
       std::unique(candidate_files.begin(), candidate_files.end()),
       candidate_files.end());
 
-  if (state.prev_total_log_size > 0) {
+  if (state.prev_wals_total_size > 0) {
     ROCKS_LOG_INFO(immutable_db_options_.info_log,
                    "[JOB %d] Try to delete WAL files size %" PRIu64
                    ", prev total WAL file size %" PRIu64
                    ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
                    state.job_id, state.size_log_to_delete,
-                   state.prev_total_log_size, state.num_alive_log_files);
+                   state.prev_wals_total_size, state.num_alive_wal_files);
   }
 
   std::vector<std::string> old_info_log_files;
@@ -529,7 +535,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
   optsfile_num2 = std::min(optsfile_num2, state.min_options_file_number);
 
   // Close WALs before trying to delete them.
-  for (const auto w : state.logs_to_free) {
+  for (const auto w : state.wals_to_free) {
     // TODO: maybe check the return value of Close.
     // TODO: plumb Env::IOActivity, Env::IOPriority
     auto s = w->Close({});
@@ -556,8 +562,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
       case kWalFile:
         keep = ((number >= state.log_number) ||
                 (number == state.prev_log_number) ||
-                (log_recycle_files_set.find(number) !=
-                 log_recycle_files_set.end()));
+                (wal_recycle_files_set.find(number) !=
+                 wal_recycle_files_set.end()));
         break;
       case kDescriptorFile:
         // Keep my manifest file, and any newer incarnations'
@@ -611,6 +617,11 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
       case kOptionsFile:
         keep = (number >= optsfile_num2);
         break;
+      case kCompactionProgressFile:
+        // Keep compaction progress files - they are managed
+        // separately by DBImplSecondary for now
+        keep = true;
+        break;
       case kCurrentFile:
       case kDBLockFile:
       case kIdentityFile:
diff --git a/db/db_impl/db_impl_follower.cc b/db/db_impl/db_impl_follower.cc
index 90c4326ceb15..1262c5bdfdb6 100644
--- a/db/db_impl/db_impl_follower.cc
+++ b/db/db_impl/db_impl_follower.cc
@@ -70,9 +70,6 @@ Status DBImplFollower::Recover(
     }
     return s;
   }
-  if (immutable_db_options_.paranoid_checks && s.ok()) {
-    s = CheckConsistency();
-  }
   if (s.ok()) {
     default_cf_handle_ = new ColumnFamilyHandleImpl(
         versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
@@ -296,9 +293,9 @@ Status DB::OpenAsFollower(
   DBImplFollower* impl =
       new DBImplFollower(tmp_opts, std::move(new_env), dbname, src_path);
   impl->versions_.reset(new ReactiveVersionSet(
-      dbname, &impl->immutable_db_options_, impl->file_options_,
-      impl->table_cache_.get(), impl->write_buffer_manager_,
-      &impl->write_controller_, impl->io_tracer_));
+      dbname, &impl->immutable_db_options_, impl->mutable_db_options_,
+      impl->file_options_, impl->table_cache_.get(),
+      impl->write_buffer_manager_, &impl->write_controller_, impl->io_tracer_));
   impl->column_family_memtables_.reset(
       new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
   impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 577a861dcca6..7b2e949789fc 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -35,8 +35,8 @@ Options SanitizeOptions(const std::string& dbname, const Options& src,
   auto db_options =
       SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s);
   ImmutableDBOptions immutable_db_options(db_options);
-  auto cf_options =
-      SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
+  auto cf_options = SanitizeCfOptions(immutable_db_options, read_only,
+                                      ColumnFamilyOptions(src));
   return Options(db_options, cf_options);
 }
 
@@ -191,12 +191,6 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
                    "wal_compression is disabled since only zstd is supported");
   }
 
-  if (!result.paranoid_checks) {
-    result.skip_checking_sst_file_sizes_on_db_open = true;
-    ROCKS_LOG_INFO(result.info_log,
-                   "file size check will be skipped during open.");
-  }
-
   return result;
 }
 
@@ -224,6 +218,12 @@ Status DBImpl::ValidateOptions(
     if (!s.ok()) {
       return s;
     }
+    if (cfd.name == kDefaultColumnFamilyName) {
+      if (cfd.options.disallow_memtable_writes) {
+        return Status::InvalidArgument(
+            "Default column family cannot use disallow_memtable_writes=true");
+      }
+    }
   }
   s = ValidateOptions(db_options);
   return s;
@@ -329,7 +329,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
     }
     FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
     file->SetPreallocationBlockSize(
-        immutable_db_options_.manifest_preallocation_size);
+        mutable_db_options_.manifest_preallocation_size);
     std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
         std::move(file), manifest, file_options, immutable_db_options_.clock,
         io_tracer_, nullptr /* stats */,
@@ -599,7 +599,7 @@ Status DBImpl::Recover(
         // allow_ingest_behind does not support Level Compaction,
         // and per_key_placement can have infinite compaction loop for Level
         // Compaction. Adjust to_level here just to be safe.
-        if (cfd->ioptions().allow_ingest_behind ||
+        if (cfd->AllowIngestBehind() ||
             moptions.preclude_last_level_data_seconds > 0) {
           to_level -= 1;
         }
@@ -657,7 +657,8 @@ Status DBImpl::Recover(
                            f->file_creation_time, f->epoch_number,
                            f->file_checksum, f->file_checksum_func_name,
                            f->unique_id, f->compensated_range_deletion_size,
-                           f->tail_size, f->user_defined_timestamps_persisted);
+                           f->tail_size, f->user_defined_timestamps_persisted,
+                           f->min_timestamp, f->max_timestamp);
               ROCKS_LOG_WARN(immutable_db_options_.info_log,
                              "[%s] Moving #%" PRIu64
                              " from from_level-%d to from_level-%d %" PRIu64
@@ -688,9 +689,6 @@ Status DBImpl::Recover(
     s = MaybeUpdateNextFileNumber(recovery_ctx);
   }
 
-  if (immutable_db_options_.paranoid_checks && s.ok()) {
-    s = CheckConsistency();
-  }
   if (s.ok() && !read_only) {
     // TODO: share file descriptors (FSDirectory) with SetDirectories above
     std::map<std::string, std::shared_ptr<FSDirectory>> created_dirs;
@@ -1113,7 +1111,7 @@ void DBOpenLogRecordReadReporter::Corruption(size_t bytes, const Status& s,
                  static_cast<int>(bytes), s.ToString().c_str());
   if (status != nullptr && status->ok()) {
     *status = s;
-    corrupted_log_number_ = log_number;
+    corrupted_wal_number_ = log_number;
   }
 }
 
@@ -1197,6 +1195,13 @@ Status DBImpl::ProcessLogFiles(
   PredecessorWALInfo predecessor_wal_info;
 
   for (auto wal_number : wal_numbers) {
+    // Detecting early break on the next iteration after `wal_number` has been
+    // advanced since this `wal_number` doesn't affect follow-up handling after
+    // breaking out of the for loop.
+    if (!status.ok()) {
+      break;
+    }
+    SequenceNumber prev_next_sequence = *next_sequence;
     if (status.ok()) {
       status = ProcessLogFile(
           wal_number, min_wal_number, is_retry, read_only, job_id,
@@ -1204,6 +1209,10 @@ Status DBImpl::ProcessLogFiles(
           &stop_replay_by_wal_filter, &corrupted_wal_number,
           corrupted_wal_found, version_edits, &flushed, predecessor_wal_info);
     }
+    if (status.ok()) {
+      status = CheckSeqnoNotSetBackDuringRecovery(prev_next_sequence,
+                                                  *next_sequence);
+    }
   }
 
   if (status.ok()) {
@@ -1311,6 +1320,7 @@ Status DBImpl::ProcessLogFile(
     }
 
     // FIXME(hx235): consolidate `process_status` and `status`
+    SequenceNumber prev_next_sequence = *next_sequence;
     Status process_status = ProcessLogRecord(
         record, reader, running_ts_sz, wal_number, fname, read_only, job_id,
         logFileDropped, &reporter, &record_checksum, &last_seqno_observed,
@@ -1319,6 +1329,12 @@ Status DBImpl::ProcessLogFile(
 
     if (!process_status.ok()) {
       return process_status;
+    } else if (Status seqno_check_status = CheckSeqnoNotSetBackDuringRecovery(
+                   prev_next_sequence, *next_sequence);
+               !seqno_check_status.ok()) {
+      // Sequence number being set back indicates a serious software bug, the DB
+      // should not be opened in this case.
+      return seqno_check_status;
     } else if (*stop_replay_for_corruption) {
       break;
     }
@@ -1740,8 +1756,12 @@ Status DBImpl::MaybeHandleStopReplayForCorruptionForInconsistency(
         ROCKS_LOG_ERROR(immutable_db_options_.info_log,
                         "Column family inconsistency: SST file contains data"
                         " beyond the point of corruption.");
-        status = Status::Corruption("SST file is ahead of WALs in CF " +
-                                    cfd->GetName());
+        status = Status::Corruption(
+            "Column family inconsistency: SST file contains data"
+            " beyond the point of corruption in CF " +
+            cfd->GetName() +
+            ". WAL recovery stopped at corruption point, but SST files"
+            " contain newer data.");
         return status;
       }
     }
@@ -1857,6 +1877,20 @@ Status DBImpl::MaybeFlushFinalMemtableOrRestoreActiveLogFiles(
   return status;
 }
 
+Status DBImpl::CheckSeqnoNotSetBackDuringRecovery(
+    SequenceNumber prev_next_seqno, SequenceNumber current_next_seqno) {
+  if (prev_next_seqno == kMaxSequenceNumber ||
+      prev_next_seqno <= current_next_seqno) {
+    return Status::OK();
+  }
+  std::string msg =
+      "Sequence number is being set backwards during recovery, this is likely "
+      "a software bug or a data corruption. Prev next seqno: " +
+      std::to_string(prev_next_seqno) +
+      " , current next seqno: " + std::to_string(current_next_seqno);
+  return Status::Corruption(msg);
+}
+
 void DBImpl::FinishLogFilesRecovery(int job_id, const Status& status) {
   event_logger_.Log() << "job" << job_id << "event"
                       << (status.ok() ? "recovery_finished" : "recovery_failed")
@@ -1864,8 +1898,8 @@ void DBImpl::FinishLogFilesRecovery(int job_id, const Status& status) {
 }
 
 Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
-                                          LogFileNumberSize* log_ptr) {
-  LogFileNumberSize log(wal_number);
+                                          WalFileNumberSize* log_ptr) {
+  WalFileNumberSize log(wal_number);
   std::string fname =
       LogFileName(immutable_db_options_.GetWalDir(), wal_number);
   Status s;
@@ -1908,27 +1942,27 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
   assert(immutable_db_options_.avoid_flush_during_recovery);
   // Mark these as alive so they'll be considered for deletion later by
   // FindObsoleteFiles()
-  total_log_size_ = 0;
-  log_empty_ = false;
+  wals_total_size_.StoreRelaxed(0);
+  wal_empty_ = false;
   uint64_t min_wal_with_unflushed_data =
       versions_->MinLogNumberWithUnflushedData();
   for (auto wal_number : wal_numbers) {
     if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
       // In non-2pc mode, the WAL files not backing unflushed data are not
-      // alive, thus should not be added to the alive_log_files_.
+      // alive, thus should not be added to the alive_wal_files_.
       continue;
     }
     // We preallocate space for wals, but then after a crash and restart, those
     // preallocated space are not needed anymore. It is likely only the last
     // log has such preallocated space, so we only truncate for the last log.
-    LogFileNumberSize log;
+    WalFileNumberSize log;
     s = GetLogSizeAndMaybeTruncate(
         wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
     if (!s.ok()) {
       break;
     }
-    total_log_size_ += log.size;
-    alive_log_files_.push_back(log);
+    wals_total_size_.FetchAddRelaxed(log.size);
+    alive_wal_files_.push_back(log);
   }
   return s;
 }
@@ -1962,6 +1996,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
   const size_t ts_sz = ucmp->timestamp_size();
   const bool logical_strip_timestamp =
       ts_sz > 0 && !cfd->ioptions().persist_user_defined_timestamps;
+  // Note that here we treat flush as level 0 compaction in internal stats
+  InternalStats::CompactionStats flush_stats(CompactionReason::kFlush,
+                                             1 /* count */);
   {
     ScopedArenaPtr<InternalIterator> iter(
         logical_strip_timestamp
@@ -1989,8 +2026,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
     meta.oldest_ancester_time = current_time;
     meta.epoch_number = cfd->NewEpochNumber();
     {
-      auto write_hint =
-          cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0);
+      auto write_hint = cfd->current()->storage_info()->CalculateSSTWriteHint(
+          /*level=*/0,
+          immutable_db_options_.calculate_sst_write_lifetime_hint_set);
       mutex_.Unlock();
 
       SequenceNumber earliest_write_conflict_snapshot;
@@ -2033,19 +2071,20 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
           kMaxSequenceNumber);
       Version* version = cfd->current();
       version->Ref();
-      uint64_t num_input_entries = 0;
-      s = BuildTable(dbname_, versions_.get(), immutable_db_options_, tboptions,
-                     file_options_for_compaction_, cfd->table_cache(),
-                     iter.get(), std::move(range_del_iters), &meta,
-                     &blob_file_additions, snapshot_seqs, earliest_snapshot,
-                     earliest_write_conflict_snapshot, kMaxSequenceNumber,
-                     snapshot_checker, paranoid_file_checks,
-                     cfd->internal_stats(), &io_s, io_tracer_,
-                     BlobFileCreationReason::kRecovery,
-                     nullptr /* seqno_to_time_mapping */, &event_logger_,
-                     job_id, nullptr /* table_properties */, write_hint,
-                     nullptr /*full_history_ts_low*/, &blob_callback_, version,
-                     &num_input_entries);
+      TableProperties temp_table_proerties;
+      s = BuildTable(
+          dbname_, versions_.get(), immutable_db_options_, tboptions,
+          file_options_for_compaction_, cfd->table_cache(), iter.get(),
+          std::move(range_del_iters), &meta, &blob_file_additions,
+          snapshot_seqs, earliest_snapshot, earliest_write_conflict_snapshot,
+          kMaxSequenceNumber, snapshot_checker, paranoid_file_checks,
+          cfd->internal_stats(), &io_s, io_tracer_,
+          BlobFileCreationReason::kRecovery,
+          nullptr /* seqno_to_time_mapping */, &event_logger_, job_id,
+          &temp_table_proerties /* table_properties */, write_hint,
+          nullptr /*full_history_ts_low*/, &blob_callback_, version,
+          nullptr /* memtable_payload_bytes */,
+          nullptr /* memtable_garbage_bytes */, &flush_stats);
       version->Unref();
       LogFlush(immutable_db_options_.info_log);
       ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
@@ -2061,10 +2100,31 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
       }
 
       uint64_t total_num_entries = mem->NumEntries();
-      if (s.ok() && total_num_entries != num_input_entries) {
+      if (s.ok() && total_num_entries != flush_stats.num_input_records) {
         std::string msg = "Expected " + std::to_string(total_num_entries) +
                           " entries in memtable, but read " +
-                          std::to_string(num_input_entries);
+                          std::to_string(flush_stats.num_input_records);
+        ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                       "[%s] [JOB %d] Level-0 flush during recover: %s",
+                       cfd->GetName().c_str(), job_id, msg.c_str());
+        if (immutable_db_options_.flush_verify_memtable_count) {
+          s = Status::Corruption(msg);
+        }
+      }
+      // Only verify on table with format collects table properties
+      const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
+      if (s.ok() &&
+          (mutable_cf_options.table_factory->IsInstanceOf(
+               TableFactory::kBlockBasedTableName()) ||
+           mutable_cf_options.table_factory->IsInstanceOf(
+               TableFactory::kPlainTableName())) &&
+          flush_stats.num_output_records != temp_table_proerties.num_entries) {
+        std::string msg =
+            "Number of keys in flush output SST files does not match "
+            "number of keys added to the table. Expected " +
+            std::to_string(flush_stats.num_output_records) + " but there are " +
+            std::to_string(temp_table_proerties.num_entries) +
+            " in output SST files";
         ROCKS_LOG_WARN(immutable_db_options_.info_log,
                        "[%s] [JOB %d] Level-0 flush during recover: %s",
                        cfd->GetName().c_str(), job_id, msg.c_str());
@@ -2112,25 +2172,25 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
     }
   }
 
-  InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
-  stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
+  flush_stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
 
   if (has_output) {
-    stats.bytes_written = meta.fd.GetFileSize();
-    stats.num_output_files = 1;
+    flush_stats.bytes_written = meta.fd.GetFileSize();
+    flush_stats.num_output_files = 1;
   }
 
   const auto& blobs = edit->GetBlobFileAdditions();
   for (const auto& blob : blobs) {
-    stats.bytes_written_blob += blob.GetTotalBlobBytes();
+    flush_stats.bytes_written_blob += blob.GetTotalBlobBytes();
   }
 
-  stats.num_output_files_blob = static_cast<int>(blobs.size());
+  flush_stats.num_output_files_blob = static_cast<int>(blobs.size());
 
-  cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats);
+  cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER,
+                                            flush_stats);
   cfd->internal_stats()->AddCFStats(
       InternalStats::BYTES_FLUSHED,
-      stats.bytes_written + stats.bytes_written_blob);
+      flush_stats.bytes_written + flush_stats.bytes_written_blob);
   RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
   return s;
 }
@@ -2204,7 +2264,7 @@ Status DB::OpenAndTrimHistory(
     return s;
   }
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   s = DB::Open(db_options, dbname, column_families, handles, &db);
   if (!s.ok()) {
     return s;
@@ -2213,7 +2273,7 @@ Status DB::OpenAndTrimHistory(
   CompactRangeOptions options;
   options.bottommost_level_compaction =
       BottommostLevelCompaction::kForceOptimized;
-  auto db_impl = static_cast_with_check<DBImpl>(db);
+  auto db_impl = static_cast_with_check<DBImpl>(db.get());
   for (auto handle : *handles) {
     assert(handle != nullptr);
     auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
@@ -2235,14 +2295,14 @@ Status DB::OpenAndTrimHistory(
       assert(temp_s.ok());
     }
     handles->clear();
-    delete db;
+    db.reset();
   };
   if (!s.ok()) {
     clean_op();
     return s;
   }
 
-  dbptr->reset(db);
+  *dbptr = std::move(db);
   return s;
 }
 
@@ -2258,6 +2318,7 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
       BuildDBOptions(immutable_db_options_, mutable_db_options_);
   FileOptions opt_file_options =
       fs_->OptimizeForLogWrite(file_options_, db_options);
+  opt_file_options.write_hint = CalculateWALWriteHint();
   // DB option takes precedence when not kUnknown
   if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) {
     opt_file_options.temperature = immutable_db_options_.wal_write_temperature;
@@ -2279,7 +2340,9 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
   }
 
   if (io_s.ok()) {
-    lfile->SetWriteLifeTimeHint(CalculateWALWriteHint());
+    // Subsequent attempts to override the hint via SetWriteLifeTimeHint
+    // with the very same value will be ignored by the fs.
+    lfile->SetWriteLifeTimeHint(opt_file_options.write_hint);
     lfile->SetPreallocationBlockSize(preallocate_block_size);
 
     const auto& listeners = immutable_db_options_.listeners;
@@ -2334,9 +2397,11 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
   handles->clear();
 
   size_t max_write_buffer_size = 0;
+  MinAndMaxPreserveSeconds preserve_info;
   for (const auto& cf : column_families) {
     max_write_buffer_size =
         std::max(max_write_buffer_size, cf.options.write_buffer_size);
+    preserve_info.Combine(cf.options);
   }
 
   auto impl = std::make_unique<DBImpl>(db_options, dbname, seq_per_batch,
@@ -2405,18 +2470,18 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
     if (s.ok()) {
       // Prevent log files created by previous instance from being recycled.
       // They might be in alive_log_file_, and might get recycled otherwise.
-      impl->min_log_number_to_recycle_ = new_log_number;
+      impl->min_wal_number_to_recycle_ = new_log_number;
     }
     if (s.ok()) {
-      InstrumentedMutexLock wl(&impl->log_write_mutex_);
-      impl->logfile_number_ = new_log_number;
+      InstrumentedMutexLock wl(&impl->wal_write_mutex_);
+      impl->cur_wal_number_ = new_log_number;
       assert(new_log != nullptr);
       assert(impl->logs_.empty());
       impl->logs_.emplace_back(new_log_number, new_log);
     }
 
     if (s.ok()) {
-      impl->alive_log_files_.emplace_back(impl->logfile_number_);
+      impl->alive_wal_files_.emplace_back(impl->cur_wal_number_);
       // In WritePrepared there could be gap in sequence numbers. This breaks
       // the trick we use in kPointInTimeRecovery which assumes the first seq in
       // the log right after the corrupted log is one larger than the last seq
@@ -2429,14 +2494,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
       if (recovered_seq != kMaxSequenceNumber) {
         WriteBatch empty_batch;
         WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
-        uint64_t log_used, log_size;
+        uint64_t wal_used, log_size;
         log::Writer* log_writer = impl->logs_.back().writer;
-        LogFileNumberSize& log_file_number_size = impl->alive_log_files_.back();
+        WalFileNumberSize& wal_file_number_size = impl->alive_wal_files_.back();
 
-        assert(log_writer->get_log_number() == log_file_number_size.number);
+        assert(log_writer->get_log_number() == wal_file_number_size.number);
         impl->mutex_.AssertHeld();
-        s = impl->WriteToWAL(empty_batch, write_options, log_writer, &log_used,
-                             &log_size, log_file_number_size, recovered_seq);
+        s = impl->WriteToWAL(empty_batch, write_options, log_writer, &wal_used,
+                             &log_size, wal_file_number_size, recovered_seq);
         if (s.ok()) {
           // Need to fsync, otherwise it might get lost after a power reset.
           s = impl->FlushWAL(write_options, false);
@@ -2469,6 +2534,12 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
     s = impl->InitPersistStatsColumnFamily();
   }
 
+  // After reaching the post-recovery seqno but before creating SuperVersions
+  // ensure seqno to time mapping is pre-populated as needed.
+  if (s.ok() && recovery_ctx.is_new_db_ && preserve_info.IsEnabled()) {
+    impl->PrepopulateSeqnoToTimeMapping(preserve_info);
+  }
+
   if (s.ok()) {
     // set column family handles
     for (const auto& cf : column_families) {
@@ -2478,6 +2549,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
         handles->push_back(
             new ColumnFamilyHandleImpl(cfd, impl.get(), &impl->mutex_));
         impl->NewThreadStatusCfInfo(cfd);
+        SuperVersionContext sv_context(/* create_superversion */ true);
+        impl->InstallSuperVersionForConfigChange(cfd, &sv_context);
+        sv_context.Clean();
       } else {
         if (db_options.create_missing_column_families) {
           // missing column family, create it
@@ -2485,6 +2559,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
           impl->mutex_.Unlock();
           // NOTE: the work normally done in WrapUpCreateColumnFamilies will
           // be done separately below.
+          // This includes InstallSuperVersionForConfigChange.
           s = impl->CreateColumnFamilyImpl(read_options, write_options,
                                            cf.options, cf.name, &handle);
           impl->mutex_.Lock();
@@ -2501,15 +2576,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
     }
   }
 
-  if (s.ok()) {
+  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+    // Install SuperVersion for hidden column family
+    assert(impl->persist_stats_cf_handle_);
+    assert(impl->persist_stats_cf_handle_->cfd());
     SuperVersionContext sv_context(/* create_superversion */ true);
-    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
-      impl->InstallSuperVersionAndScheduleWork(cfd, &sv_context);
-    }
+    impl->InstallSuperVersionForConfigChange(
+        impl->persist_stats_cf_handle_->cfd(), &sv_context);
     sv_context.Clean();
-  }
-
-  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
     // try to read format version
     s = impl->PersistentStatsProcessFormatVersion();
   }
@@ -2618,8 +2692,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
     s = impl->StartPeriodicTaskScheduler();
   }
   if (s.ok()) {
-    s = impl->RegisterRecordSeqnoTimeWorker(read_options, write_options,
-                                            recovery_ctx.is_new_db_);
+    s = impl->RegisterRecordSeqnoTimeWorker();
   }
   impl->options_mutex_.Unlock();
   if (s.ok()) {
diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc
index dac0d9660037..31934ee192c7 100644
--- a/db/db_impl/db_impl_readonly.cc
+++ b/db/db_impl/db_impl_readonly.cc
@@ -185,16 +185,10 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& _read_options,
           ? static_cast<const SnapshotImpl*>(read_options.snapshot)->number_
           : latest_snapshot;
   ReadCallback* read_callback = nullptr;  // No read callback provided.
-  auto db_iter = NewArenaWrappedDbIterator(
-      env_, read_options, cfd->ioptions(), super_version->mutable_cf_options,
-      super_version->current, read_seq,
-      super_version->mutable_cf_options.max_sequential_skip_in_iterations,
-      super_version->version_number, read_callback);
-  auto internal_iter = NewInternalIterator(
-      db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(),
-      read_seq, /* allow_unprepared_value */ true, db_iter);
-  db_iter->SetIterUnderDBIter(internal_iter);
-  return db_iter;
+  return NewArenaWrappedDbIterator(
+      env_, read_options, cfh, super_version, read_seq, read_callback, this,
+      /*expose_blob_index=*/false, /*allow_refresh=*/false,
+      /*allow_mark_memtable_for_flush=*/false);
 }
 
 Status DBImplReadOnly::NewIterators(
@@ -231,36 +225,32 @@ Status DBImplReadOnly::NewIterators(
           ? static_cast<const SnapshotImpl*>(read_options.snapshot)->number_
           : latest_snapshot;
 
-  autovector<std::tuple<ColumnFamilyData*, SuperVersion*>> cfd_to_sv;
+  autovector<std::tuple<ColumnFamilyHandleImpl*, SuperVersion*>> cfh_to_sv;
 
   const bool check_read_ts =
       read_options.timestamp && read_options.timestamp->size() > 0;
   for (auto cfh : column_families) {
     auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
     auto* sv = cfd->GetSuperVersion()->Ref();
-    cfd_to_sv.emplace_back(cfd, sv);
+    cfh_to_sv.emplace_back(static_cast_with_check<ColumnFamilyHandleImpl>(cfh),
+                           sv);
     if (check_read_ts) {
       const Status s =
           FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp));
       if (!s.ok()) {
-        for (auto prev_entry : cfd_to_sv) {
+        for (auto prev_entry : cfh_to_sv) {
           std::get<1>(prev_entry)->Unref();
         }
         return s;
       }
     }
   }
-  assert(cfd_to_sv.size() == column_families.size());
-  for (auto [cfd, sv] : cfd_to_sv) {
+  assert(cfh_to_sv.size() == column_families.size());
+  for (auto [cfh, sv] : cfh_to_sv) {
     auto* db_iter = NewArenaWrappedDbIterator(
-        env_, read_options, cfd->ioptions(), sv->mutable_cf_options,
-        sv->current, read_seq,
-        sv->mutable_cf_options.max_sequential_skip_in_iterations,
-        sv->version_number, read_callback);
-    auto* internal_iter = NewInternalIterator(
-        db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), read_seq,
-        /* allow_unprepared_value */ true, db_iter);
-    db_iter->SetIterUnderDBIter(internal_iter);
+        env_, read_options, cfh, sv, read_seq, read_callback, this,
+        /*expose_blob_index=*/false, /*allow_refresh=*/false,
+        /*allow_mark_memtable_for_flush=*/false);
     iterators->push_back(db_iter);
   }
 
diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h
index 9566f547bfeb..2f456561cc30 100644
--- a/db/db_impl/db_impl_readonly.h
+++ b/db/db_impl/db_impl_readonly.h
@@ -121,6 +121,11 @@ class DBImplReadOnly : public DBImpl {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
 
+  using DBImpl::FlushWAL;
+  Status FlushWAL(const FlushWALOptions& /*options*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
   using DB::IngestExternalFile;
   Status IngestExternalFile(
       ColumnFamilyHandle* /*column_family*/,
@@ -155,6 +160,29 @@ class DBImplReadOnly : public DBImpl {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
 
+  using DB::CreateColumnFamily;
+  using DBImpl::CreateColumnFamily;
+  Status CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/,
+                            const std::string& /*column_family*/,
+                            ColumnFamilyHandle** /*handle*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  using DB::CreateColumnFamilies;
+  using DBImpl::CreateColumnFamilies;
+  Status CreateColumnFamilies(
+      const ColumnFamilyOptions& /*cf_options*/,
+      const std::vector<std::string>& /*column_family_names*/,
+      std::vector<ColumnFamilyHandle*>* /*handles*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+      std::vector<ColumnFamilyHandle*>* /*handles*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
   // FIXME: some missing overrides for more "write" functions
 
  protected:
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index a9082db3b42f..0db4820c3925 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -8,7 +8,12 @@
 #include <cinttypes>
 
 #include "db/arena_wrapped_db_iter.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
 #include "db/merge_context.h"
+#include "db/version_edit.h"
+#include "file/filename.h"
+#include "file/writable_file_writer.h"
 #include "logging/auto_roll_logger.h"
 #include "logging/logging.h"
 #include "monitoring/perf_context_imp.h"
@@ -49,9 +54,6 @@ Status DBImplSecondary::Recover(
     }
     return s;
   }
-  if (immutable_db_options_.paranoid_checks && s.ok()) {
-    s = CheckConsistency();
-  }
   // Initial max_total_in_memory_state_ before recovery logs.
   max_total_in_memory_state_ = 0;
   for (auto cfd : *versions_->GetColumnFamilySet()) {
@@ -507,10 +509,6 @@ Iterator* DBImplSecondary::NewIterator(const ReadOptions& _read_options,
   if (read_options.io_activity == Env::IOActivity::kUnknown) {
     read_options.io_activity = Env::IOActivity::kDBIterator;
   }
-  if (read_options.managed) {
-    return NewErrorIterator(
-        Status::NotSupported("Managed iterator is not supported anymore."));
-  }
   if (read_options.read_tier == kPersistedTier) {
     return NewErrorIterator(Status::NotSupported(
         "ReadTier::kPersistedData is not yet supported in iterators."));
@@ -566,17 +564,10 @@ ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl(
   assert(snapshot == kMaxSequenceNumber);
   snapshot = versions_->LastSequence();
   assert(snapshot != kMaxSequenceNumber);
-  auto db_iter = NewArenaWrappedDbIterator(
-      env_, read_options, cfh->cfd()->ioptions(),
-      super_version->mutable_cf_options, super_version->current, snapshot,
-      super_version->mutable_cf_options.max_sequential_skip_in_iterations,
-      super_version->version_number, read_callback, cfh, expose_blob_index,
-      allow_refresh);
-  auto internal_iter = NewInternalIterator(
-      db_iter->GetReadOptions(), cfh->cfd(), super_version, db_iter->GetArena(),
-      snapshot, /* allow_unprepared_value */ true, db_iter);
-  db_iter->SetIterUnderDBIter(internal_iter);
-  return db_iter;
+  return NewArenaWrappedDbIterator(env_, read_options, cfh, super_version,
+                                   snapshot, read_callback, this,
+                                   expose_blob_index, allow_refresh,
+                                   /*allow_mark_memtable_for_flush=*/false);
 }
 
 Status DBImplSecondary::NewIterators(
@@ -593,9 +584,6 @@ Status DBImplSecondary::NewIterators(
   if (read_options.io_activity == Env::IOActivity::kUnknown) {
     read_options.io_activity = Env::IOActivity::kDBIterator;
   }
-  if (read_options.managed) {
-    return Status::NotSupported("Managed iterator is not supported anymore.");
-  }
   if (read_options.read_tier == kPersistedTier) {
     return Status::NotSupported(
         "ReadTier::kPersistedData is not yet supported in iterators.");
@@ -660,58 +648,15 @@ Status DBImplSecondary::NewIterators(
   return Status::OK();
 }
 
-Status DBImplSecondary::CheckConsistency() {
-  mutex_.AssertHeld();
-  Status s = DBImpl::CheckConsistency();
-  // If DBImpl::CheckConsistency() which is stricter returns success, then we
-  // do not need to give a second chance.
-  if (s.ok()) {
-    return s;
-  }
-  // It's possible that DBImpl::CheckConssitency() can fail because the primary
-  // may have removed certain files, causing the GetFileSize(name) call to
-  // fail and returning a PathNotFound. In this case, we take a best-effort
-  // approach and just proceed.
-  TEST_SYNC_POINT_CALLBACK(
-      "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
-
-  if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
-    return Status::OK();
-  }
-
-  std::vector<LiveFileMetaData> metadata;
-  versions_->GetLiveFilesMetaData(&metadata);
-
-  std::string corruption_messages;
-  for (const auto& md : metadata) {
-    // md.name has a leading "/".
-    std::string file_path = md.db_path + md.name;
-
-    uint64_t fsize = 0;
-    s = env_->GetFileSize(file_path, &fsize);
-    if (!s.ok() &&
-        (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() ||
-         s.IsPathNotFound())) {
-      s = Status::OK();
-    }
-    if (!s.ok()) {
-      corruption_messages +=
-          "Can't access " + md.name + ": " + s.ToString() + "\n";
-    }
-  }
-  return corruption_messages.empty() ? Status::OK()
-                                     : Status::Corruption(corruption_messages);
-}
-
 Status DBImplSecondary::TryCatchUpWithPrimary() {
   assert(versions_.get() != nullptr);
-  assert(manifest_reader_.get() != nullptr);
   Status s;
   // read the manifest and apply new changes to the secondary instance
   std::unordered_set<ColumnFamilyData*> cfds_changed;
   JobContext job_context(0, true /*create_superversion*/);
   {
     InstrumentedMutexLock lock_guard(&mutex_);
+    assert(manifest_reader_.get() != nullptr);
     s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
             ->ReadAndApply(&mutex_, &manifest_reader_,
                            manifest_reader_status_.get(), &cfds_changed,
@@ -735,13 +680,13 @@ Status DBImplSecondary::TryCatchUpWithPrimary() {
     // instance
     if (s.ok()) {
       s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
-    }
-    if (s.IsPathNotFound()) {
-      ROCKS_LOG_INFO(
-          immutable_db_options_.info_log,
-          "Secondary tries to read WAL, but WAL file(s) have already "
-          "been purged by primary.");
-      s = Status::OK();
+      if (s.IsPathNotFound()) {
+        ROCKS_LOG_INFO(
+            immutable_db_options_.info_log,
+            "Secondary tries to read WAL, but WAL file(s) have already "
+            "been purged by primary.");
+        s = Status::OK();
+      }
     }
     if (s.ok()) {
       for (auto cfd : cfds_changed) {
@@ -831,9 +776,9 @@ Status DB::OpenAsSecondary(
   handles->clear();
   DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path);
   impl->versions_.reset(new ReactiveVersionSet(
-      dbname, &impl->immutable_db_options_, impl->file_options_,
-      impl->table_cache_.get(), impl->write_buffer_manager_,
-      &impl->write_controller_, impl->io_tracer_));
+      dbname, &impl->immutable_db_options_, impl->mutable_db_options_,
+      impl->file_options_, impl->table_cache_.get(),
+      impl->write_buffer_manager_, &impl->write_controller_, impl->io_tracer_));
   impl->column_family_memtables_.reset(
       new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
   impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
@@ -876,18 +821,517 @@ Status DB::OpenAsSecondary(
   return s;
 }
 
+Status DBImplSecondary::ScanCompactionProgressFiles(
+    CompactionProgressFilesScan* scan_result) {
+  assert(scan_result != nullptr);
+  scan_result->Clear();
+
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  IOOptions opts;
+  Status s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<std::string> all_filenames;
+  s = fs_->GetChildren(secondary_path_, opts, &all_filenames, nullptr /* dbg*/);
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (const auto& filename : all_filenames) {
+    if (filename == "." || filename == "..") {
+      continue;
+    }
+
+    uint64_t number;
+    FileType type;
+
+    if (!ParseFileName(filename, &number, &type)) {
+      continue;
+    }
+
+    // Categorize compaction progress files
+    if (type == kCompactionProgressFile) {
+      if (number > scan_result->latest_progress_timestamp) {
+        // Found a newer progress file
+        if (scan_result->HasLatestProgressFile()) {
+          // Previous "latest" becomes "old"
+          scan_result->old_progress_filenames.push_back(
+              scan_result->latest_progress_filename.value());
+        }
+        scan_result->latest_progress_timestamp = number;
+        scan_result->latest_progress_filename = filename;
+      } else {
+        // This is an older progress file
+        scan_result->old_progress_filenames.push_back(filename);
+      }
+    } else if (type == kTempFile &&
+               filename.find(kCompactionProgressFileNamePrefix) == 0) {
+      // Temporary progress files
+      scan_result->temp_progress_filenames.push_back(filename);
+    } else if (type == kTableFile) {
+      // Collect table file numbers for CleanupPhysicalCompactionOutputFiles
+      scan_result->table_file_numbers.push_back(number);
+    }
+  }
+
+  return Status::OK();
+}
+
+Status DBImplSecondary::DeleteCompactionProgressFiles(
+    const std::vector<std::string>& filenames) {
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  IOOptions opts;
+  Status s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (const auto& filename : filenames) {
+    std::string file_path = secondary_path_ + "/" + filename;
+    Status delete_status = fs_->DeleteFile(file_path, opts, nullptr /* dbg */);
+    if (!delete_status.ok()) {
+      return delete_status;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status DBImplSecondary::CleanupOldAndTemporaryCompactionProgressFiles(
+    bool preserve_latest, const CompactionProgressFilesScan& scan_result) {
+  std::vector<std::string> filenames_to_delete;
+
+  // Always delete old progress files
+  filenames_to_delete.insert(filenames_to_delete.end(),
+                             scan_result.old_progress_filenames.begin(),
+                             scan_result.old_progress_filenames.end());
+
+  // Always delete temp files
+  filenames_to_delete.insert(filenames_to_delete.end(),
+                             scan_result.temp_progress_filenames.begin(),
+                             scan_result.temp_progress_filenames.end());
+
+  // Conditionally delete latest file
+  if (!preserve_latest && scan_result.HasLatestProgressFile()) {
+    filenames_to_delete.push_back(scan_result.latest_progress_filename.value());
+  }
+
+  return DeleteCompactionProgressFiles(filenames_to_delete);
+}
+
+// Loads compaction progress from a file and cleans up extra output
+// files. After loading the progress, this function identifies and deletes any
+// SST files in the output folder that are NOT tracked in the
+// progress. This ensures consistency between the progress file and
+// actual output files on disk.
+Status DBImplSecondary::LoadCompactionProgressAndCleanupExtraOutputFiles(
+    const std::string& compaction_progress_file_path,
+    const CompactionProgressFilesScan& scan_result) {
+  Status s = ParseCompactionProgressFile(compaction_progress_file_path,
+                                         &compaction_progress_);
+  if (s.ok()) {
+    s = CleanupPhysicalCompactionOutputFiles(true /* preserve_tracked_files */,
+                                             scan_result);
+  }
+  return s;
+}
+
+Status DBImplSecondary::ParseCompactionProgressFile(
+    const std::string& compaction_progress_file_path,
+    CompactionProgress* compaction_progress) {
+  std::unique_ptr<FSSequentialFile> file;
+  Status s = fs_->NewSequentialFile(compaction_progress_file_path,
+                                    FileOptions(), &file, nullptr /* dbg */);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<SequentialFileReader> file_reader(new SequentialFileReader(
+      std::move(file), compaction_progress_file_path,
+      immutable_db_options_.log_readahead_size, io_tracer_, {} /* listeners */,
+      immutable_db_options_.rate_limiter.get()));
+
+  Status reader_status;
+
+  struct CompactionProgressReaderReporter : public log::Reader::Reporter {
+    Status* status;
+    explicit CompactionProgressReaderReporter(Status* s) : status(s) {}
+
+    void Corruption(size_t /*bytes*/, const Status& s,
+                    uint64_t /*log_number*/) override {
+      if (status->ok()) {
+        *status = s;
+      }
+    }
+
+    void OldLogRecord(size_t /*bytes*/) override {
+      // Ignore old records
+    }
+  } progress_reporter(&reader_status);
+
+  log::Reader compaction_progress_reader(
+      immutable_db_options_.info_log, std::move(file_reader),
+      &progress_reporter, true /* checksum */, 0 /* log_num */);
+
+  // LIMITATION: Only supports resuming single subcompaction
+  SubcompactionProgressBuilder progress_builder;
+  Slice slice;
+  std::string record;
+
+  while (compaction_progress_reader.ReadRecord(&slice, &record) &&
+         reader_status.ok()) {
+    VersionEdit edit;
+    s = edit.DecodeFrom(slice);
+    if (!s.ok()) {
+      break;
+    }
+
+    bool res = progress_builder.ProcessVersionEdit(edit);
+    if (!res) {
+      break;
+    }
+  }
+
+  if (!reader_status.ok()) {
+    return reader_status;
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (progress_builder.HasAccumulatedSubcompactionProgress()) {
+    compaction_progress->clear();
+    compaction_progress->push_back(
+        progress_builder.GetAccumulatedSubcompactionProgress());
+  } else {
+    s = Status::NotFound("No compaction progress was persisted yet");
+  }
+
+  return s;
+}
+
+Status DBImplSecondary::RenameCompactionProgressFile(
+    const std::string& temp_file_path, std::string* final_file_path) {
+  uint64_t current_time = env_->NowMicros();
+  *final_file_path = CompactionProgressFileName(secondary_path_, current_time);
+
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  IOOptions opts;
+  Status s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = fs_->RenameFile(temp_file_path, *final_file_path, opts,
+                      nullptr /* dbg */);
+
+  return s;
+}
+
+Status DBImplSecondary::CleanupPhysicalCompactionOutputFiles(
+    bool preserve_tracked_files,
+    const CompactionProgressFilesScan& scan_result) {
+  std::unordered_set<uint64_t> files_to_preserve;
+
+  if (preserve_tracked_files) {
+    for (const auto& subcompaction_progress : compaction_progress_) {
+      for (const auto& file_metadata :
+           subcompaction_progress.output_level_progress.GetOutputFiles()) {
+        files_to_preserve.insert(file_metadata.fd.GetNumber());
+      }
+      for (const auto& file_metadata :
+           subcompaction_progress.proximal_output_level_progress
+               .GetOutputFiles()) {
+        files_to_preserve.insert(file_metadata.fd.GetNumber());
+      }
+    }
+  }
+
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  IOOptions opts;
+  Status s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (uint64_t file_number : scan_result.table_file_numbers) {
+    bool should_delete =
+        !preserve_tracked_files ||
+        (files_to_preserve.find(file_number) == files_to_preserve.end());
+
+    if (should_delete) {
+      std::string file_path = MakeTableFileName(secondary_path_, file_number);
+      Status delete_status =
+          fs_->DeleteFile(file_path, opts, nullptr /* dbg */);
+      if (!delete_status.ok()) {
+        return delete_status;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status DBImplSecondary::InitializeCompactionWorkspace(
+    bool allow_resumption, std::unique_ptr<FSDirectory>* output_dir,
+    std::unique_ptr<log::Writer>* compaction_progress_writer) {
+  // Create output directory if it doest exist yet
+  Status s = CreateAndNewDirectory(fs_.get(), secondary_path_, output_dir);
+  if (!s.ok() || !allow_resumption) {
+    return s;
+  }
+
+  s = PrepareCompactionProgressState();
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = FinalizeCompactionProgressWriter(compaction_progress_writer);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  return Status::OK();
+}
+
+// PrepareCompactionProgressState() manages compaction progress files and output
+// files to ensure a clean, consistent state for resuming or starting fresh
+// compaction.
+//
+// PRECONDITION:
+// - This function is ONLY called when allow_resumption = true
+// - The caller wants resumption support for this compaction attempt
+//
+// FILE SYSTEM STATE (before entering this function):
+// - 0 or more compaction progress files may exist in `secondary_path_`:
+//   * Latest progress file (from the most recent compaction attempt)
+//   * Older progress files (left by crashing during a previous
+//     InitializeCompactionWorkspace() call)
+//   * Temporary progress files (left by crashing during a previous
+//     InitializeCompactionWorkspace() call)
+// - 0 or more compaction output files may exist in `secondary_path_`
+//
+// POSTCONDITIONS (after this function):
+// - IF the latest progress file exists AND it parses successfully AND
+//   actually contains valid compaction progress:
+//   * Exactly one latest progress file remains
+//   * All older and temporary compaction progress files are deleted
+//   * All corresponding compaction output files are preserved
+//   * All extra compaction output files are deleted (files left by
+//   compaction
+//     crashing before persisting the progress)
+//   * Result: Ready to resume compaction from the saved progress
+// - OTHERWISE (no latest progress file OR it fails to parse OR it's
+// invalid):
+//   * ALL compaction progress files are deleted (latest + older +
+//   temporary)
+//   * ALL compaction output files are deleted
+//   * Result: Ready to start fresh compaction (despite allow_resumption =
+//   true, we cannot resume because there's no valid progress to resume from)
+//
+// ERROR HANDLING:
+// - ON ERROR (if any of the postconditions cannot be achieved):
+//   * Function returns error status
+//   * File system may be left in a partially modified state
+//   * Caller should manually clean up secondary_path_ before retrying
+//   * Subsequent OpenAndCompact() calls to this clean secondary_path_ will
+//     effectively start fresh compaction
+Status DBImplSecondary::PrepareCompactionProgressState() {
+  Status s;
+
+  // STEP 1: Scan directory ONCE (includes progress files + table files)
+  CompactionProgressFilesScan scan_result;
+  s = ScanCompactionProgressFiles(&scan_result);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Encountered error when scanning for compaction "
+                    "progress files: %s",
+                    s.ToString().c_str());
+    return s;
+  }
+
+  std::optional<std::string> latest_progress_file =
+      scan_result.latest_progress_filename;
+
+  // STEP 2: Determine if we should resume
+  bool should_resume = false;
+  if (latest_progress_file.has_value()) {
+    should_resume = true;
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Did not find any latest compaction progress file. "
+                   "Will perform clean up to start fresh compaction");
+  }
+
+  // STEP 3: Cleanup using pre-scanned results
+  if (should_resume) {
+    // Keep latest, delete old/temp
+    s = CleanupOldAndTemporaryCompactionProgressFiles(
+        true /* preserve_latest */, scan_result);
+  } else {
+    // Delete everything including latest
+    s = CleanupOldAndTemporaryCompactionProgressFiles(
+        false /* preserve_latest */, scan_result);
+    latest_progress_file.reset();
+  }
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Failed to clean up compaction progress file(s): %s. "
+                    "Will fail the compaction",
+                    s.ToString().c_str());
+    return s;
+  }
+
+  // STEP 4: Load progress if resuming
+  if (latest_progress_file.has_value()) {
+    uint64_t timestamp = scan_result.latest_progress_timestamp;
+
+    std::string compaction_progress_file_path =
+        CompactionProgressFileName(secondary_path_, timestamp);
+
+    s = LoadCompactionProgressAndCleanupExtraOutputFiles(
+        compaction_progress_file_path, scan_result);
+
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Failed to load the latest compaction "
+                     "progress from %s: %s. Will perform clean up "
+                     "to start fresh compaction",
+                     latest_progress_file.value().c_str(),
+                     s.ToString().c_str());
+      return HandleInvalidOrNoCompactionProgress(compaction_progress_file_path,
+                                                 scan_result);
+    }
+
+    ROCKS_LOG_DEBUG(
+        immutable_db_options_.info_log,
+        "Loaded compaction progress with %zu subcompaction(s) from %s",
+        compaction_progress_.size(), compaction_progress_file_path.c_str());
+    return s;
+  } else {
+    return HandleInvalidOrNoCompactionProgress(
+        std::nullopt /* compaction_progress_file_path */, scan_result);
+  }
+}
+
+uint64_t DBImplSecondary::CalculateResumedCompactionBytes(
+    const CompactionProgress& compaction_progress) const {
+  uint64_t total_resumed_bytes = 0;
+
+  for (const auto& subcompaction_progress : compaction_progress) {
+    for (const auto& file_meta :
+         subcompaction_progress.output_level_progress.GetOutputFiles()) {
+      total_resumed_bytes += file_meta.fd.file_size;
+    }
+
+    for (const auto& file_meta :
+         subcompaction_progress.proximal_output_level_progress
+             .GetOutputFiles()) {
+      total_resumed_bytes += file_meta.fd.file_size;
+    }
+  }
+
+  return total_resumed_bytes;
+}
+
+Status DBImplSecondary::HandleInvalidOrNoCompactionProgress(
+    const std::optional<std::string>& compaction_progress_file_path,
+    const CompactionProgressFilesScan& scan_result) {
+  compaction_progress_.clear();
+
+  Status s;
+  if (compaction_progress_file_path.has_value()) {
+    WriteOptions write_options(Env::IOActivity::kCompaction);
+    IOOptions opts;
+    s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+    if (s.ok()) {
+      s = fs_->DeleteFile(compaction_progress_file_path.value(), opts,
+                          nullptr /* dbg */);
+    }
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Failed to remove invalid progress file: %s",
+                      s.ToString().c_str());
+      return s;
+    }
+  }
+
+  s = CleanupPhysicalCompactionOutputFiles(false /* preserve_tracked_files */,
+                                           scan_result);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Failed to cleanup existing compaction output files: %s",
+                    s.ToString().c_str());
+    return s;
+  }
+
+  return Status::OK();
+}
+
 Status DBImplSecondary::CompactWithoutInstallation(
     const OpenAndCompactOptions& options, ColumnFamilyHandle* cfh,
     const CompactionServiceInput& input, CompactionServiceResult* result) {
   if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
     return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
   }
+
+  std::unique_ptr<FSDirectory> output_dir;
+  std::unique_ptr<log::Writer> compaction_progress_writer;
+
   InstrumentedMutexLock l(&mutex_);
+
   auto cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
   if (!cfd) {
     return Status::InvalidArgument("Cannot find column family" +
                                    cfh->GetName());
   }
+  Status s;
+
+  const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
+
+  // TODO(hx235): Resuming compaction is currently incompatible with
+  // output hash verification (enabled via paranoid_file_checks=true or
+  // verify_output_flags containing kVerifyIteration) because resumed compaction
+  // will lose the hash computed before interruption.
+  // Potential solutions:
+  // 1. Persist the hash state: Before interruption, save the current hash value
+  //    of each output file to disk, allowing validation to continue correctly
+  //    after resumption.
+  // 2. Immediate verification: Move output verification to happen
+  //    immediately after each output file is created and closed, eliminating
+  //    the need to maintain hash state across resumption boundaries.
+  bool output_hash_verification_enabled =
+      mutable_cf_options.paranoid_file_checks ||
+      !!(mutable_cf_options.verify_output_flags &
+         VerifyOutputFlags::kVerifyIteration);
+
+  bool allow_resumption =
+      options.allow_resumption && !output_hash_verification_enabled;
+
+  if (options.allow_resumption && output_hash_verification_enabled) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Resume compaction configured but disabled due to "
+                   "incompatibility with output hash verification "
+                   "(paranoid_file_checks=true or verify_output_flags "
+                   "containing kVerifyIteration)");
+  }
+
+  mutex_.Unlock();
+
+  s = InitializeCompactionWorkspace(allow_resumption, &output_dir,
+                                    &compaction_progress_writer);
+
+  mutex_.Lock();
+
+  if (!s.ok()) {
+    return s;
+  }
 
   std::unordered_set<uint64_t> input_set;
   for (const auto& file_name : input.input_files) {
@@ -901,46 +1345,56 @@ Status DBImplSecondary::CompactWithoutInstallation(
 
   VersionStorageInfo* vstorage = version->storage_info();
 
-  // Use comp_options to reuse some CompactFiles functions
   CompactionOptions comp_options;
   comp_options.compression = kDisableCompressionOption;
   comp_options.output_file_size_limit = MaxFileSizeForLevel(
-      cfd->GetLatestMutableCFOptions(), input.output_level,
-      cfd->ioptions().compaction_style, vstorage->base_level(),
+      mutable_cf_options, input.output_level, cfd->ioptions().compaction_style,
+      vstorage->base_level(),
       cfd->ioptions().level_compaction_dynamic_level_bytes);
 
   std::vector<CompactionInputFiles> input_files;
-  Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+  s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage, comp_options);
   if (!s.ok()) {
     ROCKS_LOG_ERROR(
         immutable_db_options_.info_log,
         "GetCompactionInputsFromFileNumbers() failed - %s.\n DebugString: %s",
-        s.ToString().c_str(), version->DebugString().c_str());
+        s.ToString().c_str(), version->DebugString(/*hex=*/true).c_str());
     return s;
   }
 
+  const int job_id = next_job_id_.fetch_add(1);
+  JobContext job_context(job_id, true /*create_superversion*/);
+  std::vector<SequenceNumber> snapshots = input.snapshots;
+
+  // TODO - snapshot_checker support in Remote Compaction
+  job_context.InitSnapshotContext(/*checker=*/nullptr,
+                                  /*managed_snapshot=*/nullptr,
+                                  kMaxSequenceNumber, std::move(snapshots));
+
+  // TODO - consider serializing the entire Compaction object and using it as
+  // input instead of recreating it in the remote worker
   std::unique_ptr<Compaction> c;
   assert(cfd->compaction_picker());
-  c.reset(cfd->compaction_picker()->CompactFiles(
+  std::optional<SequenceNumber> earliest_snapshot = std::nullopt;
+  // Standalone Range Deletion Optimization is only supported in Universal
+  // Compactions - https://github.com/facebook/rocksdb/pull/13078
+  if (cfd->GetLatestCFOptions().compaction_style ==
+      CompactionStyle::kCompactionStyleUniversal) {
+    earliest_snapshot = !job_context.snapshot_seqs.empty()
+                            ? job_context.snapshot_seqs.front()
+                            : kMaxSequenceNumber;
+  }
+  c.reset(cfd->compaction_picker()->PickCompactionForCompactFiles(
       comp_options, input_files, input.output_level, vstorage,
-      cfd->GetLatestMutableCFOptions(), mutable_db_options_, 0));
+      mutable_cf_options, mutable_db_options_, 0, earliest_snapshot,
+      job_context.snapshot_checker));
   assert(c != nullptr);
-
   c->FinalizeInputInfo(version);
 
-  // Create output directory if it's not existed yet
-  std::unique_ptr<FSDirectory> output_dir;
-  s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir);
-  if (!s.ok()) {
-    return s;
-  }
-
   LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
                        immutable_db_options_.info_log.get());
 
-  const int job_id = next_job_id_.fetch_add(1);
-
   // use primary host's db_id for running the compaction, but db_session_id is
   // using the local one, which is to make sure the unique id is unique from
   // the remote compactors. Because the id is generated from db_id,
@@ -951,17 +1405,19 @@ Status DBImplSecondary::CompactWithoutInstallation(
       job_id, c.get(), immutable_db_options_, mutable_db_options_,
       file_options_for_compaction_, versions_.get(), &shutting_down_,
       &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_,
-      input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_,
+      &job_context, table_cache_, &event_logger_, dbname_, io_tracer_,
       options.canceled ? *options.canceled : kManualCompactionCanceledFalse_,
       input.db_id, db_session_id_, secondary_path_, input, result);
 
-  compaction_job.Prepare();
+  compaction_job.Prepare(compaction_progress_,
+                         compaction_progress_writer.get());
 
   mutex_.Unlock();
   s = compaction_job.Run();
   mutex_.Lock();
 
-  // clean up
+  // These cleanup functions handle metadata and state cleanup only and
+  // not the physical files
   compaction_job.io_status().PermitUncheckedError();
   compaction_job.CleanupCompaction();
   c->ReleaseCompactionFiles(s);
@@ -969,6 +1425,18 @@ Status DBImplSecondary::CompactWithoutInstallation(
 
   TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End",
                            &s);
+
+  if (!compaction_progress_.empty() && s.ok()) {
+    uint64_t total_resumed_bytes =
+        CalculateResumedCompactionBytes(compaction_progress_);
+
+    if (total_resumed_bytes > 0 &&
+        immutable_db_options_.statistics != nullptr) {
+      RecordTick(immutable_db_options_.statistics.get(),
+                 REMOTE_COMPACT_RESUMED_BYTES, total_resumed_bytes);
+    }
+  }
+
   result->status = s;
   return s;
 }
@@ -991,9 +1459,10 @@ Status DB::OpenAndCompact(
   }
 
   // 2. Load the options
-  DBOptions db_options;
+  DBOptions base_db_options;
   ConfigOptions config_options;
   config_options.env = override_options.env;
+  config_options.ignore_unknown_options = true;
   std::vector<ColumnFamilyDescriptor> all_column_families;
 
   TEST_SYNC_POINT_CALLBACK(
@@ -1003,13 +1472,22 @@ Status DB::OpenAndCompact(
   std::string options_file_name =
       OptionsFileName(name, compaction_input.options_file_number);
 
-  s = LoadOptionsFromFile(config_options, options_file_name, &db_options,
+  s = LoadOptionsFromFile(config_options, options_file_name, &base_db_options,
                           &all_column_families);
   if (!s.ok()) {
     return s;
   }
 
-  // 3. Override pointer configurations in DBOptions with
+  // 3. Options to Override
+  // Override serializable configurations from override_options.options_map
+  DBOptions db_options;
+  s = GetDBOptionsFromMap(config_options, base_db_options,
+                          override_options.options_map, &db_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Override options that are directly set as shared ptrs in
   // CompactionServiceOptionsOverride
   db_options.env = override_options.env;
   db_options.file_checksum_gen_factory =
@@ -1020,6 +1498,7 @@ Status DB::OpenAndCompact(
   // We will close the DB after the compaction anyway.
   // Open as many files as needed for the compaction.
   db_options.max_open_files = -1;
+  db_options.info_log = override_options.info_log;
 
   // 4. Filter CFs that are needed for OpenAndCompact()
   // We do not need to open all column families for the remote compaction.
@@ -1029,6 +1508,18 @@ Status DB::OpenAndCompact(
   std::vector<ColumnFamilyDescriptor> column_families;
   for (auto& cf : all_column_families) {
     if (cf.name == compaction_input.cf_name) {
+      ColumnFamilyOptions cf_options;
+      // Override serializable configurations from override_options.options_map
+      s = GetColumnFamilyOptionsFromMap(config_options, cf.options,
+                                        override_options.options_map,
+                                        &cf_options);
+      if (!s.ok()) {
+        return s;
+      }
+      cf.options = std::move(cf_options);
+
+      // Override options that are directly set as shared ptrs in
+      // CompactionServiceOptionsOverride
       cf.options.comparator = override_options.comparator;
       cf.options.merge_operator = override_options.merge_operator;
       cf.options.compaction_filter = override_options.compaction_filter;
@@ -1040,6 +1531,7 @@ Status DB::OpenAndCompact(
           override_options.sst_partitioner_factory;
       cf.options.table_properties_collector_factories =
           override_options.table_properties_collector_factories;
+
       column_families.emplace_back(cf);
     } else if (cf.name == kDefaultColumnFamilyName) {
       column_families.emplace_back(cf);
@@ -1047,7 +1539,7 @@ Status DB::OpenAndCompact(
   }
 
   // 5. Open db As Secondary
-  DB* db;
+  std::unique_ptr<DB> db;
   std::vector<ColumnFamilyHandle*> handles;
   s = DB::OpenAsSecondary(db_options, name, output_directory, column_families,
                           &handles, &db);
@@ -1056,6 +1548,9 @@ Status DB::OpenAndCompact(
   }
   assert(db);
 
+  TEST_SYNC_POINT_CALLBACK(
+      "DBImplSecondary::OpenAndCompact::AfterOpenAsSecondary:0", db.get());
+
   // 6. Find the handle of the Column Family that this will compact
   ColumnFamilyHandle* cfh = nullptr;
   for (auto* handle : handles) {
@@ -1069,7 +1564,8 @@ Status DB::OpenAndCompact(
   // 7. Run the compaction without installation.
   // Output will be stored in the directory specified by output_directory
   CompactionServiceResult compaction_result;
-  DBImplSecondary* db_secondary = static_cast_with_check<DBImplSecondary>(db);
+  DBImplSecondary* db_secondary =
+      static_cast_with_check<DBImplSecondary>(db.get());
   s = db_secondary->CompactWithoutInstallation(options, cfh, compaction_input,
                                                &compaction_result);
 
@@ -1080,7 +1576,7 @@ Status DB::OpenAndCompact(
   for (auto& handle : handles) {
     delete handle;
   }
-  delete db;
+  db.reset();
   if (s.ok()) {
     return serialization_status;
   } else {
@@ -1097,4 +1593,153 @@ Status DB::OpenAndCompact(
                         output, override_options);
 }
 
+Status DBImplSecondary::CreateCompactionProgressWriter(
+    const std::string& file_path,
+    std::unique_ptr<log::Writer>* compaction_progress_writer) {
+  std::unique_ptr<FSWritableFile> file;
+  Status s =
+      fs_->NewWritableFile(file_path, FileOptions(), &file, nullptr /* dbg */);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(file), file_path, FileOptions()));
+
+  compaction_progress_writer->reset(
+      new log::Writer(std::move(file_writer), 0 /* log_number */,
+                      false /* recycle_log_files */));
+
+  return Status::OK();
+}
+
+Status DBImplSecondary::PersistInitialCompactionProgress(
+    log::Writer* compaction_progress_writer,
+    const CompactionProgress& compaction_progress) {
+  assert(compaction_progress_writer);
+
+  // LIMITATION: Only supports resuming single subcompaction
+  assert(compaction_progress.size() == 1);
+  const SubcompactionProgress& subcompaction_progress = compaction_progress[0];
+
+  VersionEdit edit;
+  edit.SetSubcompactionProgress(subcompaction_progress);
+
+  std::string record;
+  if (!edit.EncodeTo(&record)) {
+    return Status::IOError("Failed to encode the initial compaction progress");
+  }
+
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  Status s = compaction_progress_writer->AddRecord(write_options, record);
+  if (!s.ok()) {
+    return s;
+  }
+  IOOptions opts;
+  s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = compaction_progress_writer->file()->Sync(opts,
+                                               immutable_db_options_.use_fsync);
+
+  return s;
+}
+
+Status DBImplSecondary::HandleCompactionProgressWriterCreationFailure(
+    const std::string& temp_file_path, const std::string& final_file_path,
+    std::unique_ptr<log::Writer>* compaction_progress_writer) {
+  compaction_progress_writer->reset();
+
+  const std::vector<std::string> paths_to_delete = {final_file_path,
+                                                    temp_file_path};
+
+  Status s;
+  for (const auto& file_path : paths_to_delete) {
+    WriteOptions write_options(Env::IOActivity::kCompaction);
+    IOOptions opts;
+    s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+    if (s.ok()) {
+      s = fs_->DeleteFile(file_path, opts, nullptr /* dbg */);
+    }
+
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Failed to cleanup the compaction progress file "
+                      "during writer creation failure: %s",
+                      s.ToString().c_str());
+      return s;
+    }
+  }
+
+  return s;
+}
+
+Status DBImplSecondary::FinalizeCompactionProgressWriter(
+    std::unique_ptr<log::Writer>* compaction_progress_writer) {
+  uint64_t timestamp = env_->NowMicros();
+  const std::string temp_file_path =
+      TempCompactionProgressFileName(secondary_path_, timestamp);
+
+  Status s = CreateCompactionProgressWriter(temp_file_path,
+                                            compaction_progress_writer);
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Failed to create compaction progress writer at "
+                   "temp path %s: %s. Will perform clean up "
+                   "to start compaction without progress persistence",
+                   temp_file_path.c_str(), s.ToString().c_str());
+    return HandleCompactionProgressWriterCreationFailure(
+        temp_file_path, "" /* final_file_path */, compaction_progress_writer);
+  }
+
+  if (!compaction_progress_.empty()) {
+    s = PersistInitialCompactionProgress(compaction_progress_writer->get(),
+                                         compaction_progress_);
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Failed to persist the initial copmaction "
+                     "progress: %s. Will perform clean up "
+                     "to start compaction without progress persistence",
+                     s.ToString().c_str());
+      return HandleCompactionProgressWriterCreationFailure(
+          temp_file_path, "" /* final_file_path */, compaction_progress_writer);
+    }
+  }
+
+  compaction_progress_writer->reset();
+
+  std::string final_file_path;
+  s = RenameCompactionProgressFile(temp_file_path, &final_file_path);
+
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Failed to rename temporary compaction progress "
+                   "file from %s to %s: %s.  Will perform clean up "
+                   "to start compaction without progress persistence",
+                   temp_file_path.c_str(), final_file_path.c_str(),
+                   s.ToString().c_str());
+    return HandleCompactionProgressWriterCreationFailure(
+        temp_file_path, final_file_path, compaction_progress_writer);
+  }
+
+  s = CreateCompactionProgressWriter(final_file_path,
+                                     compaction_progress_writer);
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Failed to create the final compaction progress "
+                   "writer: %s. Will attempt clean to start the compaction "
+                   "without progress persistence",
+                   s.ToString().c_str());
+    return HandleCompactionProgressWriterCreationFailure(
+        "" /* temp_file_path */, final_file_path, compaction_progress_writer);
+  }
+
+  ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                  "Finalized compaction progress writer onto %s",
+                  final_file_path.c_str());
+
+  return Status::OK();
+}
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h
index c0d72c67e9f4..583b4081b3bc 100644
--- a/db/db_impl/db_impl_secondary.h
+++ b/db/db_impl/db_impl_secondary.h
@@ -216,9 +216,9 @@ class DBImplSecondary : public DBImpl {
 
   using DBImpl::SetOptions;
   Status SetOptions(
-      ColumnFamilyHandle* /*cfd*/,
-      const std::unordered_map<std::string, std::string>& /*options_map*/)
-      override {
+      const std::unordered_map<ColumnFamilyHandle*,
+                               std::unordered_map<std::string, std::string>>&
+      /*column_families_opts_map*/) override {
     // Currently not supported because changing certain options may cause
     // flush/compaction and/or write to MANIFEST.
     return Status::NotSupported("Not supported operation in secondary mode.");
@@ -248,12 +248,6 @@ class DBImplSecondary : public DBImpl {
   Status MaybeInitLogReader(uint64_t log_number,
                             log::FragmentBufferedReader** log_reader);
 
-  // Check if all live files exist on file system and that their file sizes
-  // matche to the in-memory records. It is possible that some live files may
-  // have been deleted by the primary. In this case, CheckConsistency() does
-  // not flag the missing file as inconsistency.
-  Status CheckConsistency() override;
-
 #ifndef NDEBUG
   Status TEST_CompactWithoutInstallation(const OpenAndCompactOptions& options,
                                          ColumnFamilyHandle* cfh,
@@ -309,6 +303,87 @@ class DBImplSecondary : public DBImpl {
                                     const CompactionServiceInput& input,
                                     CompactionServiceResult* result);
 
+ private:
+  // Holds results of compaction progress files and output files from a single
+  // directory scan
+  struct CompactionProgressFilesScan {
+    // The latest (newest) progress file filename
+    std::optional<std::string> latest_progress_filename;
+    uint64_t latest_progress_timestamp = 0;
+
+    // Older progress file filenames (to be deleted)
+    autovector<std::string> old_progress_filenames;
+
+    // Temporary progress file filenames (to be deleted)
+    autovector<std::string> temp_progress_filenames;
+
+    // All output file numbers - for cleanup optimization
+    std::vector<uint64_t> table_file_numbers;
+
+    bool HasLatestProgressFile() const {
+      return latest_progress_filename.has_value();
+    }
+
+    void Clear() {
+      latest_progress_filename.reset();
+      latest_progress_timestamp = 0;
+      old_progress_filenames.clear();
+      temp_progress_filenames.clear();
+      table_file_numbers.clear();
+    }
+  };
+
+  Status InitializeCompactionWorkspace(
+      bool allow_resumption, std::unique_ptr<FSDirectory>* output_dir,
+      std::unique_ptr<log::Writer>* compaction_progress_writer);
+
+  Status PrepareCompactionProgressState();
+
+  Status ScanCompactionProgressFiles(CompactionProgressFilesScan* scan_result);
+
+  Status DeleteCompactionProgressFiles(
+      const std::vector<std::string>& filenames);
+
+  Status CleanupOldAndTemporaryCompactionProgressFiles(
+      bool preserve_latest, const CompactionProgressFilesScan& scan_result);
+
+  Status LoadCompactionProgressAndCleanupExtraOutputFiles(
+      const std::string& compaction_progress_file_path,
+      const CompactionProgressFilesScan& scan_result);
+
+  Status ParseCompactionProgressFile(
+      const std::string& compaction_progress_file_path,
+      CompactionProgress* compaction_progress);
+
+  Status HandleInvalidOrNoCompactionProgress(
+      const std::optional<std::string>& compaction_progress_file_path,
+      const CompactionProgressFilesScan& scan_result);
+
+  Status CleanupPhysicalCompactionOutputFiles(
+      bool preserve_tracked_files,
+      const CompactionProgressFilesScan& scan_result);
+
+  Status FinalizeCompactionProgressWriter(
+      std::unique_ptr<log::Writer>* compaction_progress_writer);
+
+  Status CreateCompactionProgressWriter(
+      const std::string& file_path,
+      std::unique_ptr<log::Writer>* compaction_progress_writer);
+
+  Status PersistInitialCompactionProgress(
+      log::Writer* compaction_progress_writer,
+      const CompactionProgress& compaction_progress);
+
+  Status RenameCompactionProgressFile(const std::string& temp_file_path,
+                                      std::string* final_file_path);
+
+  Status HandleCompactionProgressWriterCreationFailure(
+      const std::string& temp_file_path, const std::string& final_file_path,
+      std::unique_ptr<log::Writer>* compaction_progress_writer);
+
+  uint64_t CalculateResumedCompactionBytes(
+      const CompactionProgress& compaction_progress) const;
+
   // Cache log readers for each log number, used for continue WAL replay
   // after recovery
   std::map<uint64_t, std::unique_ptr<LogReaderContainer>> log_readers_;
@@ -317,6 +392,8 @@ class DBImplSecondary : public DBImpl {
   std::unordered_map<ColumnFamilyData*, uint64_t> cfd_to_current_log_;
 
   const std::string secondary_path_;
+
+  CompactionProgress compaction_progress_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index 7051c970aad7..8a4c5ec9be6c 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -157,7 +157,7 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
   if (s.ok()) {
     s = WriteImpl(write_options, my_batch, /*callback=*/nullptr,
                   /*user_write_cb=*/nullptr,
-                  /*log_used=*/nullptr);
+                  /*wal_used=*/nullptr);
   }
   return s;
 }
@@ -190,11 +190,38 @@ Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
   return s;
 }
 
-Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
-                          const WBWIMemTable::SeqnoRange& assigned_seqno,
-                          uint64_t prep_log,
-                          SequenceNumber last_seqno_after_ingest,
-                          bool memtable_updated, bool ignore_missing_cf) {
+Status DBImpl::IngestWriteBatchWithIndex(
+    const WriteOptions& write_options,
+    std::shared_ptr<WriteBatchWithIndex> wbwi) {
+  if (!wbwi) {
+    return Status::InvalidArgument("Batch is nullptr!");
+  }
+  if (!write_options.disableWAL) {
+    return Status::NotSupported(
+        "IngestWriteBatchWithIndex does not support disableWAL=true");
+  }
+  Status s;
+  if (write_options.protection_bytes_per_key > 0) {
+    s = WriteBatchInternal::UpdateProtectionInfo(
+        wbwi->GetWriteBatch(), write_options.protection_bytes_per_key);
+  }
+  if (s.ok()) {
+    WriteBatch dummy_empty_batch;
+    s = WriteImpl(
+        write_options, /*updates=*/&dummy_empty_batch, /*callback=*/nullptr,
+        /*user_write_cb=*/nullptr, /*log_used=*/nullptr, /*log_ref=*/0,
+        /*disable_memtable=*/false, /*seq_used=*/nullptr,
+        /*batch_cnt=*/0, /*pre_release_callback=*/nullptr,
+        /*post_memtable_callback=*/nullptr, /*wbwi=*/wbwi);
+  }
+  return s;
+}
+
+Status DBImpl::IngestWBWIAsMemtable(
+    std::shared_ptr<WriteBatchWithIndex> wbwi,
+    const WBWIMemTable::SeqnoRange& assigned_seqno, uint64_t min_prep_log,
+    SequenceNumber last_seqno_after_ingest, bool memtable_updated,
+    bool ignore_missing_cf) {
   // Keys in new memtable have seqno > last_seqno_after_ingest >= keys in wbwi.
   assert(assigned_seqno.upper_bound <= last_seqno_after_ingest);
   // Keys in the current memtable have seqno <= LastSequence() < keys in wbwi.
@@ -238,12 +265,30 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
     wbwi_memtable->AssignSequenceNumbers(assigned_seqno);
     // This is needed to keep the WAL that contains Prepare alive until
     // committed data in this memtable is persisted.
-    wbwi_memtable->SetMinPrepLog(prep_log);
+    wbwi_memtable->SetMinPrepLog(min_prep_log);
     memtables.push_back(wbwi_memtable);
     cfd->Ref();
     cfds.push_back(cfd);
   }
 
+  autovector<ColumnFamilyData*> cfds_for_atomic_flush;
+  if (immutable_db_options_.atomic_flush) {
+    SelectColumnFamiliesForAtomicFlush(&cfds_for_atomic_flush);
+    for (auto cfd : cfds_for_atomic_flush) {
+      bool found = false;
+      for (auto existing_cfd : cfds) {
+        if (existing_cfd == cfd) {
+          found = true;
+          break;
+        }
+      }
+      if (!found) {
+        cfd->Ref();
+        cfds.push_back(cfd);
+      }
+    }
+  }
+
   // Stop writes to the DB by entering both write threads
   WriteThread::Writer nonmem_w;
   if (two_write_queues_) {
@@ -253,15 +298,16 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
 
   // Switch memtable and add WBWIMemTables
   Status s;
-  for (size_t i = 0; i < memtables.size(); ++i) {
-    assert(!immutable_db_options_.atomic_flush);
-    // NOTE: to support atomic flush, need to call
-    // SelectColumnFamiliesForAtomicFlush()
+  for (size_t i = 0; i < cfds.size(); ++i) {
     WriteContext write_context;
     // TODO: not switch on empty memtable, may need to update metadata
     //   like NextLogNumber(), earliest_seqno and memtable id.
-    s = SwitchMemtable(cfds[i], &write_context, memtables[i],
-                       last_seqno_after_ingest);
+    if (i < memtables.size()) {
+      s = SwitchMemtable(cfds[i], &write_context, memtables[i],
+                         last_seqno_after_ingest);
+    } else {
+      s = SwitchMemtable(cfds[i], &write_context);
+    }
     if (!s.ok()) {
       // SwitchMemtable() can only fail if a new WAL is to be created, this
       // should only happen for the first call to SwitchMemtable(). log will
@@ -301,9 +347,18 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
         continue;
       }
       cfd->imm()->FlushRequested();
+      if (!immutable_db_options_.atomic_flush) {
+        FlushRequest flush_req;
+        // TODO: a new flush reason for ingesting memtable
+        GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion,
+                             &flush_req);
+        EnqueuePendingFlush(flush_req);
+      }
+    }
+    if (immutable_db_options_.atomic_flush) {
+      AssignAtomicFlushSeq(cfds);
       FlushRequest flush_req;
-      // TODO: a new flush reason for ingesting memtable
-      GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion,
+      GenerateFlushRequest(cfds, FlushReason::kExternalFileIngestion,
                            &flush_req);
       EnqueuePendingFlush(flush_req);
     }
@@ -314,13 +369,12 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
 
 Status DBImpl::WriteImpl(const WriteOptions& write_options,
                          WriteBatch* my_batch, WriteCallback* callback,
-                         UserWriteCallback* user_write_cb, uint64_t* log_used,
+                         UserWriteCallback* user_write_cb, uint64_t* wal_used,
                          uint64_t log_ref, bool disable_memtable,
                          uint64_t* seq_used, size_t batch_cnt,
                          PreReleaseCallback* pre_release_callback,
                          PostMemTableCallback* post_memtable_callback,
-                         std::shared_ptr<WriteBatchWithIndex> wbwi,
-                         uint64_t prep_log) {
+                         std::shared_ptr<WriteBatchWithIndex> wbwi) {
   assert(!seq_per_batch_ || batch_cnt != 0);
   assert(my_batch == nullptr || my_batch->Count() == 0 ||
          write_options.protection_bytes_per_key == 0 ||
@@ -409,9 +463,17 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     return Status::NotSupported(
         "DeleteRange is not compatible with row cache.");
   }
+  // Whether the WBWI is from transaction commit or a direct write
+  // (IngestWriteBatchWithIndex())
+  bool ingest_wbwi_for_commit = false;
   if (wbwi) {
-    assert(prep_log > 0);
-    // Used only in WriteCommittedTxn::CommitInternal() with no `callback`.
+    if (my_batch->HasCommit()) {
+      ingest_wbwi_for_commit = true;
+      assert(log_ref);
+    } else {
+      // Only supports disableWAL for directly ingesting WBWI for now.
+      assert(write_options.disableWAL);
+    }
     assert(!callback);
     if (immutable_db_options_.unordered_write) {
       return Status::NotSupported(
@@ -421,9 +483,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
       return Status::NotSupported(
           "Ingesting WriteBatch does not support pipelined_write");
     }
-    if (immutable_db_options_.atomic_flush) {
+    if (!wbwi->GetOverwriteKey()) {
       return Status::NotSupported(
-          "Ingesting WriteBatch does not support atomic_flush");
+          "WriteBatchWithIndex ingestion requires overwrite_key=true");
     }
   }
   // Otherwise IsLatestPersistentState optimization does not make sense
@@ -444,7 +506,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // they don't consume sequence.
     return WriteImplWALOnly(
         &nonmem_write_thread_, write_options, my_batch, callback, user_write_cb,
-        log_used, log_ref, seq_used, batch_cnt, pre_release_callback,
+        wal_used, log_ref, seq_used, batch_cnt, pre_release_callback,
         assign_order, kDontPublishLastSeq, disable_memtable);
   }
 
@@ -458,7 +520,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // sequence in in increasing order, iii) call pre_release_callback serially
     Status status = WriteImplWALOnly(
         &write_thread_, write_options, my_batch, callback, user_write_cb,
-        log_used, log_ref, &seq, sub_batch_cnt, pre_release_callback,
+        wal_used, log_ref, &seq, sub_batch_cnt, pre_release_callback,
         kDoAssignOrder, kDoPublishLastSeq, disable_memtable);
     TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
     if (!status.ok()) {
@@ -477,7 +539,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
 
   if (immutable_db_options_.enable_pipelined_write) {
     return PipelinedWriteImpl(write_options, my_batch, callback, user_write_cb,
-                              log_used, log_ref, disable_memtable, seq_used);
+                              wal_used, log_ref, disable_memtable, seq_used);
   }
 
   PERF_TIMER_GUARD(write_pre_and_post_process_time);
@@ -524,16 +586,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
           assert(tmp_s.ok());
         }
       }
-      versions_->SetLastSequence(last_sequence);
-      MemTableInsertStatusCheck(w.status);
+      if (w.status.ok()) {  // Don't publish a partial batch write
+        versions_->SetLastSequence(last_sequence);
+      } else {
+        HandleMemTableInsertFailure(w.status);
+      }
       write_thread_.ExitAsBatchGroupFollower(&w);
     }
     assert(w.state == WriteThread::STATE_COMPLETED);
     // STATE_COMPLETED conditional below handles exit
   }
   if (w.state == WriteThread::STATE_COMPLETED) {
-    if (log_used != nullptr) {
-      *log_used = w.log_used;
+    if (wal_used != nullptr) {
+      *wal_used = w.wal_used;
     }
     if (seq_used != nullptr) {
       *seq_used = w.sequence;
@@ -549,7 +614,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   // when it finds suitable, and finish them in the same write batch.
   // This is how a write job could be done by the other writer.
   WriteContext write_context;
-  LogContext log_context(write_options.sync);
+  // FIXME: also check disableWAL like others?
+  WalContext wal_context(write_options.sync);
   WriteThread::WriteGroup write_group;
   bool in_parallel_group = false;
   uint64_t last_sequence = kMaxSequenceNumber;
@@ -563,7 +629,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // PreprocessWrite does its own perf timing.
     PERF_TIMER_STOP(write_pre_and_post_process_time);
 
-    status = PreprocessWrite(write_options, &log_context, &write_context);
+    status = PreprocessWrite(write_options, &wal_context, &write_context);
     if (!two_write_queues_) {
       // Assign it after ::PreprocessWrite since the sequence might advance
       // inside it by WriteRecoverableState
@@ -631,7 +697,13 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
             continue;
           }
           // TODO: maybe handle the tracing status?
-          tracer_->Write(writer->batch).PermitUncheckedError();
+          if (wbwi && !ingest_wbwi_for_commit) {
+            // for transaction write, tracer only needs the commit marker which
+            // is in writer->batch
+            tracer_->Write(wbwi->GetWriteBatch()).PermitUncheckedError();
+          } else {
+            tracer_->Write(writer->batch).PermitUncheckedError();
+          }
         }
       }
     }
@@ -689,22 +761,21 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
 
     if (!two_write_queues_) {
       if (status.ok() && !write_options.disableWAL) {
-        assert(log_context.log_file_number_size);
-        LogFileNumberSize& log_file_number_size =
-            *(log_context.log_file_number_size);
+        assert(wal_context.wal_file_number_size);
+        wal_context.prev_size = wal_context.writer->file()->GetFileSize();
         PERF_TIMER_GUARD(write_wal_time);
-        io_s =
-            WriteToWAL(write_group, log_context.writer, log_used,
-                       log_context.need_log_sync, log_context.need_log_dir_sync,
-                       last_sequence + 1, log_file_number_size);
+        io_s = WriteGroupToWAL(write_group, wal_context.writer, wal_used,
+                               wal_context.need_wal_sync,
+                               wal_context.need_wal_dir_sync, last_sequence + 1,
+                               *wal_context.wal_file_number_size);
       }
     } else {
       if (status.ok() && !write_options.disableWAL) {
         PERF_TIMER_GUARD(write_wal_time);
         // LastAllocatedSequence is increased inside WriteToWAL under
         // wal_write_mutex_ to ensure ordered events in WAL
-        io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
-                                    seq_inc);
+        io_s = ConcurrentWriteGroupToWAL(write_group, wal_used, &last_sequence,
+                                         seq_inc);
       } else {
         // Otherwise we inc seq number for memtable writes
         last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
@@ -716,16 +787,16 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     last_sequence += seq_inc;
     // Seqno assigned to this write are [current_sequence, last_sequence]
 
-    if (log_context.need_log_sync) {
+    if (wal_context.need_wal_sync) {
       VersionEdit synced_wals;
-      log_write_mutex_.Lock();
+      wal_write_mutex_.Lock();
       if (status.ok()) {
-        MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+        MarkLogsSynced(cur_wal_number_, wal_context.need_wal_dir_sync,
                        &synced_wals);
       } else {
-        MarkLogsNotSynced(logfile_number_);
+        MarkLogsNotSynced(cur_wal_number_);
       }
-      log_write_mutex_.Unlock();
+      wal_write_mutex_.Unlock();
       if (status.ok() && synced_wals.IsWalAddition()) {
         InstrumentedMutexLock l(&mutex_);
         // TODO: plumb Env::IOActivity, Env::IOPriority
@@ -760,7 +831,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
         writer->sequence = next_sequence;
         if (writer->pre_release_callback) {
           Status ws = writer->pre_release_callback->Callback(
-              writer->sequence, disable_memtable, writer->log_used, index++,
+              writer->sequence, disable_memtable, writer->wal_used, index++,
               pre_release_callback_cnt);
           if (!ws.ok()) {
             status = pre_release_cb_status = ws;
@@ -785,8 +856,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
             write_group, current_sequence, column_family_memtables_.get(),
             &flush_scheduler_, &trim_history_scheduler_,
             write_options.ignore_missing_column_families,
-            0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
-            batch_per_txn_);
+            0 /*recovery_log_number*/, this, seq_per_batch_, batch_per_txn_);
       } else {
         write_group.last_sequence = last_sequence;
         write_thread_.LaunchParallelMemTableWriters(&write_group);
@@ -834,12 +904,13 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // handle exit, false means somebody else did
     should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
   }
-  if (wbwi) {
-    if (status.ok() && w.status.ok()) {
+  if (wbwi && status.ok() && w.status.ok()) {
+    uint32_t wbwi_count = wbwi->GetWriteBatch()->Count();
+    // skip empty batch case
+    if (wbwi_count) {
       // w.batch contains (potentially empty) commit time batch updates,
       // only ingest wbwi if w.batch is applied to memtable successfully
       uint32_t memtable_update_count = w.batch->Count();
-      uint32_t wbwi_count = wbwi->GetWriteBatch()->Count();
       // Seqno assigned to this write are [last_seq + 1 - seq_inc, last_seq].
       // seq_inc includes w.batch (memtable updates) and wbwi
       // w.batch gets first `memtable_update_count` sequence numbers.
@@ -852,10 +923,12 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
       if (two_write_queues_) {
         assert(ub <= versions_->LastAllocatedSequence());
       }
-      status = IngestWBWI(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub},
-                          prep_log, last_sequence,
-                          /*memtable_updated=*/memtable_update_count > 0,
-                          write_options.ignore_missing_column_families);
+      status =
+          IngestWBWIAsMemtable(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub},
+                               /*min_prep_log=*/log_ref, last_sequence,
+                               /*memtable_updated=*/memtable_update_count > 0,
+                               write_options.ignore_missing_column_families);
+      RecordTick(stats_, NUMBER_WBWI_INGEST);
     }
   }
 
@@ -873,9 +946,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
       }
       // Note: if we are to resume after non-OK statuses we need to revisit how
       // we react to non-OK statuses here.
-      versions_->SetLastSequence(last_sequence);
+      if (w.status.ok()) {  // Don't publish a partial batch write
+        versions_->SetLastSequence(last_sequence);
+      }
+    }
+    if (!w.status.ok()) {
+      if (wal_context.prev_size < SIZE_MAX) {
+        InstrumentedMutexLock l(&wal_write_mutex_);
+        if (logs_.back().number == wal_context.wal_file_number_size->number) {
+          logs_.back().SetAttemptTruncateSize(wal_context.prev_size);
+        }
+      }
+      HandleMemTableInsertFailure(w.status);
     }
-    MemTableInsertStatusCheck(w.status);
     write_thread_.ExitAsBatchGroupLeader(write_group, status);
   }
 
@@ -888,7 +971,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
 Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
                                   WriteBatch* my_batch, WriteCallback* callback,
                                   UserWriteCallback* user_write_cb,
-                                  uint64_t* log_used, uint64_t log_ref,
+                                  uint64_t* wal_used, uint64_t log_ref,
                                   bool disable_memtable, uint64_t* seq_used) {
   PERF_TIMER_GUARD(write_pre_and_post_process_time);
   StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
@@ -905,10 +988,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
     if (w.callback && !w.callback->AllowWriteBatching()) {
       write_thread_.WaitForMemTableWriters();
     }
-    LogContext log_context(!write_options.disableWAL && write_options.sync);
+    WalContext wal_context(!write_options.disableWAL && write_options.sync);
     // PreprocessWrite does its own perf timing.
     PERF_TIMER_STOP(write_pre_and_post_process_time);
-    w.status = PreprocessWrite(write_options, &log_context, &write_context);
+    w.status = PreprocessWrite(write_options, &wal_context, &write_context);
     PERF_TIMER_START(write_pre_and_post_process_time);
 
     // This can set non-OK status if callback fail.
@@ -977,13 +1060,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
                           wal_write_group.size - 1);
         RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
       }
-      assert(log_context.log_file_number_size);
-      LogFileNumberSize& log_file_number_size =
-          *(log_context.log_file_number_size);
-      io_s =
-          WriteToWAL(wal_write_group, log_context.writer, log_used,
-                     log_context.need_log_sync, log_context.need_log_dir_sync,
-                     current_sequence, log_file_number_size);
+      assert(wal_context.wal_file_number_size);
+      WalFileNumberSize& wal_file_number_size =
+          *(wal_context.wal_file_number_size);
+      io_s = WriteGroupToWAL(wal_write_group, wal_context.writer, wal_used,
+                             wal_context.need_wal_sync,
+                             wal_context.need_wal_dir_sync, current_sequence,
+                             wal_file_number_size);
       w.status = io_s;
     }
 
@@ -995,13 +1078,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
     }
 
     VersionEdit synced_wals;
-    if (log_context.need_log_sync) {
-      InstrumentedMutexLock l(&log_write_mutex_);
+    if (wal_context.need_wal_sync) {
+      InstrumentedMutexLock l(&wal_write_mutex_);
       if (w.status.ok()) {
-        MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+        MarkLogsSynced(cur_wal_number_, wal_context.need_wal_dir_sync,
                        &synced_wals);
       } else {
-        MarkLogsNotSynced(logfile_number_);
+        MarkLogsNotSynced(cur_wal_number_);
       }
     }
     if (w.status.ok() && synced_wals.IsWalAddition()) {
@@ -1031,8 +1114,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
           memtable_write_group, w.sequence, column_family_memtables_.get(),
           &flush_scheduler_, &trim_history_scheduler_,
           write_options.ignore_missing_column_families, 0 /*log_number*/, this,
-          false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_);
-      versions_->SetLastSequence(memtable_write_group.last_sequence);
+          seq_per_batch_, batch_per_txn_);
+      if (memtable_write_group.status
+              .ok()) {  // Don't publish a partial batch write
+        versions_->SetLastSequence(memtable_write_group.last_sequence);
+      } else {
+        HandleMemTableInsertFailure(memtable_write_group.status);
+      }
       write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
     }
   } else {
@@ -1061,8 +1149,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
     PERF_TIMER_START(write_pre_and_post_process_time);
 
     if (write_thread_.CompleteParallelMemTableWriter(&w)) {
-      MemTableInsertStatusCheck(w.status);
-      versions_->SetLastSequence(w.write_group->last_sequence);
+      if (w.status.ok()) {  // Don't publish a partial batch write
+        versions_->SetLastSequence(w.write_group->last_sequence);
+      } else {
+        HandleMemTableInsertFailure(w.status);
+      }
       write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
     }
   }
@@ -1134,7 +1225,7 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
 Status DBImpl::WriteImplWALOnly(
     WriteThread* write_thread, const WriteOptions& write_options,
     WriteBatch* my_batch, WriteCallback* callback,
-    UserWriteCallback* user_write_cb, uint64_t* log_used,
+    UserWriteCallback* user_write_cb, uint64_t* wal_used,
     const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
     PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
     const PublishLastSeq publish_last_seq, const bool disable_memtable) {
@@ -1147,8 +1238,8 @@ Status DBImpl::WriteImplWALOnly(
   write_thread->JoinBatchGroup(&w);
   assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER);
   if (w.state == WriteThread::STATE_COMPLETED) {
-    if (log_used != nullptr) {
-      *log_used = w.log_used;
+    if (wal_used != nullptr) {
+      *wal_used = w.wal_used;
     }
     if (seq_used != nullptr) {
       *seq_used = w.sequence;
@@ -1164,10 +1255,10 @@ Status DBImpl::WriteImplWALOnly(
 
     // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
     // without paying the cost of obtaining the mutex.
-    LogContext log_context;
+    WalContext wal_context;
     WriteContext write_context;
     Status status =
-        PreprocessWrite(write_options, &log_context, &write_context);
+        PreprocessWrite(write_options, &wal_context, &write_context);
     WriteStatusCheckOnLocked(status);
 
     if (!status.ok()) {
@@ -1264,8 +1355,8 @@ Status DBImpl::WriteImplWALOnly(
   }
   Status status;
   if (!write_options.disableWAL) {
-    IOStatus io_s =
-        ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+    IOStatus io_s = ConcurrentWriteGroupToWAL(write_group, wal_used,
+                                              &last_sequence, seq_inc);
     status = io_s;
     // last_sequence may not be set if there is an error
     // This error checking and return is moved up to avoid using uninitialized
@@ -1317,7 +1408,7 @@ Status DBImpl::WriteImplWALOnly(
       if (!writer->CallbackFailed() && writer->pre_release_callback) {
         assert(writer->sequence != kMaxSequenceNumber);
         Status ws = writer->pre_release_callback->Callback(
-            writer->sequence, disable_memtable, writer->log_used, index++,
+            writer->sequence, disable_memtable, writer->wal_used, index++,
             pre_release_callback_cnt);
         if (!ws.ok()) {
           status = ws;
@@ -1386,24 +1477,22 @@ void DBImpl::WALIOStatusCheck(const IOStatus& io_status) {
   }
 }
 
-void DBImpl::MemTableInsertStatusCheck(const Status& status) {
-  // A non-OK status here indicates that the state implied by the
-  // WAL has diverged from the in-memory state.  This could be
-  // because of a corrupt write_batch (very bad), or because the
-  // client specified an invalid column family and didn't specify
-  // ignore_missing_column_families.
-  if (!status.ok()) {
-    mutex_.Lock();
-    assert(!error_handler_.IsBGWorkStopped());
-    error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
-    mutex_.Unlock();
-  }
+void DBImpl::HandleMemTableInsertFailure(const Status& status) {
+  assert(!status.ok());
+  // A non-OK status on memtable insert indicates that the state implied by the
+  // WAL has diverged from the in-memory state.  This could be because of a
+  // corrupt write_batch (very bad), or because the client specified an invalid
+  // column family and didn't specify ignore_missing_column_families.
+  mutex_.Lock();
+  assert(!error_handler_.IsBGWorkStopped());
+  error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
+  mutex_.Unlock();
 }
 
 Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
-                               LogContext* log_context,
+                               WalContext* wal_context,
                                WriteContext* write_context) {
-  assert(write_context != nullptr && log_context != nullptr);
+  assert(write_context != nullptr && wal_context != nullptr);
   Status status;
 
   if (error_handler_.IsDBStopped()) {
@@ -1413,7 +1502,8 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
 
   PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
 
-  if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
+  if (UNLIKELY(status.ok() &&
+               wals_total_size_.LoadRelaxed() > GetMaxTotalWalSize())) {
     assert(versions_);
     InstrumentedMutexLock l(&mutex_);
     const ColumnFamilySet* const column_families =
@@ -1482,17 +1572,17 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
       WriteBufferManagerStallWrites();
     }
   }
-  InstrumentedMutexLock l(&log_write_mutex_);
-  if (status.ok() && log_context->need_log_sync) {
+  InstrumentedMutexLock l(&wal_write_mutex_);
+  if (status.ok() && wal_context->need_wal_sync) {
     // Wait until the parallel syncs are finished. Any sync process has to sync
     // the front log too so it is enough to check the status of front()
-    // We do a while loop since log_sync_cv_ is signalled when any sync is
+    // We do a while loop since wal_sync_cv_ is signalled when any sync is
     // finished
     // Note: there does not seem to be a reason to wait for parallel sync at
     // this early step but it is not important since parallel sync (SyncWAL) and
-    // need_log_sync are usually not used together.
+    // need_wal_sync are usually not used together.
     while (logs_.front().IsSyncing()) {
-      log_sync_cv_.Wait();
+      wal_sync_cv_.Wait();
     }
     for (auto& log : logs_) {
       // This is just to prevent the logs to be synced by a parallel SyncWAL
@@ -1503,12 +1593,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
       log.PrepareForSync();
     }
   } else {
-    log_context->need_log_sync = false;
+    wal_context->need_wal_sync = false;
   }
-  log_context->writer = logs_.back().writer;
-  log_context->need_log_dir_sync =
-      log_context->need_log_dir_sync && !log_dir_synced_;
-  log_context->log_file_number_size = std::addressof(alive_log_files_.back());
+  wal_context->writer = logs_.back().writer;
+  wal_context->need_wal_dir_sync =
+      wal_context->need_wal_dir_sync && !wal_dir_synced_;
+  wal_context->wal_file_number_size = std::addressof(alive_wal_files_.back());
 
   return status;
 }
@@ -1559,12 +1649,12 @@ Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
 }
 
 // When two_write_queues_ is disabled, this function is called from the only
-// write thread. Otherwise this must be called holding log_write_mutex_.
+// write thread. Otherwise this must be called holding wal_write_mutex_.
 IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
                             const WriteOptions& write_options,
-                            log::Writer* log_writer, uint64_t* log_used,
+                            log::Writer* log_writer, uint64_t* wal_used,
                             uint64_t* log_size,
-                            LogFileNumberSize& log_file_number_size,
+                            WalFileNumberSize& wal_file_number_size,
                             SequenceNumber sequence) {
   assert(log_size != nullptr);
 
@@ -1576,7 +1666,7 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
   }
   *log_size = log_entry.size();
   // When two_write_queues_ WriteToWAL has to be protected from concurretn calls
-  // from the two queues anyway and log_write_mutex_ is already held. Otherwise
+  // from the two queues anyway and wal_write_mutex_ is already held. Otherwise
   // if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord
   // from possible concurrent calls via the FlushWAL by the application.
   const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
@@ -1584,7 +1674,7 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
   // manual_wal_flush_ feature (by UNLIKELY) instead of the more common case
   // when we do not need any locking.
   if (UNLIKELY(needs_locking)) {
-    log_write_mutex_.Lock();
+    wal_write_mutex_.Lock();
   }
   IOStatus io_s = log_writer->MaybeAddUserDefinedTimestampSizeRecord(
       write_options, versions_->GetColumnFamiliesTimestampSizeForRecord());
@@ -1594,23 +1684,24 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
   io_s = log_writer->AddRecord(write_options, log_entry, sequence);
 
   if (UNLIKELY(needs_locking)) {
-    log_write_mutex_.Unlock();
+    wal_write_mutex_.Unlock();
   }
-  if (log_used != nullptr) {
-    *log_used = logfile_number_;
+  if (wal_used != nullptr) {
+    *wal_used = cur_wal_number_;
+    assert(*wal_used == wal_file_number_size.number);
   }
-  total_log_size_ += log_entry.size();
-  log_file_number_size.AddSize(*log_size);
-  log_empty_ = false;
+  wals_total_size_.FetchAddRelaxed(log_entry.size());
+  wal_file_number_size.AddSize(*log_size);
+  wal_empty_ = false;
 
   return io_s;
 }
 
-IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
-                            log::Writer* log_writer, uint64_t* log_used,
-                            bool need_log_sync, bool need_log_dir_sync,
-                            SequenceNumber sequence,
-                            LogFileNumberSize& log_file_number_size) {
+IOStatus DBImpl::WriteGroupToWAL(const WriteThread::WriteGroup& write_group,
+                                 log::Writer* log_writer, uint64_t* wal_used,
+                                 bool need_wal_sync, bool need_wal_dir_sync,
+                                 SequenceNumber sequence,
+                                 WalFileNumberSize& wal_file_number_size) {
   IOStatus io_s;
   assert(!two_write_queues_);
   assert(!write_group.leader->disable_wal);
@@ -1625,10 +1716,10 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
   }
 
   if (merged_batch == write_group.leader->batch) {
-    write_group.leader->log_used = logfile_number_;
+    write_group.leader->wal_used = cur_wal_number_;
   } else if (write_with_wal > 1) {
     for (auto writer : write_group) {
-      writer->log_used = logfile_number_;
+      writer->wal_used = cur_wal_number_;
     }
   }
 
@@ -1640,14 +1731,14 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
   WriteOptions write_options;
   write_options.rate_limiter_priority =
       write_group.leader->rate_limiter_priority;
-  io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used,
-                    &log_size, log_file_number_size, sequence);
+  io_s = WriteToWAL(*merged_batch, write_options, log_writer, wal_used,
+                    &log_size, wal_file_number_size, sequence);
   if (to_be_cached_state) {
     cached_recoverable_state_ = *to_be_cached_state;
     cached_recoverable_state_empty_ = false;
   }
 
-  if (io_s.ok() && need_log_sync) {
+  if (io_s.ok() && need_wal_sync) {
     StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS);
     // It's safe to access logs_ with unlocked mutex_ here because:
     //  - we've set getting_synced=true for all logs,
@@ -1657,15 +1748,15 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
     //  - as long as other threads don't modify it, it's safe to read
     //    from std::deque from multiple threads concurrently.
     //
-    // Sync operation should work with locked log_write_mutex_, because:
+    // Sync operation should work with locked wal_write_mutex_, because:
     //   when DBOptions.manual_wal_flush_ is set,
     //   FlushWAL function will be invoked by another thread.
-    //   if without locked log_write_mutex_, the log file may get data
+    //   if without locked wal_write_mutex_, the log file may get data
     //   corruption
 
     const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
     if (UNLIKELY(needs_locking)) {
-      log_write_mutex_.Lock();
+      wal_write_mutex_.Lock();
     }
 
     if (io_s.ok()) {
@@ -1688,10 +1779,10 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
     }
 
     if (UNLIKELY(needs_locking)) {
-      log_write_mutex_.Unlock();
+      wal_write_mutex_.Unlock();
     }
 
-    if (io_s.ok() && need_log_dir_sync) {
+    if (io_s.ok() && need_wal_dir_sync) {
       // We only sync WAL directory the first time WAL syncing is
       // requested, so that in case users never turn on WAL sync,
       // we can avoid the disk I/O in the write code path.
@@ -1706,7 +1797,7 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
   }
   if (io_s.ok()) {
     auto stats = default_cf_internal_stats_;
-    if (need_log_sync) {
+    if (need_wal_sync) {
       stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
       RecordTick(stats_, WAL_FILE_SYNCED);
     }
@@ -1723,8 +1814,8 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
   return io_s;
 }
 
-IOStatus DBImpl::ConcurrentWriteToWAL(
-    const WriteThread::WriteGroup& write_group, uint64_t* log_used,
+IOStatus DBImpl::ConcurrentWriteGroupToWAL(
+    const WriteThread::WriteGroup& write_group, uint64_t* wal_used,
     SequenceNumber* last_sequence, size_t seq_inc) {
   IOStatus io_s;
 
@@ -1741,14 +1832,14 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
     return io_s;
   }
 
-  // We need to lock log_write_mutex_ since logs_ and alive_log_files might be
+  // We need to lock wal_write_mutex_ since logs_ and alive_wal_files might be
   // pushed back concurrently
-  log_write_mutex_.Lock();
+  wal_write_mutex_.Lock();
   if (merged_batch == write_group.leader->batch) {
-    write_group.leader->log_used = logfile_number_;
+    write_group.leader->wal_used = cur_wal_number_;
   } else if (write_with_wal > 1) {
     for (auto writer : write_group) {
-      writer->log_used = logfile_number_;
+      writer->wal_used = cur_wal_number_;
     }
   }
   *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
@@ -1756,9 +1847,9 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
   WriteBatchInternal::SetSequence(merged_batch, sequence);
 
   log::Writer* log_writer = logs_.back().writer;
-  LogFileNumberSize& log_file_number_size = alive_log_files_.back();
+  WalFileNumberSize& wal_file_number_size = alive_wal_files_.back();
 
-  assert(log_writer->get_log_number() == log_file_number_size.number);
+  assert(log_writer->get_log_number() == wal_file_number_size.number);
 
   uint64_t log_size;
 
@@ -1766,13 +1857,13 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
   WriteOptions write_options;
   write_options.rate_limiter_priority =
       write_group.leader->rate_limiter_priority;
-  io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used,
-                    &log_size, log_file_number_size, sequence);
+  io_s = WriteToWAL(*merged_batch, write_options, log_writer, wal_used,
+                    &log_size, wal_file_number_size, sequence);
   if (to_be_cached_state) {
     cached_recoverable_state_ = *to_be_cached_state;
     cached_recoverable_state_empty_ = false;
   }
-  log_write_mutex_.Unlock();
+  wal_write_mutex_.Unlock();
 
   if (io_s.ok()) {
     const bool concurrent = true;
@@ -1800,7 +1891,7 @@ Status DBImpl::WriteRecoverableState() {
     bool dont_care_bool;
     SequenceNumber next_seq;
     if (two_write_queues_) {
-      log_write_mutex_.Lock();
+      wal_write_mutex_.Lock();
     }
     SequenceNumber seq;
     if (two_write_queues_) {
@@ -1815,13 +1906,17 @@ Status DBImpl::WriteRecoverableState() {
         0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */,
         &next_seq, &dont_care_bool, seq_per_batch_);
     auto last_seq = next_seq - 1;
-    if (two_write_queues_) {
-      versions_->FetchAddLastAllocatedSequence(last_seq - seq);
-      versions_->SetLastPublishedSequence(last_seq);
+    if (status.ok()) {  // Don't publish a partial batch write
+      if (two_write_queues_) {
+        versions_->FetchAddLastAllocatedSequence(last_seq - seq);
+        versions_->SetLastPublishedSequence(last_seq);
+      }
+      versions_->SetLastSequence(last_seq);
+    } else {
+      HandleMemTableInsertFailure(status);
     }
-    versions_->SetLastSequence(last_seq);
     if (two_write_queues_) {
-      log_write_mutex_.Unlock();
+      wal_write_mutex_.Unlock();
     }
     if (status.ok() && recoverable_state_pre_release_callback_) {
       const bool DISABLE_MEMTABLE = true;
@@ -1893,7 +1988,10 @@ void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
   assert(immutable_db_options_.atomic_flush);
   auto seq = versions_->LastSequence();
   for (auto cfd : cfds) {
-    cfd->imm()->AssignAtomicFlushSeq(seq);
+    // cfd can be nullptr, see ScheduleFlushes()
+    if (cfd) {
+      cfd->imm()->AssignAtomicFlushSeq(seq);
+    }
   }
 }
 
@@ -1902,11 +2000,11 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
   assert(write_context != nullptr);
   Status status;
 
-  if (alive_log_files_.begin()->getting_flushed) {
+  if (alive_wal_files_.begin()->getting_flushed) {
     return status;
   }
 
-  auto oldest_alive_log = alive_log_files_.begin()->number;
+  auto oldest_alive_log = alive_wal_files_.begin()->number;
   bool flush_wont_release_oldest_log = false;
   if (allow_2pc()) {
     auto oldest_log_with_uncommitted_prep =
@@ -1936,14 +2034,14 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
     // transactions then we cannot flush this log until those transactions are
     // commited.
     unable_to_release_oldest_log_ = false;
-    alive_log_files_.begin()->getting_flushed = true;
+    alive_wal_files_.begin()->getting_flushed = true;
   }
 
   ROCKS_LOG_INFO(
       immutable_db_options_.info_log,
       "Flushing all column families with data in WAL number %" PRIu64
       ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
-      oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
+      oldest_alive_log, wals_total_size_.LoadRelaxed(), GetMaxTotalWalSize());
   // no need to refcount because drop is happening in write thread, so can't
   // happen while we're in the write thread
   autovector<ColumnFamilyData*> cfds;
@@ -2413,21 +2511,21 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
   // Do this without holding the dbmutex lock.
   assert(versions_->prev_log_number() == 0);
   if (two_write_queues_) {
-    log_write_mutex_.Lock();
+    wal_write_mutex_.Lock();
   }
-  bool creating_new_log = !log_empty_;
+  bool creating_new_log = !wal_empty_;
   if (two_write_queues_) {
-    log_write_mutex_.Unlock();
+    wal_write_mutex_.Unlock();
   }
   uint64_t recycle_log_number = 0;
   // If file deletion is disabled, don't recycle logs since it'll result in
   // the file getting renamed
   if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
-      !log_recycle_files_.empty() && IsFileDeletionsEnabled()) {
-    recycle_log_number = log_recycle_files_.front();
+      !wal_recycle_files_.empty() && IsFileDeletionsEnabled()) {
+    recycle_log_number = wal_recycle_files_.front();
   }
   uint64_t new_log_number =
-      creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+      creating_new_log ? versions_->NewFileNumber() : cur_wal_number_;
   // For use outside of holding DB mutex
   const MutableCFOptions mutable_cf_options_copy =
       cfd->GetLatestMutableCFOptions();
@@ -2453,14 +2551,14 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
   mutex_.Unlock();
   if (creating_new_log) {
     PredecessorWALInfo info;
-    log_write_mutex_.Lock();
+    wal_write_mutex_.Lock();
     if (!logs_.empty()) {
       log::Writer* cur_log_writer = logs_.back().writer;
       info = PredecessorWALInfo(cur_log_writer->get_log_number(),
                                 cur_log_writer->file()->GetFileSize(),
                                 cur_log_writer->GetLastSeqnoRecorded());
     }
-    log_write_mutex_.Unlock();
+    wal_write_mutex_.Unlock();
     // TODO: Write buffer size passed in should be max of all CF's instead
     // of mutable_cf_options.write_buffer_size.
     io_s = CreateWAL(write_options, new_log_number, recycle_log_number,
@@ -2501,11 +2599,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
     // concurrent full purges don't delete the file while we're recycling it.
     // To achieve that we hold the old log number in the recyclable list until
     // after it has been renamed.
-    assert(log_recycle_files_.front() == recycle_log_number);
-    log_recycle_files_.pop_front();
+    assert(wal_recycle_files_.front() == recycle_log_number);
+    wal_recycle_files_.pop_front();
   }
   if (s.ok() && creating_new_log) {
-    InstrumentedMutexLock l(&log_write_mutex_);
+    InstrumentedMutexLock l(&wal_write_mutex_);
     assert(new_log != nullptr);
     if (!logs_.empty()) {
       // Alway flush the buffer of the last log before switching to a new one
@@ -2527,11 +2625,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
       }
     }
     if (s.ok()) {
-      logfile_number_ = new_log_number;
-      log_empty_ = true;
-      log_dir_synced_ = false;
-      logs_.emplace_back(logfile_number_, new_log);
-      alive_log_files_.emplace_back(logfile_number_);
+      cur_wal_number_ = new_log_number;
+      wal_empty_ = true;
+      wal_dir_synced_ = false;
+      logs_.emplace_back(cur_wal_number_, new_log);
+      alive_wal_files_.emplace_back(cur_wal_number_);
     }
   }
 
@@ -2562,7 +2660,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
     // obsolete. So we should track the WAL obsoletion event before actually
     // updating the empty CF's log number.
     uint64_t min_wal_number_to_keep =
-        versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_);
+        versions_->PreComputeMinLogNumberWithUnflushedData(cur_wal_number_);
     if (min_wal_number_to_keep >
         versions_->GetWalSet().GetMinWalNumberToKeep()) {
       // TODO: plumb Env::IOActivity, Env::IOPriority
@@ -2597,7 +2695,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
 
       for (auto cf : empty_cfs) {
         if (cf->IsEmpty()) {
-          cf->SetLogNumber(logfile_number_);
+          cf->SetLogNumber(cur_wal_number_);
           // MEMPURGE: No need to change this, because new adds
           // should still receive new sequence numbers.
           cf->mem()->SetCreationSeq(versions_->LastSequence());
@@ -2614,14 +2712,14 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
       // advance the log number. no need to persist this in the manifest
       if (cf->IsEmpty()) {
         if (creating_new_log) {
-          cf->SetLogNumber(logfile_number_);
+          cf->SetLogNumber(cur_wal_number_);
         }
         cf->mem()->SetCreationSeq(versions_->LastSequence());
       }
     }
   }
 
-  cfd->mem()->SetNextLogNumber(logfile_number_);
+  cfd->mem()->SetNextLogNumber(cur_wal_number_);
   assert(new_mem != nullptr);
   cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
   if (new_imm) {
@@ -2633,7 +2731,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
     // we always try to flush all immutable memtable. For atomic flush, these
     // two memtables will be marked eligible for flush in the same call to
     // AssignAtomicFlushSeq().
-    new_imm->SetNextLogNumber(logfile_number_);
+    new_imm->SetNextLogNumber(cur_wal_number_);
     cfd->imm()->Add(new_imm, &context->memtables_to_free_);
   }
   new_mem->Ref();
diff --git a/db/db_io_failure_test.cc b/db/db_io_failure_test.cc
index ecef6e860aba..4021ea73d30a 100644
--- a/db/db_io_failure_test.cc
+++ b/db/db_io_failure_test.cc
@@ -7,6 +7,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include <iomanip>
+
 #include "db/db_test_util.h"
 #include "port/stack_trace.h"
 #include "test_util/testutil.h"
diff --git a/db/db_iter.cc b/db/db_iter.cc
index c5a099103653..bd8f179655a6 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -9,7 +9,6 @@
 
 #include "db/db_iter.h"
 
-#include <iostream>
 #include <limits>
 #include <string>
 
@@ -24,6 +23,7 @@
 #include "memory/arena.h"
 #include "monitoring/perf_context_imp.h"
 #include "rocksdb/env.h"
+#include "rocksdb/io_dispatcher.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/options.h"
@@ -42,9 +42,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
                const MutableCFOptions& mutable_cf_options,
                const Comparator* cmp, InternalIterator* iter,
                const Version* version, SequenceNumber s, bool arena_mode,
-               uint64_t max_sequential_skip_in_iterations,
                ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh,
-               bool expose_blob_index)
+               bool expose_blob_index, ReadOnlyMemTable* active_mem)
     : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
       env_(_env),
       clock_(ioptions.clock),
@@ -58,11 +57,21 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
       read_callback_(read_callback),
       sequence_(s),
       statistics_(ioptions.stats),
-      max_skip_(max_sequential_skip_in_iterations),
+      max_skip_(mutable_cf_options.max_sequential_skip_in_iterations),
       max_skippable_internal_keys_(read_options.max_skippable_internal_keys),
       num_internal_keys_skipped_(0),
       iterate_lower_bound_(read_options.iterate_lower_bound),
       iterate_upper_bound_(read_options.iterate_upper_bound),
+      cfh_(cfh),
+      timestamp_ub_(read_options.timestamp),
+      timestamp_lb_(read_options.iter_start_ts),
+      timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0),
+      active_mem_(active_mem),
+      memtable_seqno_lb_(kMaxSequenceNumber),
+      memtable_op_scan_flush_trigger_(0),
+      avg_op_scan_flush_trigger_(0),
+      iter_step_since_seek_(1),
+      mem_hidden_op_scanned_since_seek_(0),
       direction_(kForward),
       valid_(false),
       current_entry_is_merged_(false),
@@ -76,11 +85,7 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
       expose_blob_index_(expose_blob_index),
       allow_unprepared_value_(read_options.allow_unprepared_value),
       is_blob_(false),
-      arena_mode_(arena_mode),
-      cfh_(cfh),
-      timestamp_ub_(read_options.timestamp),
-      timestamp_lb_(read_options.iter_start_ts),
-      timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) {
+      arena_mode_(arena_mode) {
   RecordTick(statistics_, NO_ITERATOR_CREATED);
   if (pin_thru_lifetime_) {
     pinned_iters_mgr_.StartPinning();
@@ -94,6 +99,25 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
   // prefix_seek_opt_in_only should force total_order_seek whereever the caller
   // is duplicating the original ReadOptions
   assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek);
+  if (active_mem_) {
+    // FIXME: GetEarliestSequenceNumber() may return a seqno that is one smaller
+    // than the smallest seqno in the memtable. This violates its comment and
+    // entries with that seqno may not be in the active memtable. Before it's
+    // fixed, we use GetFirstSequenceNumber() for more accurate result.
+    memtable_seqno_lb_ = active_mem_->IsEmpty()
+                             ? active_mem_->GetEarliestSequenceNumber()
+                             : active_mem_->GetFirstSequenceNumber();
+    memtable_op_scan_flush_trigger_ =
+        mutable_cf_options.memtable_op_scan_flush_trigger;
+    if (memtable_op_scan_flush_trigger_) {
+      // avg_op_scan_flush_trigger_ requires memtable_op_scan_flush_trigger_ > 0
+      avg_op_scan_flush_trigger_ =
+          mutable_cf_options.memtable_avg_op_scan_flush_trigger;
+    }
+  } else {
+    // memtable_op_scan_flush_trigger_ and avg_op_scan_flush_trigger_ are
+    // initialized to 0(disabled) as default.
+  }
 }
 
 Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
@@ -155,6 +179,7 @@ void DBIter::Next() {
   local_stats_.skip_count_ += num_internal_keys_skipped_;
   local_stats_.skip_count_--;
   num_internal_keys_skipped_ = 0;
+  iter_step_since_seek_++;
   bool ok = true;
   if (direction_ == kReverse) {
     is_key_seqnum_zero_ = false;
@@ -369,6 +394,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
   // to one.
   bool reseek_done = false;
 
+  uint64_t mem_hidden_op_scanned = 0;
   do {
     // Will update is_key_seqnum_zero_ as soon as we parsed the current key
     // but we need to save the previous value to be used in the loop.
@@ -425,6 +451,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
           CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) {
         num_skipped++;  // skip this entry
         PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+        MarkMemtableForFlushForPerOpTrigger(mem_hidden_op_scanned);
       } else {
         assert(!skipping_saved_key ||
                CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0);
@@ -446,6 +473,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
                                       !iter_.iter()->IsKeyPinned() /* copy */);
               skipping_saved_key = true;
               PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+              MarkMemtableForFlushForPerOpTrigger(mem_hidden_op_scanned);
             }
             break;
           case kTypeValue:
@@ -484,7 +512,6 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
 
             valid_ = true;
             return true;
-            break;
           case kTypeMerge:
             if (!PrepareValueInternal()) {
               return false;
@@ -496,7 +523,6 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
             current_entry_is_merged_ = true;
             valid_ = true;
             return MergeValuesNewToOld();  // Go to a different state machine
-            break;
           default:
             valid_ = false;
             status_ = Status::Corruption(
@@ -1097,7 +1123,6 @@ bool DBIter::FindValueForCurrentKey() {
         }
         return true;
       }
-      break;
     case kTypeValue:
     case kTypeValuePreferredSeqno:
       SetValueAndColumnsFromPlain(pinned_value_);
@@ -1224,6 +1249,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
 
     if (timestamp_lb_ != nullptr) {
       saved_key_.SetInternalKey(ikey);
+    } else {
+      saved_key_.SetUserKey(ikey.user_key);
     }
 
     valid_ = true;
@@ -1539,11 +1566,123 @@ void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
   }
 }
 
+Status DBIter::ValidateScanOptions(const MultiScanArgs& multiscan_opts) const {
+  if (multiscan_opts.empty()) {
+    return Status::InvalidArgument("Empty MultiScanArgs");
+  }
+
+  const std::vector<ScanOptions>& scan_opts = multiscan_opts.GetScanRanges();
+  const bool has_limit = scan_opts.front().range.limit.has_value();
+  if (!has_limit && scan_opts.size() > 1) {
+    return Status::InvalidArgument("Scan has no upper bound");
+  }
+
+  for (size_t i = 0; i < scan_opts.size(); ++i) {
+    const auto& scan_range = scan_opts[i].range;
+    if (!scan_range.start.has_value()) {
+      return Status::InvalidArgument("Scan has no start key at index " +
+                                     std::to_string(i));
+    }
+
+    if (scan_range.limit.has_value()) {
+      if (user_comparator_.CompareWithoutTimestamp(
+              scan_range.start.value(), /*a_has_ts=*/false,
+              scan_range.limit.value(), /*b_has_ts=*/false) >= 0) {
+        return Status::InvalidArgument(
+            "Scan start key is large or equal than limit at index " +
+            std::to_string(i));
+      }
+    }
+
+    if (i > 0) {
+      if (!scan_range.limit.has_value()) {
+        // multiple scan without limit scan ranges
+        return Status::InvalidArgument("Scan has no upper bound at index " +
+                                       std::to_string(i));
+      }
+
+      const auto& last_end_key = scan_opts[i - 1].range.limit.value();
+      if (user_comparator_.CompareWithoutTimestamp(
+              scan_range.start.value(), /*a_has_ts=*/false, last_end_key,
+              /*b_has_ts=*/false) < 0) {
+        return Status::InvalidArgument("Overlapping ranges at index " +
+                                       std::to_string(i));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void DBIter::Prepare(const MultiScanArgs& scan_opts) {
+  status_ = ValidateScanOptions(scan_opts);
+  if (!status_.ok()) {
+    return;
+  }
+  std::optional<MultiScanArgs> new_scan_opts;
+  new_scan_opts.emplace(scan_opts);
+  scan_opts_.swap(new_scan_opts);
+  scan_index_ = 0;
+
+  // Create a shared IODispatcher if not provided. This allows all
+  // BlockBasedTableIterators in this scan to share a single dispatcher,
+  // enabling better IO coordination and future rate limiting.
+  if (!scan_opts_.value().io_dispatcher) {
+    scan_opts_->io_dispatcher.reset(NewIODispatcher());
+  }
+
+  if (!scan_opts.empty()) {
+    iter_.Prepare(&scan_opts_.value());
+  } else {
+    iter_.Prepare(nullptr);
+  }
+}
+
 void DBIter::Seek(const Slice& target) {
   PERF_COUNTER_ADD(iter_seek_count, 1);
   PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
   StopWatch sw(clock_, statistics_, DB_SEEK);
 
+  if (scan_opts_.has_value()) {
+    // Validate the seek target is as expected in the previously prepared range
+    auto const& scan_ranges = scan_opts_.value().GetScanRanges();
+    if (scan_index_ >= scan_ranges.size()) {
+      status_ = Status::InvalidArgument(
+          "Seek called after exhausting all of the scan ranges");
+      valid_ = false;
+      return;
+    }
+
+    // Validate start key of next prepare range matches the seek target
+    auto const& range = scan_ranges[scan_index_];
+    auto const& start = range.range.start;
+    assert(start.has_value());
+    if (user_comparator_.CompareWithoutTimestamp(target, *start) != 0) {
+      status_ = Status::InvalidArgument(
+          "Seek target does not match the start of the next prepared range at "
+          "index " +
+          std::to_string(scan_index_));
+      valid_ = false;
+      return;
+    }
+
+    // validate the upper bound is set to the same value of limit, if limit
+    // exists
+    auto const& limit = range.range.limit;
+    if (limit.has_value()) {
+      if (iterate_upper_bound_ == nullptr ||
+          user_comparator_.CompareWithoutTimestamp(
+              limit.value(), *iterate_upper_bound_) != 0) {
+        status_ = Status::InvalidArgument(
+            "Upper bound is not set to the same limit value of the next "
+            "prepared range at index " +
+            std::to_string(scan_index_));
+        valid_ = false;
+        return;
+      }
+    }
+    scan_index_++;
+  }
+
   if (cfh_ != nullptr) {
     // TODO: What do we do if this returns an error?
     Slice lower_bound, upper_bound;
@@ -1568,6 +1707,7 @@ void DBIter::Seek(const Slice& target) {
   ResetBlobData();
   ResetValueAndColumns();
   ResetInternalKeysSkippedCounter();
+  MarkMemtableForFlushForAvgTrigger();
 
   // Seek the inner iterator based on the target key.
   {
@@ -1644,6 +1784,7 @@ void DBIter::SeekForPrev(const Slice& target) {
   ResetBlobData();
   ResetValueAndColumns();
   ResetInternalKeysSkippedCounter();
+  MarkMemtableForFlushForAvgTrigger();
 
   // Seek the inner iterator based on the target key.
   {
@@ -1705,6 +1846,7 @@ void DBIter::SeekToFirst() {
   ResetBlobData();
   ResetValueAndColumns();
   ResetInternalKeysSkippedCounter();
+  MarkMemtableForFlushForAvgTrigger();
   ClearSavedValue();
   is_key_seqnum_zero_ = false;
 
@@ -1768,6 +1910,7 @@ void DBIter::SeekToLast() {
   ResetBlobData();
   ResetValueAndColumns();
   ResetInternalKeysSkippedCounter();
+  MarkMemtableForFlushForAvgTrigger();
   ClearSavedValue();
   is_key_seqnum_zero_ = false;
 
@@ -1790,21 +1933,4 @@ void DBIter::SeekToLast() {
         StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
   }
 }
-
-Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
-                        const ImmutableOptions& ioptions,
-                        const MutableCFOptions& mutable_cf_options,
-                        const Comparator* user_key_comparator,
-                        InternalIterator* internal_iter, const Version* version,
-                        const SequenceNumber& sequence,
-                        uint64_t max_sequential_skip_in_iterations,
-                        ReadCallback* read_callback,
-                        ColumnFamilyHandleImpl* cfh, bool expose_blob_index) {
-  DBIter* db_iter = new DBIter(
-      env, read_options, ioptions, mutable_cf_options, user_key_comparator,
-      internal_iter, version, sequence, false,
-      max_sequential_skip_in_iterations, read_callback, cfh, expose_blob_index);
-  return db_iter;
-}
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_iter.h b/db/db_iter.h
index 084ed80d41a0..575dc455eedc 100644
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -12,7 +12,6 @@
 #include <string>
 
 #include "db/db_impl/db_impl.h"
-#include "db/range_del_aggregator.h"
 #include "memory/arena.h"
 #include "options/cf_options.h"
 #include "rocksdb/db.h"
@@ -57,6 +56,34 @@ class Version;
 // numbers, deletion markers, overwrites, etc.
 class DBIter final : public Iterator {
  public:
+  // Return a new DBIter that reads from `internal_iter` at the specified
+  // `sequence` number.
+  //
+  // @param active_mem Pointer to the active memtable that `internal_iter`
+  // is reading from. If not null, the memtable can be marked for flush
+  // according to options mutable_cf_options.memtable_op_scan_flush_trigger
+  // and mutable_cf_options.memtable_avg_op_scan_flush_trigger.
+  // @param arena_mode If true, the DBIter will be allocated from the arena.
+  static DBIter* NewIter(Env* env, const ReadOptions& read_options,
+                         const ImmutableOptions& ioptions,
+                         const MutableCFOptions& mutable_cf_options,
+                         const Comparator* user_key_comparator,
+                         InternalIterator* internal_iter,
+                         const Version* version, const SequenceNumber& sequence,
+                         ReadCallback* read_callback,
+                         ReadOnlyMemTable* active_mem,
+                         ColumnFamilyHandleImpl* cfh = nullptr,
+                         bool expose_blob_index = false,
+                         Arena* arena = nullptr) {
+    void* mem = arena ? arena->AllocateAligned(sizeof(DBIter))
+                      : operator new(sizeof(DBIter));
+    DBIter* db_iter = new (mem)
+        DBIter(env, read_options, ioptions, mutable_cf_options,
+               user_key_comparator, internal_iter, version, sequence, arena,
+               read_callback, cfh, expose_blob_index, active_mem);
+    return db_iter;
+  }
+
   // The following is grossly complicated. TODO: clean it up
   // Which direction is the iterator currently moving?
   // (1) When moving forward:
@@ -113,19 +140,12 @@ class DBIter final : public Iterator {
     uint64_t skip_count_;
   };
 
-  DBIter(Env* _env, const ReadOptions& read_options,
-         const ImmutableOptions& ioptions,
-         const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
-         InternalIterator* iter, const Version* version, SequenceNumber s,
-         bool arena_mode, uint64_t max_sequential_skip_in_iterations,
-         ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh,
-         bool expose_blob_index);
-
   // No copying allowed
   DBIter(const DBIter&) = delete;
   void operator=(const DBIter&) = delete;
 
   ~DBIter() override {
+    MarkMemtableForFlushForAvgTrigger();
     ThreadStatus::OperationType cur_op_type =
         ThreadStatusUtil::GetThreadOperation();
     ThreadStatusUtil::SetThreadOperation(
@@ -220,7 +240,18 @@ class DBIter final : public Iterator {
 
   bool PrepareValue() override;
 
+  void Prepare(const MultiScanArgs& scan_opts) override;
+  Status ValidateScanOptions(const MultiScanArgs& multiscan_opts) const;
+
  private:
+  DBIter(Env* _env, const ReadOptions& read_options,
+         const ImmutableOptions& ioptions,
+         const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
+         InternalIterator* iter, const Version* version, SequenceNumber s,
+         bool arena_mode, ReadCallback* read_callback,
+         ColumnFamilyHandleImpl* cfh, bool expose_blob_index,
+         ReadOnlyMemTable* active_mem);
+
   class BlobReader {
    public:
     BlobReader(const Version* version, ReadTier read_tier,
@@ -379,6 +410,36 @@ class DBIter final : public Iterator {
     return true;
   }
 
+  void MarkMemtableForFlushForAvgTrigger() {
+    if (avg_op_scan_flush_trigger_ &&
+        mem_hidden_op_scanned_since_seek_ >= memtable_op_scan_flush_trigger_ &&
+        mem_hidden_op_scanned_since_seek_ >=
+            static_cast<uint64_t>(iter_step_since_seek_) *
+                avg_op_scan_flush_trigger_) {
+      assert(memtable_op_scan_flush_trigger_ > 0);
+      active_mem_->MarkForFlush();
+      avg_op_scan_flush_trigger_ = 0;
+      memtable_op_scan_flush_trigger_ = 0;
+    }
+    iter_step_since_seek_ = 1;
+    mem_hidden_op_scanned_since_seek_ = 0;
+  }
+
+  void MarkMemtableForFlushForPerOpTrigger(uint64_t& mem_hidden_op_scanned) {
+    if (memtable_op_scan_flush_trigger_ &&
+        ikey_.sequence >= memtable_seqno_lb_) {
+      if (++mem_hidden_op_scanned >= memtable_op_scan_flush_trigger_) {
+        active_mem_->MarkForFlush();
+        // Turn off the flush trigger checks.
+        memtable_op_scan_flush_trigger_ = 0;
+        avg_op_scan_flush_trigger_ = 0;
+      }
+      if (avg_op_scan_flush_trigger_) {
+        ++mem_hidden_op_scanned_since_seek_;
+      }
+    }
+  }
+
   const SliceTransform* prefix_extractor_;
   Env* const env_;
   SystemClock* clock_;
@@ -425,6 +486,25 @@ class DBIter final : public Iterator {
   IterKey prefix_;
 
   Status status_;
+  Slice lazy_blob_index_;
+
+  // List of operands for merge operator.
+  MergeContext merge_context_;
+  LocalStatistics local_stats_;
+  PinnedIteratorsManager pinned_iters_mgr_;
+  ColumnFamilyHandleImpl* cfh_;
+  const Slice* const timestamp_ub_;
+  const Slice* const timestamp_lb_;
+  const size_t timestamp_size_;
+  std::string saved_timestamp_;
+  std::optional<MultiScanArgs> scan_opts_;
+  size_t scan_index_{0};
+  ReadOnlyMemTable* const active_mem_;
+  SequenceNumber memtable_seqno_lb_;
+  uint32_t memtable_op_scan_flush_trigger_;
+  uint32_t avg_op_scan_flush_trigger_;
+  uint32_t iter_step_since_seek_;
+  uint32_t mem_hidden_op_scanned_since_seek_;
   Direction direction_;
   bool valid_;
   bool current_entry_is_merged_;
@@ -443,29 +523,7 @@ class DBIter final : public Iterator {
   // the stacked BlobDB implementation is used, false otherwise.
   bool expose_blob_index_;
   bool allow_unprepared_value_;
-  Slice lazy_blob_index_;
   bool is_blob_;
   bool arena_mode_;
-  // List of operands for merge operator.
-  MergeContext merge_context_;
-  LocalStatistics local_stats_;
-  PinnedIteratorsManager pinned_iters_mgr_;
-  ColumnFamilyHandleImpl* cfh_;
-  const Slice* const timestamp_ub_;
-  const Slice* const timestamp_lb_;
-  const size_t timestamp_size_;
-  std::string saved_timestamp_;
 };
-
-// Return a new iterator that converts internal keys (yielded by
-// "*internal_iter") that were live at the specified `sequence` number
-// into appropriate user keys.
-Iterator* NewDBIterator(
-    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
-    const MutableCFOptions& mutable_cf_options,
-    const Comparator* user_key_comparator, InternalIterator* internal_iter,
-    const Version* version, const SequenceNumber& sequence,
-    uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback,
-    ColumnFamilyHandleImpl* cfh = nullptr, bool expose_blob_index = false);
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc
index daecbcc7acb6..c6d3936b3ccf 100644
--- a/db/db_iter_stress_test.cc
+++ b/db/db_iter_stress_test.cc
@@ -528,12 +528,11 @@ TEST_F(DBIteratorStressTest, StressTest) {
                   internal_iter->target_hidden_fraction =
                       target_hidden_fraction;
                   internal_iter->trace = trace;
-                  db_iter.reset(NewDBIterator(
+                  db_iter.reset(DBIter::NewIter(
                       env_, ropt, ImmutableOptions(options),
                       MutableCFOptions(options), BytewiseComparator(),
-                      internal_iter, nullptr /* version */, sequence,
-                      options.max_sequential_skip_in_iterations,
-                      nullptr /*read_callback*/));
+                      internal_iter, /*version=*/nullptr, sequence,
+                      nullptr /*read_callback*/, /*active_mem=*/nullptr));
                 }
 
                 // Do a random operation. It's important to do it on ref_it
diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc
index cf8321808f9f..d18aa0bac4a1 100644
--- a/db/db_iter_test.cc
+++ b/db/db_iter_test.cc
@@ -259,11 +259,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     internal_iter->Finish();
 
     ReadOptions ro;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -294,11 +293,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     internal_iter->Finish();
 
     ReadOptions ro;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -322,11 +320,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -356,11 +353,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -393,11 +389,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(!db_iter->Valid());
@@ -425,11 +420,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 7 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     SetPerfLevel(kEnableCount);
     ASSERT_TRUE(GetPerfLevel() == kEnableCount);
@@ -465,11 +459,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -492,11 +485,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(!db_iter->Valid());
@@ -517,11 +509,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -554,11 +545,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 7 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     SetPerfLevel(kEnableCount);
     ASSERT_TRUE(GetPerfLevel() == kEnableCount);
@@ -586,11 +576,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     internal_iter->Finish();
 
     ReadOptions ro;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -631,11 +620,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     internal_iter->Finish();
 
     ReadOptions ro;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "b");
@@ -664,11 +652,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     internal_iter->Finish();
 
     ReadOptions ro;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "c");
@@ -696,11 +683,10 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) {
     TestIterator* internal_iter = new TestIterator(BytewiseComparator());
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(!db_iter->Valid());
     ASSERT_OK(db_iter->status());
@@ -710,11 +696,10 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) {
     TestIterator* internal_iter = new TestIterator(BytewiseComparator());
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToFirst();
     ASSERT_TRUE(!db_iter->Valid());
     ASSERT_OK(db_iter->status());
@@ -735,11 +720,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) {
   }
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      2 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      2 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToLast();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "c");
@@ -782,11 +766,11 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       internal_iter->Finish();
 
       options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
-          options.max_sequential_skip_in_iterations,
-          nullptr /* read_callback */));
+
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
 
@@ -820,11 +804,11 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       internal_iter->AddPut("c", "200");
       internal_iter->Finish();
 
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
-          options.max_sequential_skip_in_iterations,
-          nullptr /* read_callback */));
+
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
 
@@ -851,11 +835,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       internal_iter->AddPut("c", "200");
       internal_iter->Finish();
 
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, 202 /* sequence */,
-          options.max_sequential_skip_in_iterations,
-          nullptr /* read_callback */));
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
 
@@ -886,11 +869,11 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       }
       internal_iter->AddPut("c", "200");
       internal_iter->Finish();
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i /* sequence */,
-          options.max_sequential_skip_in_iterations,
-          nullptr /* read_callback */));
+
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(!db_iter->Valid());
       ASSERT_OK(db_iter->status());
@@ -906,11 +889,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
     }
     internal_iter->AddPut("c", "200");
     internal_iter->Finish();
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 200 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "c");
@@ -944,11 +926,11 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       }
       internal_iter->Finish();
 
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
-          options.max_sequential_skip_in_iterations,
-          nullptr /* read_callback */));
+
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
 
@@ -981,11 +963,11 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       }
       internal_iter->Finish();
 
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
-          options.max_sequential_skip_in_iterations,
-          nullptr /* read_callback */));
+
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
 
@@ -1033,11 +1015,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 0;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1081,11 +1062,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1127,11 +1107,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1167,11 +1146,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1204,11 +1182,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1236,11 +1213,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1275,11 +1251,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1314,11 +1289,11 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
       internal_iter->Finish();
 
       ro.max_skippable_internal_keys = i;
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
-          options.max_sequential_skip_in_iterations,
-          nullptr /* read_callback */));
+
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
 
       db_iter->SeekToFirst();
       ASSERT_TRUE(db_iter->Valid());
@@ -1369,11 +1344,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
 
       options.max_sequential_skip_in_iterations = 1000;
       ro.max_skippable_internal_keys = i;
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
+          env_, ro, ioptions, MutableCFOptions(options), BytewiseComparator(),
           internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
-          options.max_sequential_skip_in_iterations,
-          nullptr /* read_callback */));
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
 
       db_iter->SeekToFirst();
       ASSERT_TRUE(db_iter->Valid());
@@ -1412,11 +1386,11 @@ TEST_F(DBIteratorTest, DBIteratorTimedPutBasic) {
   internal_iter->AddTimedPut("d", "3", /*write_unix_time=*/0);
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  options.max_sequential_skip_in_iterations = 1;
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      7 /* sequence */, /*max_sequential_skip_in_iterations*/ 1,
-      nullptr /* read_callback */));
+      7 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1463,11 +1437,10 @@ TEST_F(DBIteratorTest, DBIterator1) {
   internal_iter->AddMerge("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      1 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      1 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1493,11 +1466,10 @@ TEST_F(DBIteratorTest, DBIterator2) {
   internal_iter->AddMerge("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      0 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      0 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1519,11 +1491,10 @@ TEST_F(DBIteratorTest, DBIterator3) {
   internal_iter->AddMerge("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      2 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      2 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1545,11 +1516,10 @@ TEST_F(DBIteratorTest, DBIterator4) {
   internal_iter->AddMerge("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      4 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      4 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1580,11 +1550,10 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1605,11 +1574,10 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 1 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1630,11 +1598,10 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1655,11 +1622,10 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 3 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1680,11 +1646,10 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1705,11 +1670,10 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 5 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1730,11 +1694,10 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 6 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1753,11 +1716,10 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_2");
     internal_iter->AddPut("b", "val_b");
     internal_iter->Finish();
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->Seek("b");
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "b");
@@ -1785,11 +1747,10 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1810,11 +1771,10 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 1 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1835,11 +1795,10 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1860,11 +1819,10 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 3 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(!db_iter->Valid());
     ASSERT_OK(db_iter->status());
@@ -1881,11 +1839,10 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1906,11 +1863,10 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 5 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1931,11 +1887,10 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 6 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1976,11 +1931,10 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -2013,11 +1967,10 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2056,11 +2009,10 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2099,11 +2051,10 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 5 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2147,11 +2098,10 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 6 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2196,11 +2146,10 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 7 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2239,11 +2188,10 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 9 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2288,11 +2236,10 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 13 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2338,11 +2285,10 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 14 /* sequence */,
-        options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2371,11 +2317,10 @@ TEST_F(DBIteratorTest, DBIterator8) {
   internal_iter->AddPut("b", "0");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToLast();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "b");
@@ -2403,11 +2348,11 @@ TEST_F(DBIteratorTest, DBIterator9) {
     internal_iter->AddMerge("d", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
-        env_, ro, ImmutableOptions(options), MutableCFOptions(options),
-        BytewiseComparator(), internal_iter, nullptr /* version */,
-        10 /* sequence */, options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+    std::unique_ptr<Iterator> db_iter(
+        DBIter::NewIter(env_, ro, ImmutableOptions(options),
+                        MutableCFOptions(options), BytewiseComparator(),
+                        internal_iter, nullptr /* version */, 10 /* sequence */,
+                        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2471,11 +2416,10 @@ TEST_F(DBIteratorTest, DBIterator10) {
   internal_iter->AddPut("d", "4");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
 
   db_iter->Seek("c");
   ASSERT_TRUE(db_iter->Valid());
@@ -2512,10 +2456,10 @@ TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) {
   internal_iter->AddPut("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -2542,11 +2486,10 @@ TEST_F(DBIteratorTest, DBIterator11) {
   internal_iter->AddMerge("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      1 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      1 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -2571,10 +2514,10 @@ TEST_F(DBIteratorTest, DBIterator12) {
   internal_iter->AddSingleDeletion("b");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToLast();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "c");
@@ -2610,11 +2553,11 @@ TEST_F(DBIteratorTest, DBIterator13) {
   internal_iter->AddPut(key, "8");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  options.max_sequential_skip_in_iterations = 3;
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      2 /* sequence */, 3 /* max_sequential_skip_in_iterations */,
-      nullptr /* read_callback */));
+      2 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->Seek("b");
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), key);
@@ -2640,11 +2583,11 @@ TEST_F(DBIteratorTest, DBIterator14) {
   internal_iter->AddPut("c", "9");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  options.max_sequential_skip_in_iterations = 1;
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      4 /* sequence */, 1 /* max_sequential_skip_in_iterations */,
-      nullptr /* read_callback */));
+      4 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->Seek("b");
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "b");
@@ -2680,11 +2623,12 @@ class DBIterWithMergeIterTest : public testing::Test {
     InternalIterator* merge_iter =
         NewMergingIterator(&icomp_, child_iters.data(), 2u);
 
-    db_iter_.reset(NewDBIterator(
+    options_.max_sequential_skip_in_iterations = 3;
+    db_iter_.reset(DBIter::NewIter(
         env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_),
         BytewiseComparator(), merge_iter, nullptr /* version */,
-        8 /* read data earlier than seqId 8 */,
-        3 /* max iterators before reseek */, nullptr /* read_callback */));
+        8 /* read data earlier than seqId 8 */, nullptr /* read_callback */,
+        /*active_mem=*/nullptr));
   }
 
   Env* env_;
@@ -3120,11 +3064,10 @@ TEST_F(DBIteratorTest, SeekPrefixTombstones) {
   internal_iter->Finish();
 
   ro.prefix_same_as_start = true;
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
 
   int skipped_keys = 0;
 
@@ -3157,11 +3100,11 @@ TEST_F(DBIteratorTest, SeekToFirstLowerBound) {
     Slice lower_bound(lower_bound_str);
     ro.iterate_lower_bound = &lower_bound;
     Options options;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
-        env_, ro, ImmutableOptions(options), MutableCFOptions(options),
-        BytewiseComparator(), internal_iter, nullptr /* version */,
-        10 /* sequence */, options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+    std::unique_ptr<Iterator> db_iter(
+        DBIter::NewIter(env_, ro, ImmutableOptions(options),
+                        MutableCFOptions(options), BytewiseComparator(),
+                        internal_iter, nullptr /* version */, 10 /* sequence */,
+                        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     if (i == kNumKeys + 1) {
@@ -3197,11 +3140,10 @@ TEST_F(DBIteratorTest, PrevLowerBound) {
   Slice lower_bound(lower_bound_str);
   ro.iterate_lower_bound = &lower_bound;
   Options options;
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
 
   db_iter->SeekToLast();
   for (int i = kNumKeys; i >= kLowerBound; --i) {
@@ -3226,11 +3168,10 @@ TEST_F(DBIteratorTest, SeekLessLowerBound) {
   Slice lower_bound(lower_bound_str);
   ro.iterate_lower_bound = &lower_bound;
   Options options;
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
 
   auto before_lower_bound_str = std::to_string(kLowerBound - 1);
   Slice before_lower_bound(lower_bound_str);
@@ -3252,11 +3193,10 @@ TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) {
   }
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
 
   db_iter->SeekForPrev("a");
   ASSERT_TRUE(db_iter->Valid());
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index ad3afd17f4f2..d2371abfa890 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -8,12 +8,15 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include <functional>
+#include <iomanip>
+#include <iostream>
 
 #include "db/arena_wrapped_db_iter.h"
 #include "db/db_iter.h"
 #include "db/db_test_util.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
+#include "rocksdb/io_dispatcher.h"
 #include "rocksdb/iostats_context.h"
 #include "rocksdb/perf_context.h"
 #include "table/block_based/flush_block_policy_impl.h"
@@ -1839,11 +1842,6 @@ class SliceTransformLimitedDomainGeneric : public SliceTransform {
     // prefix will be x????
     return src.size() >= 1;
   }
-
-  bool InRange(const Slice& dst) const override {
-    // prefix will be x????
-    return dst.size() == 1;
-  }
 };
 
 TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) {
@@ -2571,7 +2569,7 @@ TEST_P(DBIteratorTest, AutoRefreshIterator) {
         ReadOptions read_options;
         std::unique_ptr<ManagedSnapshot> snapshot = nullptr;
         if (explicit_snapshot) {
-          snapshot = std::make_unique<ManagedSnapshot>(db_);
+          snapshot = std::make_unique<ManagedSnapshot>(db_.get());
         }
         read_options.snapshot =
             explicit_snapshot ? snapshot->snapshot() : nullptr;
@@ -3824,6 +3822,1576 @@ TEST_F(DBIteratorTest, IteratorsConsistentViewExplicitSnapshot) {
   }
 }
 
+TEST_P(DBIteratorTest, MemtableOpsScanFlushTriggerWithSeek) {
+  // Tests that option memtable_op_scan_flush_trigger works when the limit
+  // is reached during a Seek() operation.
+  const int kTrigger = 10;
+  Random* r = Random::GetTLSInstance();
+
+  for (int trigger : {kTrigger, kTrigger + 1}) {
+    for (bool delete_only : {false, true}) {
+      Options options;
+      options.create_if_missing = true;
+      options.memtable_op_scan_flush_trigger = trigger;
+      options.level_compaction_dynamic_level_bytes = true;
+      DestroyAndReopen(options);
+
+      // Base data that will be covered by a consecutive sequence of tombstones.
+      int kNumKeys = delete_only ? kTrigger : kTrigger / 2;
+      for (int i = 0; i < kNumKeys; ++i) {
+        ASSERT_OK(Put(Key(i), r->RandomString(100)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+      ASSERT_EQ(1, NumTableFilesAtLevel(6));
+
+      if (delete_only) {
+        for (int i = 0; i < kNumKeys; ++i) {
+          ASSERT_OK(SingleDelete(Key(i)));
+        }
+      } else {
+        for (int i = 0; i < kNumKeys; ++i) {
+          ASSERT_OK(Put(Key(i), r->RandomString(100)));
+        }
+        for (int i = 0; i < kNumKeys; ++i) {
+          ASSERT_OK(Delete(Key(i)));
+        }
+      }
+
+      SetPerfLevel(PerfLevel::kEnableCount);
+      get_perf_context()->Reset();
+      ReadOptions ro;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+      // Seek to the first key, this will scan through all the tombstones and
+      // hidden puts
+      iter->Seek(Key(0));
+      ASSERT_FALSE(
+          iter->Valid());  // All keys are deleted, so iterator is not valid
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(get_perf_context()->next_on_memtable_count, kTrigger);
+
+      // Skipping kNumTrigger memtable entries in a single iterator operation
+      // should mark the memtable for flush.
+      //
+      // At the end of a write, we check and update memtable to request a flush
+      ASSERT_OK(Put(Key(11), "val"));
+      // Before a write, we schedule memtables for flush if requested.
+      ASSERT_OK(Put(Key(12), "val"));
+      ASSERT_OK(db_->WaitForCompact({}));
+
+      if (trigger <= kTrigger) {
+        // Check if memtable was flushed due to scan trigger
+        ASSERT_EQ(1, NumTableFilesAtLevel(0));
+        uint64_t val = 0;
+        ASSERT_TRUE(
+            db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val));
+        ASSERT_EQ(0, val);
+      } else {
+        ASSERT_EQ(0, NumTableFilesAtLevel(0));
+        uint64_t val = 0;
+        ASSERT_TRUE(
+            db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val));
+        ASSERT_EQ(kNumKeys, val);
+      }
+    }
+  }
+}
+
+TEST_P(DBIteratorTest, MemtableOpsScanFlushTriggerWithNext) {
+  // Tests that option memtable_op_scan_flush_trigger works when the limit
+  // is reached during a Next() operation, and not trigger a flush when
+  // the limit is reached across multiple Next() operations.
+  const int kTrigger = 10;
+  Random* r = Random::GetTLSInstance();
+
+  for (int trigger : {kTrigger, kTrigger + 1}) {
+    for (bool delete_only : {false, true}) {
+      Options options;
+      options.create_if_missing = true;
+      options.memtable_op_scan_flush_trigger = trigger;
+      options.level_compaction_dynamic_level_bytes = true;
+      DestroyAndReopen(options);
+
+      // Base data that will be covered by a consecutive sequence of tombstones.
+      int kNumKeys = delete_only ? kTrigger : kTrigger / 2;
+      for (int i = 0; i <= kNumKeys; ++i) {
+        ASSERT_OK(Put(Key(i), r->RandomString(100)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+      ASSERT_EQ(1, NumTableFilesAtLevel(6));
+
+      ASSERT_OK(Put(Key(0), "val"));
+      if (delete_only) {
+        for (int i = 1; i <= kNumKeys; ++i) {
+          ASSERT_OK(SingleDelete(Key(i)));
+        }
+      } else {
+        for (int i = 1; i <= kNumKeys; ++i) {
+          ASSERT_OK(Put(Key(i), r->RandomString(100)));
+        }
+        for (int i = 1; i <= kNumKeys; ++i) {
+          ASSERT_OK(Delete(Key(i)));
+        }
+      }
+
+      // Total number of tombstones and hidden puts scanned across multiple
+      // Next() operations below will be kTrigger, and it should not trigger a
+      // flush when the limit is kTrigger + 1.
+      ASSERT_OK(Put(Key(kNumKeys + 1), "v1"));
+      ASSERT_OK(Delete(Key(kNumKeys + 2)));
+      ASSERT_OK(Put(Key(kNumKeys + 3), "v3"));
+
+      SetPerfLevel(PerfLevel::kEnableCount);
+      get_perf_context()->Reset();
+      ReadOptions ro;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+      iter->Seek(Key(0));
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ(iter->value(), "val");
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(get_perf_context()->next_on_memtable_count, 0);
+      iter->Next();
+      // kTrigger tombstones and invisible puts and 1 for the visible put
+      ASSERT_EQ(get_perf_context()->next_on_memtable_count, kTrigger + 1);
+      iter->Next();
+      ASSERT_EQ(get_perf_context()->next_on_memtable_count, kTrigger + 3);
+
+      // Skipping kNumTrigger memtable entries in a single iterator operation
+      // should mark the memtable for flush.
+      //
+      // At the end of a write, we check and update memtable to request a flush
+      ASSERT_OK(Put(Key(11), "val"));
+      // Before a write, we schedule memtables for flush if requested.
+      ASSERT_OK(Put(Key(12), "val"));
+      ASSERT_OK(db_->WaitForCompact({}));
+
+      if (trigger <= kTrigger) {
+        // Check if memtable was flushed due to scan trigger
+        ASSERT_EQ(1, NumTableFilesAtLevel(0));
+        uint64_t val = 0;
+        ASSERT_TRUE(
+            db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val));
+        ASSERT_EQ(0, val);
+      } else {
+        uint64_t val = 0;
+        ASSERT_TRUE(
+            db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val));
+        ASSERT_EQ(kNumKeys + 1, val);
+      }
+    }
+  }
+}
+
+TEST_P(DBIteratorTest, AverageMemtableOpsScanFlushTrigger) {
+  // Tests option memtable_avg_op_scan_flush_trigger with
+  // long tombstone sequences.
+  Random* r = Random::GetTLSInstance();
+
+  const int kAvgTrigger = 10;
+  const int kMaxTrigger = 500;
+  Options options;
+  options.create_if_missing = true;
+  options.memtable_op_scan_flush_trigger = kMaxTrigger;
+  options.memtable_avg_op_scan_flush_trigger = kAvgTrigger;
+  options.level_compaction_dynamic_level_bytes = true;
+  DestroyAndReopen(options);
+
+  const int kNumKeys = 1000;
+  // Base data that will be covered by a consecutive sequence of tombstones.
+  for (int i = 0; i < kNumKeys; ++i) {
+    ASSERT_OK(Put(Key(i), r->RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(6));
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    // We issue slightly more deletions than kAvgTrigger between visible keys
+    // to ensure avg skipped entries exceed kAvgTrigger.
+    if (i % (kAvgTrigger + 2) != 0) {
+      ASSERT_OK(SingleDelete(Key(i)));
+    }
+  }
+
+  // Each operation, except the first Seek, is expected to see kAvgTrigger + 1
+  // tombstones (from the active memtable) before it finds the next visible key.
+  SetPerfLevel(PerfLevel::kEnableCount);
+  get_perf_context()->Reset();
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  iter->Seek(Key(1));
+  ASSERT_EQ(get_perf_context()->next_on_memtable_count, kAvgTrigger + 1);
+  iter.reset();
+  // Should not flush since total entries skipped is below
+  // memtable_op_scan_flush_trigger
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(db_->WaitForCompact({}));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  get_perf_context()->Reset();
+  iter.reset(db_->NewIterator(ReadOptions()));
+  int num_ops = 1;
+  uint64_t num_skipped = 0;
+  iter->Seek(Key(0));
+  ASSERT_EQ(iter->key(), Key(0));
+  uint64_t last_memtable_next_count =
+      get_perf_context()->next_on_memtable_count;
+  iter->Next();
+  num_ops++;
+  while (iter->Valid()) {
+    ASSERT_OK(iter->status());
+    uint64_t num_skipped_in_op =
+        get_perf_context()->next_on_memtable_count - last_memtable_next_count;
+    ASSERT_GE(num_skipped_in_op, kAvgTrigger + 1);
+    last_memtable_next_count = get_perf_context()->next_on_memtable_count;
+    num_skipped += num_skipped_in_op;
+    iter->Next();
+    num_ops++;
+  }
+  // During iterator destruction we mark memtable for flush
+  iter.reset();
+
+  // avg trigger
+  ASSERT_GE(num_skipped, kAvgTrigger * num_ops);
+  // memtable_op_scan_flush_trigger
+  ASSERT_GE(num_skipped, kMaxTrigger);
+  // Average hidden entries scanned from memtable per operation is more than
+  // kAvgTrigger and the total skipped is more than
+  // memtable_op_scan_flush_trigger, the current memtable should be marked for
+  // flush. The following two writes will trigger the flush.
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  // Before a write, we schedule memtables for flush if requested.
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(db_->WaitForCompact({}));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_P(DBIteratorTest, AverageMemtableOpsScanFlushTriggerByOverwrites) {
+  // Tests option memtable_avg_op_scan_flush_trigger with overwrites to keys.
+  Random* r = Random::GetTLSInstance();
+
+  const int kAvgTrigger = 25;
+  Options options;
+  options.create_if_missing = true;
+  options.memtable_op_scan_flush_trigger = 250;
+  options.memtable_avg_op_scan_flush_trigger = kAvgTrigger;
+  options.level_compaction_dynamic_level_bytes = true;
+  DestroyAndReopen(options);
+
+  const int kNumKeys = 100;
+  // Base data that will be covered by a consecutive sequence of tombstones.
+  for (int i = 0; i < kNumKeys; ++i) {
+    ASSERT_OK(Put(Key(i), r->RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(6));
+
+  // One visible key every 10 keys.
+  // Each non-visible user key has 3 non-visible entries in the active memtable.
+  for (int i = 0; i < kNumKeys; ++i) {
+    if (i % 10 != 0) {
+      ASSERT_OK(Put(Key(i), r->RandomString(50)));
+      ASSERT_OK(Put(Key(i), r->RandomString(50)));
+      ASSERT_OK(Delete(Key(i)));
+    }
+  }
+
+  SetPerfLevel(PerfLevel::kEnableCount);
+  get_perf_context()->Reset();
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+  iter->Seek(Key(1));
+  ASSERT_GT(get_perf_context()->next_on_memtable_count, kAvgTrigger);
+  // Re-seek to trigger check for flush trigger
+  iter->Seek(Key(1));
+  // Should not flush since total entries skipped is below
+  // memtable_op_scan_flush_trigger
+  ASSERT_FALSE(static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+                   ->cfd()
+                   ->mem()
+                   ->IsMarkedForFlush());
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(db_->WaitForCompact({}));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  get_perf_context()->Reset();
+
+  int num_ops = 1;
+  iter->Seek(Key(1));
+  while (iter->Valid()) {
+    num_ops++;
+    iter->Next();
+  }
+  ASSERT_GT(get_perf_context()->next_on_memtable_count, num_ops * kAvgTrigger);
+
+  // Re-seek should check conditions for marking memtable for flush
+  iter->Seek(Key(80));
+
+  // Average hidden entries scanned from memtable per operation is 2.
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  // Before a write, we schedule memtables for flush if requested.
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(db_->WaitForCompact({}));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+class DBMultiScanIteratorTest : public DBTestBase,
+                                public ::testing::WithParamInterface<bool> {
+ public:
+  DBMultiScanIteratorTest()
+      : DBTestBase("db_multi_scan_iterator_test", /*env_do_fsync=*/true) {}
+};
+
+// Param 0: ReadOptions::fill_cache
+INSTANTIATE_TEST_CASE_P(DBMultiScanIteratorTest, DBMultiScanIteratorTest,
+                        ::testing::Bool());
+
+TEST_P(DBMultiScanIteratorTest, BasicTest) {
+  auto options = CurrentOptions();
+  DestroyAndReopen(options);
+
+  // Create a file
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(Put("k" + ss.str(), "val" + ss.str()));
+  }
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> key_ranges({"k03", "k10", "k25", "k50"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
+        ASSERT_LT(it.first.ToString().compare(key_ranges[idx + 1]), 0);
+        count++;
+      }
+      idx += 2;
+    }
+    ASSERT_EQ(count, 32);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, MixedBoundsTest) {
+  auto options = CurrentOptions();
+  DestroyAndReopen(options);
+  // Create a file
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(Put("k" + ss.str(), "val" + ss.str()));
+  }
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> key_ranges(
+      {"k03", "k10", "k25", "k50", "k75", "k90"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2]);
+  scan_options.insert(key_ranges[4], key_ranges[5]);
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(
+            it.first.ToString().compare(
+                scan_options.GetScanRanges()[idx].range.start->ToString()),
+            0);
+        if (scan_options.GetScanRanges()[idx].range.limit) {
+          ASSERT_LT(
+              it.first.ToString().compare(
+                  scan_options.GetScanRanges()[idx].range.limit->ToString()),
+              0);
+        }
+        count++;
+      }
+      idx++;
+    }
+    ASSERT_EQ(count, 97);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(key_ranges[0]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  scan_options.insert(key_ranges[4]);
+  iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(
+            it.first.ToString().compare(
+                scan_options.GetScanRanges()[idx].range.start->ToString()),
+            0);
+        if (scan_options.GetScanRanges()[idx].range.limit) {
+          ASSERT_LT(
+              it.first.ToString().compare(
+                  scan_options.GetScanRanges()[idx].range.limit->ToString()),
+              0);
+        }
+        count++;
+      }
+      idx++;
+    }
+    ASSERT_EQ(count, 147);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, RangeAcrossFiles) {
+  auto options = CurrentOptions();
+  options.target_file_size_base = 100 << 10;  // 20KB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  auto rnd = Random::GetTLSInstance();
+  // Write ~200KB data
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put(Key(i), rnd->RandomString(2 << 10)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_EQ(2, NumTableFilesAtLevel(49));
+  std::vector<std::string> key_ranges({Key(10), Key(90)});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int i = 10;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_EQ(it.first.ToString(), Key(i));
+        ++i;
+      }
+    }
+    ASSERT_EQ(i, 90);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, FailureTest) {
+  auto options = CurrentOptions();
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // Create a file
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(Put("k" + ss.str(), rnd.RandomString(1024)));
+  }
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> key_ranges({"k04", "k06", "k12", "k14"});
+  ReadOptions ro;
+  Slice ub;
+  ro.iterate_upper_bound = &ub;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  scan_options.max_prefetch_size = 4500;
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Prepare(scan_options);
+  int count = 0;
+  ub = key_ranges[1];
+  iter->Seek(key_ranges[0]);
+  while (iter->status().ok() && iter->Valid()) {
+    ASSERT_GE(iter->key().compare(key_ranges[0]), 0);
+    ASSERT_LT(iter->key().compare(key_ranges[1]), 0);
+    count++;
+    iter->Next();
+  }
+  ASSERT_OK(iter->status()) << iter->status().ToString();
+  ASSERT_EQ(count, 2);
+
+  // Second seek should hit the max_prefetch_size limit
+  ub = key_ranges[3];
+  iter->Seek(key_ranges[2]);
+  ASSERT_NOK(iter->status());
+  iter.reset();
+
+  // Test the case of unexpected Seek key
+  iter.reset(dbfull()->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  scan_options.max_prefetch_size = 0;
+  iter->Prepare(scan_options);
+  ub = key_ranges[3];
+  iter->Seek(key_ranges[2]);
+  ASSERT_NOK(iter->status());
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, OutOfL0FileRange) {
+  // Test that prepare does not fail scan when a scan range
+  // is outside of a L0 file's key range.
+  auto options = CurrentOptions();
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // Create a Lmax file
+  // key01 ~ key99
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(Put("k" + ss.str(), rnd.RandomString(1024)));
+  }
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Create a L0 file
+  // key00 ~ key09
+  for (int i = 0; i < 10; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(Put("k" + ss.str(), rnd.RandomString(1024)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+  // The second range is outside of L0 file's key range
+  std::vector<std::string> key_ranges({"k04", "k06", "k12", "k14"});
+  ReadOptions ro;
+  Slice ub;
+  ro.iterate_upper_bound = &ub;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Prepare(scan_options);
+  int count = 0;
+  ub = key_ranges[1];
+  iter->Seek(key_ranges[0]);
+  while (iter->status().ok() && iter->Valid()) {
+    ASSERT_GE(iter->key().compare(key_ranges[0]), 0);
+    ASSERT_LT(iter->key().compare(key_ranges[1]), 0);
+    count++;
+    iter->Next();
+  }
+  ASSERT_OK(iter->status()) << iter->status().ToString();
+  ASSERT_EQ(count, 2);
+
+  ub = key_ranges[3];
+  count = 0;
+  iter->Seek(key_ranges[2]);
+  while (iter->status().ok() && iter->Valid()) {
+    ASSERT_GE(iter->key().compare(key_ranges[2]), 0);
+    ASSERT_LT(iter->key().compare(key_ranges[3]), 0);
+    count++;
+    iter->Next();
+  }
+  ASSERT_OK(iter->status()) << iter->status().ToString();
+  ASSERT_EQ(count, 2);
+}
+
+TEST_P(DBMultiScanIteratorTest, RangeBetweenFiles) {
+  auto options = CurrentOptions();
+  options.target_file_size_base = 100 << 10;  // 20KB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  auto rnd = Random::GetTLSInstance();
+  // Write ~200KB data
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put(Key(i), rnd->RandomString(2 << 10)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_EQ(2, NumTableFilesAtLevel(49));
+
+  // Test with a scan range that overlaps an entire file, with upper bound
+  // between 2 files
+  std::vector<LiveFileMetaData> file_meta;
+  dbfull()->GetLiveFilesMetaData(&file_meta);
+  ASSERT_EQ(file_meta.size(), 2);
+  std::vector<std::string> key_ranges(4);
+  key_ranges[0] = file_meta[0].smallestkey;
+  key_ranges[1] = file_meta[0].largestkey + "0";
+  key_ranges[2] = file_meta[1].smallestkey + "0";
+  key_ranges[3] = file_meta[1].largestkey;
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString(), key_ranges[0]);
+      }
+    }
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+
+  // Test multiscan with a range entirely between adjacent files
+  key_ranges[0] = file_meta[0].largestkey + "0";
+  key_ranges[1] = file_meta[0].largestkey + "1";
+  key_ranges[2] = file_meta[1].smallestkey + "0";
+  key_ranges[3] = file_meta[1].largestkey;
+  (*scan_options).clear();
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString(), key_ranges[0]);
+      }
+    }
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+}
+
+// This test case tests multiscan in the presence of fragmented range
+// tombstones in the LSM.
+TEST_P(DBMultiScanIteratorTest, FragmentedRangeTombstones) {
+  auto options = CurrentOptions();
+  // Compaction may create files 2x the target_file_size_base,
+  // so set this to 50KB so we atleast end up with 2 files of
+  // 100KB
+  options.target_file_size_base = 50 << 10;  // 50KB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  // Setup the LSM as follows -
+  // 1. Ingest a file with 100 keys
+  // 2. Ingest a file with one overlapping key
+  // 3. Do a Put and flush a file to L0 with one overlapping key
+  // 4. Ingest a standalone delete range file that covers the full key space
+  //    and a file with the same 100 keys with new values. This will ingest
+  //    into L0 due to the presence of an existing file in L0
+  // The final LSM will have an SST in Lmax with 100 keys, and 2 SST files
+  // in Lmax-1 with half the keys each and completely overlapping delete ranges
+  std::unordered_map<std::string, std::string> kvs;
+  auto rnd = Random::GetTLSInstance();
+  auto create_ingestion_data_file_and_update_key_value =
+      [&](const std::string& filename, int start_key, int end_key) {
+        std::unique_ptr<SstFileWriter> writer;
+        writer.reset(new SstFileWriter(EnvOptions(), options));
+        ASSERT_OK(writer->Open(filename));
+        for (int i = start_key; i < end_key; ++i) {
+          auto kiter = kvs.find(Key(i));
+          if (kiter != kvs.end()) {
+            kvs.erase(kiter);
+          }
+          auto res =
+              kvs.emplace(std::make_pair(Key(i), rnd->RandomString(2 << 10)));
+          ASSERT_OK(writer->Put(res.first->first, res.first->second));
+        }
+        ASSERT_OK(writer->Finish());
+        writer.reset();
+      };
+
+  CreateColumnFamilies({"new_cf"}, options);
+  std::string ingest_file = dbname_ + "test.sst";
+  // Write ~200KB data
+  create_ingestion_data_file_and_update_key_value(ingest_file + "_0", 0, 100);
+  create_ingestion_data_file_and_update_key_value(ingest_file + "_1", 50, 51);
+  ColumnFamilyHandle* cfh = handles_[0];
+  IngestExternalFileOptions ifo;
+  Status s = dbfull()->IngestExternalFile(
+      cfh, {ingest_file + "_0", ingest_file + "_1"}, ifo);
+  ASSERT_OK(s);
+
+  ASSERT_OK(Put(0, Key(50), rnd->RandomString(2 << 10)));
+  ASSERT_OK(Flush());
+
+  {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options));
+    ASSERT_OK(writer->Open(ingest_file + "_2"));
+    ASSERT_OK(writer->DeleteRange("a", "z"));
+    ASSERT_OK(writer->Finish());
+    writer.reset();
+  }
+  create_ingestion_data_file_and_update_key_value(ingest_file + "_3", 0, 100);
+  s = dbfull()->IngestExternalFile(
+      cfh, {ingest_file + "_2", ingest_file + "_3"}, ifo);
+  ASSERT_OK(s);
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(cfh, &cf_meta);
+  // Only the L0 with range deletion is compacted.
+  ASSERT_EQ(1, cf_meta.levels[0].files.size());
+  ASSERT_EQ(0, cf_meta.levels[0].files[0].num_deletions);
+
+  // The first scan range overlaps the DB key range, while the second extends
+  // beyond but overlaps the delete range
+  std::vector<std::string> key_ranges({"key000085", "key000090", "l", "n"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int i = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString(), key_ranges[i]);
+        ASSERT_LT(it.first.ToString(), key_ranges[i + 1]);
+        auto kiter = kvs.find(it.first.ToString());
+        ASSERT_NE(kiter, kvs.end());
+        ASSERT_EQ(kiter->second, it.second.ToString());
+        count++;
+      }
+      i += 2;
+    }
+    ASSERT_EQ(i, 4);
+    ASSERT_EQ(count, 5);
+  } catch (MultiScanException& ex) {
+    ASSERT_OK(ex.status());
+  }
+  iter.reset();
+
+  // The second scan range start overlaps the delete range in the first file
+  // in Lmax-1, while the end overlaps the keys in the second file
+  (*scan_options).clear();
+  key_ranges[0] = "key000010";
+  key_ranges[1] = "key000020";
+  key_ranges[2] = "key0000500";
+  key_ranges[3] = "key000060";
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int i = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString(), key_ranges[i]);
+        ASSERT_LT(it.first.ToString(), key_ranges[i + 1]);
+        auto kiter = kvs.find(it.first.ToString());
+        ASSERT_NE(kiter, kvs.end());
+        ASSERT_EQ(kiter->second, it.second.ToString());
+        count++;
+      }
+      i += 2;
+    }
+    ASSERT_EQ(i, 4);
+    ASSERT_EQ(count, 19);
+  } catch (MultiScanException& ex) {
+    ASSERT_OK(ex.status());
+  }
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, ReseekAcrossBlocksSameUserKey) {
+  // This test exposes a bug where multiscan reseeks backwards when
+  // max_sequential_skip_in_iterations is triggered with the same user key
+  // spanning multiple data blocks.
+
+  auto options = CurrentOptions();
+  options.max_sequential_skip_in_iterations = 3;
+  options.compression = kNoCompression;
+
+  // Force each internal key into its own block
+  BlockBasedTableOptions table_options;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // Taking a snapshot after each Put to preserve all versions during flush.
+  std::vector<const Snapshot*> snapshots;
+  for (int i = 0; i < 7; ++i) {
+    ASSERT_OK(Put("key_a", "value_" + std::to_string(i)));
+    snapshots.push_back(db_->GetSnapshot());
+  }
+  ASSERT_OK(Put("key_b", "value_b"));
+
+  ASSERT_OK(Flush());
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // Setup multiscan range covering both keys
+  std::vector<std::string> key_ranges({"key_a", "key_c"});
+  ReadOptions ro;
+  Slice ub = key_ranges[1];
+  ro.iterate_upper_bound = &ub;
+  ro.fill_cache = GetParam();
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Prepare(scan_options);
+
+  std::vector<std::string> seen_keys;
+  std::vector<std::string> seen_values;
+  iter->Seek(key_ranges[0]);
+  while (iter->status().ok() && iter->Valid()) {
+    seen_keys.push_back(iter->key().ToString());
+    seen_values.push_back(iter->value().ToString());
+    iter->Next();
+  }
+  ASSERT_OK(iter->status()) << iter->status().ToString();
+
+  ASSERT_EQ(seen_keys.size(), 2) << "Should see key_a and key_b";
+  ASSERT_EQ(seen_keys[0], "key_a");
+  ASSERT_EQ(seen_keys[1], "key_b");
+  ASSERT_EQ(seen_values[0], "value_6");
+  ASSERT_EQ(seen_values[1], "value_b");
+
+  for (auto* snapshot : snapshots) {
+    db_->ReleaseSnapshot(snapshot);
+  }
+}
+
+TEST_P(DBMultiScanIteratorTest, AsyncPrefetchAcrossMultipleFiles) {
+  // Test async prefetch with multiple ranges within a single file
+  auto options = CurrentOptions();
+  options.target_file_size_base = 1 << 15;  // 32KiB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  options.statistics = CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  Random rnd(303);
+
+  // Create a single large file with many keys
+  // ~1MiB of data
+  // Should be lots of files now
+  for (int i = 0; i < 1000; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    // 1KiB values
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+
+  ASSERT_GT(NumTableFilesAtLevel(49), 3);
+
+  // Set up multiple non-overlapping ranges in the same file
+  // Every 32 values should be a file or so
+  std::vector<std::string> key_ranges(
+      {"k00000", "k00100", "k00500", "k00600", "k00800", "k00900"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = true;
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  scan_options.insert(key_ranges[4], key_ranges[5]);
+
+  auto read_count_before =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
+  auto read_count_after =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  ASSERT_EQ(read_count_after, read_count_before);
+
+  // Verify all three ranges can be scanned successfully
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        it.first.ToString();
+      }
+    }
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, AsyncPrefetchMultipleLevels) {
+  // Test async prefetch with files in L0 and non-L0 levels
+  // Similar setup to AsyncPrefetchAcrossMultipleFiles but with L0 files
+  auto options = CurrentOptions();
+  options.target_file_size_base = 1 << 15;  // 32KiB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  options.statistics = CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  Random rnd(304);
+
+  // Create base files and compact to bottom level - ~500KiB of data
+  for (int i = 0; i < 500; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+
+  // Verify we have files at bottom level
+  ASSERT_GT(NumTableFilesAtLevel(49), 0);
+
+  // Create additional L0 files with overlapping key ranges
+  for (int i = 100; i < 150; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));
+  }
+  ASSERT_OK(Flush());
+
+  // Verify we now have files in both L0 and bottom level
+  ASSERT_GT(NumTableFilesAtLevel(0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(49), 0);
+
+  // Set up multiple non-overlapping ranges
+  std::vector<std::string> key_ranges(
+      {"k00000", "k00100", "k00200", "k00300", "k00400", "k00500"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = true;
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  scan_options.insert(key_ranges[4], key_ranges[5]);
+
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
+
+  // Verify all three ranges can be scanned successfully
+  int total_keys = 0;
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        it.first.ToString();
+        total_keys++;
+      }
+    }
+  } catch (MultiScanException& ex) {
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+
+  // Should have keys from all three ranges
+  ASSERT_GT(total_keys, 0);
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, AsyncPrefetchWithDeleteRange) {
+  // Test async prefetch with delete ranges
+  auto options = CurrentOptions();
+  options.target_file_size_base = 1 << 15;  // 32KiB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  Random rnd(305);
+
+  // Create base data - ~500KiB
+  for (int i = 0; i < 500; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));
+  }
+  ASSERT_OK(Flush());
+
+  // Add delete ranges
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), dbfull()->DefaultColumnFamily(),
+                             "k00100", "k00200"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_GT(NumTableFilesAtLevel(49), 0);
+
+  // Set up scan ranges that interact with delete ranges
+  std::vector<std::string> key_ranges({"k00000", "k00500"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = true;
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
+
+  // Verify ranges can be scanned successfully
+  int total_keys = 0;
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        std::string key = it.first.ToString();
+        // Verify deleted keys are not returned
+        ASSERT_TRUE((key < "k00100" || key >= "k00200"));
+        total_keys++;
+      }
+    }
+  } catch (MultiScanException& ex) {
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+
+  // Should have keys excluding deleted ranges
+  ASSERT_EQ(total_keys, 400);
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, AsyncPrefetchWithExternalFileIngestion) {
+  // Test async prefetch with externally ingested files
+  auto options = CurrentOptions();
+  options.target_file_size_base = 1 << 15;  // 32KiB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  Random rnd(306);
+
+  // Create base data - ~200KiB
+  for (int i = 0; i < 200; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+
+  // Create and ingest external SST file with new data
+  std::string ingest_file = dbname_ + "/test_ingest.sst";
+  {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options));
+    ASSERT_OK(writer->Open(ingest_file));
+    for (int i = 300; i < 500; ++i) {
+      std::stringstream ss;
+      ss << "k" << std::setw(5) << std::setfill('0') << i;
+      ASSERT_OK(writer->Put(ss.str(), rnd.RandomString(1 << 10)));
+    }
+    ASSERT_OK(writer->Finish());
+  }
+
+  IngestExternalFileOptions ifo;
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  ASSERT_OK(dbfull()->IngestExternalFile(cfh, {ingest_file}, ifo));
+
+  // Set up scan ranges that span both regular and ingested files
+  std::vector<std::string> key_ranges({"k00000", "k00500"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = true;
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
+
+  // Verify all ranges can be scanned successfully
+  int total_keys = 0;
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        it.first.ToString();
+        total_keys++;
+      }
+    }
+  } catch (MultiScanException& ex) {
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+
+  ASSERT_EQ(total_keys, 400);
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, IODispatcherStatsVerification) {
+  // Test that verifies all IOs go through the IODispatcher by checking stats
+  auto options = CurrentOptions();
+  options.target_file_size_base = 1 << 15;  // 32KiB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  Random rnd(307);
+
+  // Create data - enough to create multiple data blocks
+  for (int i = 0; i < 500; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));  // 1KiB values
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+
+  // Set up scan ranges
+  std::vector<std::string> key_ranges({"k00000", "k00200", "k00300", "k00400"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  // Create a tracking IODispatcher to verify IO statistics
+  auto tracking_dispatcher = std::make_shared<TrackingIODispatcher>();
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = false;  // Use sync IO for predictable stats
+  scan_options.io_dispatcher = tracking_dispatcher;
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
+
+  // Scan through all data
+  int total_keys = 0;
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        it.first.ToString();
+        total_keys++;
+      }
+    }
+  } catch (MultiScanException& ex) {
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+
+  // We scanned ~200 keys in range 1 and ~100 keys in range 2
+  ASSERT_EQ(total_keys, 300);
+
+  // Verify that IO operations went through the IODispatcher
+  // The total IO operations should be > 0 (either sync reads, async reads, or
+  // cache hits)
+  uint64_t total_ops = tracking_dispatcher->GetTotalIOOperations();
+  ASSERT_GT(total_ops, 0) << "Expected some IO operations through IODispatcher";
+
+  // Verify that we have at least one ReadSet created
+  ASSERT_GT(tracking_dispatcher->GetReadSets().size(), 0)
+      << "Expected at least one ReadSet to be created";
+
+  // Since we used sync IO, we should have sync reads (or cache hits if cached)
+  uint64_t sync_reads = tracking_dispatcher->GetTotalSyncReads();
+  uint64_t cache_hits = tracking_dispatcher->GetTotalCacheHits();
+  ASSERT_GT(sync_reads + cache_hits, 0)
+      << "Expected sync reads or cache hits for sync IO mode";
+
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, IODispatcherPrefetchKnownBlocks) {
+  // Test that verifies we prefetch a known/expected number of blocks.
+  // Uses FlushBlockEveryKeyPolicyFactory to create exactly one block per key,
+  // making the block count predictable and verifiable.
+  auto options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+
+  // Configure to create exactly one block per key
+  BlockBasedTableOptions table_options;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  // Use a block cache (required by IODispatcher), but use a fresh one
+  // that won't have any cached data
+  table_options.block_cache = NewLRUCache(10 * 1024 * 1024);  // 10MB cache
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // Create exactly 100 keys, each in its own block
+  const int kNumKeys = 100;
+  const int kValueSize = 100;  // Fixed value size for predictability
+  std::string value(kValueSize, 'v');
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(3) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), value));
+  }
+  ASSERT_OK(Flush());
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  // Create a tracking IODispatcher to verify IO statistics
+  auto tracking_dispatcher = std::make_shared<TrackingIODispatcher>();
+
+  // Define scan ranges with known block counts:
+  // Range 1: k000 to k020 (20 keys = 20 blocks)
+  // Range 2: k050 to k060 (10 keys = 10 blocks)
+  // Total expected blocks to read: 30
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = false;  // Use sync IO for predictable stats
+  scan_options.io_dispatcher = tracking_dispatcher;
+  scan_options.insert("k000", "k020");
+  scan_options.insert("k050", "k060");
+
+  ReadOptions ro;
+  ro.fill_cache = false;  // Don't fill cache, ensure fresh reads
+
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
+
+  // Scan through all data and count keys
+  int total_keys = 0;
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        it.first.ToString();
+        total_keys++;
+      }
+    }
+  } catch (MultiScanException& ex) {
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+
+  // Verify we scanned the expected number of keys
+  // Range 1: k000-k019 = 20 keys, Range 2: k050-k059 = 10 keys
+  ASSERT_EQ(total_keys, 30) << "Expected 30 keys from two ranges";
+
+  // Verify IODispatcher statistics
+  uint64_t total_ops = tracking_dispatcher->GetTotalIOOperations();
+  uint64_t sync_reads = tracking_dispatcher->GetTotalSyncReads();
+
+  // We should have at least as many IO operations as blocks we need to read
+  // (could be more due to index/filter blocks)
+  ASSERT_GE(total_ops, 30)
+      << "Expected at least 30 IO operations for 30 data blocks";
+
+  // Since cache is fresh and fill_cache=false, all should be sync reads
+  ASSERT_GE(sync_reads, 30)
+      << "Expected at least 30 sync reads for 30 data blocks";
+
+  // Verify we created ReadSets (one per range)
+  size_t num_readsets = tracking_dispatcher->GetReadSets().size();
+  ASSERT_GE(num_readsets, 1) << "Expected at least one ReadSet";
+
+  // Log the stats for debugging
+  std::cout << "IODispatcher Stats: total_ops=" << total_ops
+            << ", sync_reads=" << sync_reads
+            << ", async_reads=" << tracking_dispatcher->GetTotalAsyncReads()
+            << ", cache_hits=" << tracking_dispatcher->GetTotalCacheHits()
+            << ", readsets=" << num_readsets << std::endl;
+
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, IODispatcherCacheHitVerification) {
+  // Test that verifies cache hits are properly tracked through IODispatcher.
+  // First scan populates cache, second scan should show cache hits.
+  auto options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+
+  BlockBasedTableOptions table_options;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  // Enable block cache with enough space for all blocks
+  table_options.block_cache = NewLRUCache(10 * 1024 * 1024);  // 10MB cache
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // Create 50 keys, each in its own block
+  const int kNumKeys = 50;
+  std::string value(100, 'v');
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(3) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), value));
+  }
+  ASSERT_OK(Flush());
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  // First scan: populate the cache
+  {
+    auto dispatcher1 = std::make_shared<TrackingIODispatcher>();
+    MultiScanArgs scan_options(BytewiseComparator());
+    scan_options.use_async_io = false;
+    scan_options.io_dispatcher = dispatcher1;
+    scan_options.insert("k000", "k025");  // 25 keys
+
+    ReadOptions ro;
+    ro.fill_cache = true;  // Fill cache on first scan
+
+    std::unique_ptr<MultiScan> iter =
+        dbfull()->NewMultiScan(ro, cfh, scan_options);
+    ASSERT_NE(iter, nullptr);
+
+    int count = 0;
+    try {
+      for (auto range : *iter) {
+        for (auto it : range) {
+          it.first.ToString();
+          count++;
+        }
+      }
+    } catch (MultiScanException& ex) {
+      FAIL() << "First scan failed: " << ex.what();
+    }
+    ASSERT_EQ(count, 25);
+
+    // First scan should have sync reads (cache was empty)
+    uint64_t first_sync = dispatcher1->GetTotalSyncReads();
+    ASSERT_GE(first_sync, 25) << "First scan should have sync reads";
+
+    std::cout << "First scan stats: sync_reads=" << first_sync
+              << ", cache_hits=" << dispatcher1->GetTotalCacheHits()
+              << std::endl;
+  }
+
+  // Second scan: should get cache hits
+  {
+    auto dispatcher2 = std::make_shared<TrackingIODispatcher>();
+    MultiScanArgs scan_options(BytewiseComparator());
+    scan_options.use_async_io = false;
+    scan_options.io_dispatcher = dispatcher2;
+    scan_options.insert("k000", "k025");  // Same range as before
+
+    ReadOptions ro;
+    ro.fill_cache = true;
+
+    std::unique_ptr<MultiScan> iter =
+        dbfull()->NewMultiScan(ro, cfh, scan_options);
+    ASSERT_NE(iter, nullptr);
+
+    int count = 0;
+    try {
+      for (auto range : *iter) {
+        for (auto it : range) {
+          it.first.ToString();
+          count++;
+        }
+      }
+    } catch (MultiScanException& ex) {
+      FAIL() << "Second scan failed: " << ex.what();
+    }
+    ASSERT_EQ(count, 25);
+
+    // Second scan should have cache hits (blocks were cached in first scan)
+    uint64_t second_cache_hits = dispatcher2->GetTotalCacheHits();
+    uint64_t second_sync = dispatcher2->GetTotalSyncReads();
+
+    std::cout << "Second scan stats: sync_reads=" << second_sync
+              << ", cache_hits=" << second_cache_hits << std::endl;
+
+    // We expect cache hits on the second scan for data blocks
+    // Note: Some blocks might still need sync reads (e.g., if cache was
+    // evicted)
+    ASSERT_GE(second_cache_hits, 20)
+        << "Second scan should have cache hits for most blocks";
+  }
+}
+
+TEST_P(DBMultiScanIteratorTest, WastedBlocksTracking) {
+  // Test that verifies wasted prefetch blocks are properly tracked.
+  // When blocks are prefetched but skipped (e.g., due to seek), they should
+  // be counted as wasted and recorded to MULTISCAN_PREFETCH_BLOCKS_WASTED.
+  auto options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+
+  BlockBasedTableOptions table_options;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  table_options.block_cache = NewLRUCache(10 * 1024 * 1024);  // 10MB cache
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // Create 100 keys, each in its own block
+  const int kNumKeys = 100;
+  std::string value(100, 'v');
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(3) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), value));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  // Reset the wasted blocks counter before test
+  options.statistics->setTickerCount(MULTISCAN_PREFETCH_BLOCKS_WASTED, 0);
+
+  // Set up MultiScan with two non-contiguous ranges:
+  // Range 1: k000-k020 (20 keys/blocks)
+  // Range 2: k050-k070 (20 keys/blocks)
+  // The blocks between k020-k050 (30 blocks) should be wasted if prefetched
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = false;
+  scan_options.insert("k000", "k020");
+  scan_options.insert("k050", "k070");
+
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+
+  {
+    std::unique_ptr<MultiScan> iter =
+        dbfull()->NewMultiScan(ro, cfh, scan_options);
+    ASSERT_NE(iter, nullptr);
+
+    int count = 0;
+    try {
+      for (auto range : *iter) {
+        for (auto it : range) {
+          it.first.ToString();
+          count++;
+        }
+      }
+    } catch (MultiScanException& ex) {
+      FAIL() << "Scan failed: " << ex.what();
+    }
+
+    // We should have scanned 40 keys total (20 + 20)
+    ASSERT_EQ(count, 40);
+  }  // Iterator destroyed here, wasted blocks recorded
+
+  // Check that wasted blocks were recorded
+  // The exact count depends on how many blocks were prefetched between ranges
+  uint64_t wasted =
+      options.statistics->getTickerCount(MULTISCAN_PREFETCH_BLOCKS_WASTED);
+
+  // We expect some wasted blocks due to the gap between ranges
+  // The exact number depends on prefetch behavior, but should be > 0
+  // if blocks between k020-k050 were prefetched
+  std::cout << "Wasted blocks: " << wasted << std::endl;
+
+  // Note: The test verifies the tracking mechanism works.
+  // The actual count depends on prefetch heuristics which may vary.
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_kv_checksum_test.cc b/db/db_kv_checksum_test.cc
index 6eea6e5b4ba0..7d18688f0788 100644
--- a/db/db_kv_checksum_test.cc
+++ b/db/db_kv_checksum_test.cc
@@ -312,12 +312,12 @@ TEST_P(DbKvChecksumTest, WriteToWALCorrupted) {
     // Corrupted write batch leads to read-only mode, so we have to
     // reopen for every attempt.
     Reopen(options);
-    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    auto log_size_pre_write = dbfull()->TEST_wals_total_size();
 
     SyncPoint::GetInstance()->EnableProcessing();
     ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
     // Confirm that nothing was written to WAL
-    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size());
     ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
     SyncPoint::GetInstance()->DisableProcessing();
 
@@ -350,12 +350,12 @@ TEST_P(DbKvChecksumTest, WriteToWALWithColumnFamilyCorrupted) {
     // Corrupted write batch leads to read-only mode, so we have to
     // reopen for every attempt.
     ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
-    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    auto log_size_pre_write = dbfull()->TEST_wals_total_size();
 
     SyncPoint::GetInstance()->EnableProcessing();
     ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
     // Confirm that nothing was written to WAL
-    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size());
     ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
     SyncPoint::GetInstance()->DisableProcessing();
 
@@ -487,7 +487,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
     // Reopen DB since it failed WAL write which lead to read-only mode
     Reopen(options);
     SyncPoint::GetInstance()->EnableProcessing();
-    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    auto log_size_pre_write = dbfull()->TEST_wals_total_size();
     leader_batch_and_status =
         GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
                       8 /* protection_bytes_per_key */, op_type1_);
@@ -499,7 +499,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
     SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
     ASSERT_EQ(1, leader_count);
     // Nothing should have been written to WAL
-    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size());
     ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
 
     corrupt_byte_offset++;
@@ -599,7 +599,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
     // Reopen DB since it failed WAL write which lead to read-only mode
     ReopenWithColumnFamilies({kDefaultColumnFamilyName, "ramen"}, options);
     SyncPoint::GetInstance()->EnableProcessing();
-    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    auto log_size_pre_write = dbfull()->TEST_wals_total_size();
     leader_batch_and_status =
         GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
                       8 /* protection_bytes_per_key */, op_type1_);
@@ -612,7 +612,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
 
     ASSERT_EQ(1, leader_count);
     // Nothing should have been written to WAL
-    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size());
     ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
 
     corrupt_byte_offset++;
diff --git a/db/db_log_iter_test.cc b/db/db_log_iter_test.cc
index 17163210e82f..62b1f893d5c2 100644
--- a/db/db_log_iter_test.cc
+++ b/db/db_log_iter_test.cc
@@ -180,13 +180,15 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckWhenArchive) {
 
     ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
     ASSERT_OK(dbfull()->Flush(FlushOptions(), cf));
+    // Try lots of things to ensure callback is triggered
+    ASSERT_OK(dbfull()->TEST_SwitchWAL());
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+    ASSERT_OK(dbfull()->TEST_WaitForPurge());
     delete cf;
-    // Normally hit several times; WART: perhaps more in parallel after flush
-    // FIXME: this test is flaky
-    // ASSERT_TRUE(callback_hit.LoadRelaxed());
+    ASSERT_TRUE(callback_hit.LoadRelaxed());
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    Close();
   } while (ChangeCompactOptions());
-  Close();
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 #endif
 
diff --git a/db/db_logical_block_size_cache_test.cc b/db/db_logical_block_size_cache_test.cc
index ff56d56e370d..a2de4e33e417 100644
--- a/db/db_logical_block_size_cache_test.cc
+++ b/db/db_logical_block_size_cache_test.cc
@@ -67,7 +67,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) {
   options.db_paths = {{data_path_0_, 2048}, {data_path_1_, 2048}};
 
   for (int i = 0; i < 2; i++) {
-    DB* db;
+    std::unique_ptr<DB> db;
     if (!i) {
       printf("Open\n");
       ASSERT_OK(DB::Open(options, dbname_, &db));
@@ -82,7 +82,6 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) {
     ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
     ASSERT_OK(db->Close());
     ASSERT_EQ(0, cache_->Size());
-    delete db;
   }
   ASSERT_OK(DestroyDB(dbname_, options, {}));
 }
@@ -95,7 +94,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) {
   options.env = env_.get();
 
   for (int i = 0; i < 2; i++) {
-    DB* db;
+    std::unique_ptr<DB> db;
     if (!i) {
       printf("Open\n");
       ASSERT_OK(DB::Open(options, dbname_, &db));
@@ -106,7 +105,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) {
     ASSERT_EQ(1, cache_->Size());
     ASSERT_TRUE(cache_->Contains(dbname_));
     ASSERT_EQ(1, cache_->GetRefCount(dbname_));
-    delete db;
+    db.reset();
     ASSERT_EQ(0, cache_->Size());
   }
   ASSERT_OK(DestroyDB(dbname_, options, {}));
@@ -122,7 +121,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) {
   ColumnFamilyOptions cf_options;
   cf_options.cf_paths = {{cf_path_0_, 1024}, {cf_path_1_, 2048}};
 
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, dbname_, &db));
   ASSERT_EQ(1, cache_->Size());
   ASSERT_TRUE(cache_->Contains(dbname_));
@@ -153,7 +152,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) {
   ASSERT_TRUE(cache_->Contains(dbname_));
   ASSERT_EQ(1, cache_->GetRefCount(dbname_));
 
-  delete db;
+  db.reset();
   ASSERT_EQ(0, cache_->Size());
   ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
 }
@@ -173,7 +172,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) {
   ColumnFamilyOptions cf_options;
   cf_options.cf_paths = {{cf_path_0_, 1024}};
 
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, dbname_, &db));
   ASSERT_EQ(1, cache_->Size());
   ASSERT_TRUE(cache_->Contains(dbname_));
@@ -211,7 +210,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) {
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
   ASSERT_TRUE(cache_->Contains(dbname_));
   ASSERT_EQ(1, cache_->GetRefCount(dbname_));
-  delete db;
+  db.reset();
 
   // Now cf_path_0_ in cache_ has been properly decreased and cf_path_0_'s entry
   // is dropped from cache
@@ -233,15 +232,15 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) {
   cf_options.cf_paths = {{cf_path_0_, 1024}};
 
   for (int i = 0; i < 2; i++) {
-    DB* db;
+    std::unique_ptr<DB> db;
+    ASSERT_OK(DB::Open(options, dbname_, &db));
     ColumnFamilyHandle* cf1 = nullptr;
     ColumnFamilyHandle* cf2 = nullptr;
-    ASSERT_OK(DB::Open(options, dbname_, &db));
     ASSERT_OK(db->CreateColumnFamily(cf_options, "cf1", &cf1));
     ASSERT_OK(db->CreateColumnFamily(cf_options, "cf2", &cf2));
     ASSERT_OK(db->DestroyColumnFamilyHandle(cf1));
     ASSERT_OK(db->DestroyColumnFamilyHandle(cf2));
-    delete db;
+    db.reset();
     ASSERT_EQ(0, cache_->Size());
 
     std::vector<ColumnFamilyHandle*> cfs;
@@ -298,7 +297,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) {
     ASSERT_TRUE(cache_->Contains(dbname_));
     ASSERT_EQ(1, cache_->GetRefCount(dbname_));
 
-    delete db;
+    db.reset();
     ASSERT_EQ(0, cache_->Size());
   }
   ASSERT_OK(
@@ -315,7 +314,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) {
   ColumnFamilyOptions cf_options;
   cf_options.cf_paths = {{cf_path_0_, 1024}};
 
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, dbname_, &db));
   ASSERT_EQ(1, cache_->Size());
   ASSERT_TRUE(cache_->Contains(dbname_));
@@ -336,7 +335,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) {
   ASSERT_TRUE(cache_->Contains(cf_path_0_));
   ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
 
-  delete db;
+  db.reset();
   ASSERT_EQ(0, cache_->Size());
 
   // Open with column families.
@@ -369,7 +368,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) {
     ASSERT_TRUE(cache_->Contains(cf_path_0_));
     ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
 
-    delete db;
+    db.reset();
     ASSERT_EQ(0, cache_->Size());
   }
   ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
@@ -384,7 +383,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
 
   ASSERT_OK(env_->CreateDirIfMissing(dbname_));
 
-  DB* db0;
+  std::unique_ptr<DB> db0;
   ASSERT_OK(DB::Open(options, data_path_0_, &db0));
   ASSERT_EQ(1, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_0_));
@@ -399,7 +398,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
   ASSERT_TRUE(cache_->Contains(cf_path_0_));
   ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
 
-  DB* db1;
+  std::unique_ptr<DB> db1;
   ASSERT_OK(DB::Open(options, data_path_1_, &db1));
   ASSERT_EQ(3, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_0_));
@@ -424,7 +423,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
   ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
 
   ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0));
-  delete db0;
+  db0.reset();
   ASSERT_EQ(2, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_1_));
   ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
@@ -433,7 +432,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
   ASSERT_OK(DestroyDB(data_path_0_, options, {{"cf", cf_options0}}));
 
   ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1));
-  delete db1;
+  db1.reset();
   ASSERT_EQ(0, cache_->Size());
   ASSERT_OK(DestroyDB(data_path_1_, options, {{"cf", cf_options1}}));
 }
@@ -450,7 +449,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
 
   ASSERT_OK(env_->CreateDirIfMissing(dbname_));
 
-  DB* db0;
+  std::unique_ptr<DB> db0;
   ASSERT_OK(DB::Open(options, dbname_ + "/db0", &db0));
   ASSERT_EQ(1, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_0_));
@@ -464,7 +463,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
   ASSERT_TRUE(cache_->Contains(cf_path_0_));
   ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
 
-  DB* db1;
+  std::unique_ptr<DB> db1;
   ASSERT_OK(DB::Open(options, dbname_ + "/db1", &db1));
   ASSERT_EQ(2, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_0_));
@@ -481,7 +480,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
   ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
 
   ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0));
-  delete db0;
+  db0.reset();
   ASSERT_EQ(2, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_0_));
   ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
@@ -490,7 +489,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
   ASSERT_OK(DestroyDB(dbname_ + "/db0", options, {{"cf", cf_options}}));
 
   ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1));
-  delete db1;
+  db1.reset();
   ASSERT_EQ(0, cache_->Size());
   ASSERT_OK(DestroyDB(dbname_ + "/db1", options, {{"cf", cf_options}}));
 }
diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc
index 3f7b029572e4..1086401dd3f9 100644
--- a/db/db_memtable_test.cc
+++ b/db/db_memtable_test.cc
@@ -117,8 +117,6 @@ class TestPrefixExtractor : public SliceTransform {
     return separator(key) != nullptr;
   }
 
-  bool InRange(const Slice& /*key*/) const override { return false; }
-
  private:
   const char* separator(const Slice& key) const {
     return static_cast<const char*>(memchr(key.data(), '_', key.size()));
@@ -339,6 +337,135 @@ TEST_F(DBMemTableTest, ColumnFamilyId) {
   }
 }
 
+class DBMemTableTestForSeek : public DBMemTableTest,
+                              virtual public ::testing::WithParamInterface<
+                                  std::tuple<bool, bool, bool>> {};
+
+TEST_P(DBMemTableTestForSeek, IntegrityChecks) {
+  // Validate key corruption could be detected during seek.
+  // We insert many keys into skiplist. Then we corrupt the each key one at a
+  // time. With memtable_veirfy_per_key_checksum_on_seek enabled, when the
+  // corrupted key is searched, the checksum of every key visited during the
+  // seek is validated. It will report data corruption. Otherwise seek returns
+  // not found.
+  auto allow_data_in_error = std::get<0>(GetParam());
+  Options options = CurrentOptions();
+  options.allow_data_in_errors = allow_data_in_error;
+  options.paranoid_memory_checks = std::get<1>(GetParam());
+  options.memtable_veirfy_per_key_checksum_on_seek = std::get<2>(GetParam());
+  options.memtable_protection_bytes_per_key = 8;
+  DestroyAndReopen(options);
+
+  // capture the data pointer of all of the keys
+  std::vector<char*> raw_data_pointer;
+
+  // Insert enough keys, so memtable would create multiple levels.
+  auto key_count = 100;
+  for (int i = 0; i < key_count; i++) {
+    // The last digit of the key will be corrupted from value 0 to value 5
+    ASSERT_OK(Put(Key(i * 10), "val0"));
+  }
+
+  ReadOptions rops;
+
+  // Iterate all the keys to get key pointers
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->SetCallBack("InlineSkipList::Iterator::Next::key",
+                                        [&raw_data_pointer](void* key) {
+                                          auto p = static_cast<char*>(key);
+                                          raw_data_pointer.push_back(p);
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  {
+    std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
+    iter->Seek(Key(0));
+    while (iter->Valid()) {
+      ASSERT_OK(iter->status());
+      iter->Next();
+    }
+    // check status after valid returned false.
+    auto status = iter->status();
+    ASSERT_TRUE(status.ok());
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(raw_data_pointer.size(), key_count);
+
+  bool enable_key_validation_on_seek =
+      options.memtable_veirfy_per_key_checksum_on_seek;
+
+  // For each key, corrupt it, validate corruption is detected correctly, then
+  // revert it.
+  for (int i = 0; i < key_count; i++) {
+    std::string key_to_corrupt = Key(i * 10);
+    raw_data_pointer[i][key_to_corrupt.size()] = '5';
+
+    auto corrupted_key = key_to_corrupt;
+    corrupted_key.data()[key_to_corrupt.size() - 1] = '5';
+    auto corrupted_key_slice =
+        Slice(corrupted_key.data(), corrupted_key.length());
+    auto corrupted_key_hex = corrupted_key_slice.ToString(/*hex=*/true);
+
+    {
+      // Test Get API
+      std::string val;
+      auto status = db_->Get(rops, key_to_corrupt, &val);
+      if (enable_key_validation_on_seek) {
+        ASSERT_TRUE(status.IsCorruption()) << key_to_corrupt;
+        ASSERT_EQ(
+            status.ToString().find(corrupted_key_hex) != std::string::npos,
+            allow_data_in_error)
+            << status.ToString() << "\n"
+            << corrupted_key_hex;
+      } else {
+        ASSERT_TRUE(status.IsNotFound());
+      }
+    }
+
+    {
+      // Test MultiGet API
+      std::vector<std::string> vals;
+      std::vector<Status> statuses = db_->MultiGet(
+          rops, {db_->DefaultColumnFamily()}, {key_to_corrupt}, &vals, nullptr);
+      if (enable_key_validation_on_seek) {
+        ASSERT_TRUE(statuses[0].IsCorruption());
+        ASSERT_EQ(
+            statuses[0].ToString().find(corrupted_key_hex) != std::string::npos,
+            allow_data_in_error);
+      } else {
+        ASSERT_TRUE(statuses[0].IsNotFound());
+      }
+    }
+
+    {
+      // Test Iterator Seek API
+      std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
+      ASSERT_OK(iter->status());
+      iter->Seek(key_to_corrupt);
+      auto status = iter->status();
+      if (enable_key_validation_on_seek) {
+        ASSERT_TRUE(status.IsCorruption());
+        ASSERT_EQ(
+            status.ToString().find(corrupted_key_hex) != std::string::npos,
+            allow_data_in_error);
+      } else {
+        ASSERT_FALSE(iter->Valid());
+        ASSERT_FALSE(status.ok());
+      }
+    }
+
+    // revert the key corruption.
+    raw_data_pointer[i][key_to_corrupt.size()] = '0';
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(DBMemTableTestForSeek, DBMemTableTestForSeek,
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()));
+
 TEST_F(DBMemTableTest, IntegrityChecks) {
   // We insert keys key000000, key000001 and key000002 into skiplist at fixed
   // height 1 (smallest height). Then we corrupt the second key to aey000001 to
@@ -424,6 +551,96 @@ TEST_F(DBMemTableTest, IntegrityChecks) {
     ASSERT_FALSE(iter->Valid());
   }
 }
+
+TEST_F(DBMemTableTest, VectorConcurrentInsert) {
+  Options options;
+  options.create_if_missing = true;
+  options.create_missing_column_families = true;
+  options.allow_concurrent_memtable_write = true;
+  options.memtable_factory.reset(new VectorRepFactory());
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"cf1"}, options);
+
+  // Multi-threaded writes
+  {
+    WriteOptions write_options;
+    std::vector<port::Thread> threads;
+    for (int i = 0; i < 10; ++i) {
+      threads.emplace_back([&, i]() {
+        int start = i * 100;
+        int end = start + 100;
+        WriteBatch batch;
+        for (int j = start; j < end; ++j) {
+          ASSERT_OK(
+              batch.Put(handles_[0], Key(j), "value" + std::to_string(j)));
+        }
+        ASSERT_OK(db_->Write(write_options, &batch));
+      });
+    }
+    for (auto& t : threads) {
+      t.join();
+    }
+
+    std::unique_ptr<Iterator> iter(
+        db_->NewIterator(ReadOptions(), handles_[0]));
+    iter->SeekToFirst();
+    for (int i = 0; i < 1000; ++i) {
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ(iter->key().ToString(), Key(i));
+      ASSERT_EQ(iter->value().ToString(), "value" + std::to_string(i));
+      iter->Next();
+    }
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+  }
+
+  // Multi-threaded writes, multi CF
+  {
+    WriteOptions write_options;
+    std::vector<port::Thread> threads;
+    for (int i = 0; i < 10; ++i) {
+      threads.emplace_back([&, i]() {
+        int start = i * 100;
+        int end = start + 100;
+        WriteBatch batch;
+        for (int j = start; j < end; ++j) {
+          ASSERT_OK(batch.Put(handles_[0], Key(j), "CF0" + std::to_string(j)));
+          ASSERT_OK(batch.Put(handles_[1], Key(j), "CF1" + std::to_string(j)));
+        }
+        ASSERT_OK(db_->Write(write_options, &batch));
+      });
+    }
+
+    for (auto& t : threads) {
+      t.join();
+    }
+
+    std::unique_ptr<Iterator> iter0(
+        db_->NewIterator(ReadOptions(), handles_[0]));
+    std::unique_ptr<Iterator> iter1(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    iter0->SeekToFirst();
+    iter1->SeekToFirst();
+    for (int i = 0; i < 1000; ++i) {
+      ASSERT_TRUE(iter0->Valid());
+      ASSERT_EQ(iter0->key().ToString(), Key(i));
+      ASSERT_EQ(iter0->value().ToString(), "CF0" + std::to_string(i));
+      iter0->Next();
+
+      ASSERT_TRUE(iter1->Valid());
+      ASSERT_EQ(iter1->key().ToString(), Key(i));
+      ASSERT_EQ(iter1->value().ToString(), "CF1" + std::to_string(i));
+      iter1->Next();
+    }
+    ASSERT_FALSE(iter0->Valid());
+    ASSERT_OK(iter0->status());
+    ASSERT_FALSE(iter1->Valid());
+    ASSERT_OK(iter1->status());
+  }
+
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Flush(1));
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc
index 69f6ec4e9185..143203fd7b7e 100644
--- a/db/db_merge_operator_test.cc
+++ b/db/db_merge_operator_test.cc
@@ -386,7 +386,7 @@ TEST_F(DBMergeOperatorTest, MergeOperandThresholdExceeded) {
   snapshots.reserve(3);
 
   for (size_t i = 0; i < keys.size(); ++i) {
-    snapshots.emplace_back(db_);
+    snapshots.emplace_back(db_.get());
 
     const std::string suffix = std::to_string(i + 1);
 
@@ -971,7 +971,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
 
   // No base value
   {
-    constexpr char key[] = "key1";
+    const std::string key = "key1";
 
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, foo));
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, bar));
@@ -985,7 +985,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
     // max_successive_merges.
     constexpr size_t max_key_versions = 8;
     std::vector<KeyVersion> key_versions;
-    ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key,
+    ASSERT_OK(GetAllKeyVersions(db_.get(), db_->DefaultColumnFamily(), key, key,
                                 max_key_versions, &key_versions));
     ASSERT_EQ(key_versions.size(), 2);
     ASSERT_EQ(key_versions[0].type, kTypeValue);
@@ -994,7 +994,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
 
   // Plain base value
   {
-    constexpr char key[] = "key2";
+    const std::string key = "key2";
 
     ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), key, foo));
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, bar));
@@ -1009,7 +1009,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
     // max_successive_merges.
     constexpr size_t max_key_versions = 8;
     std::vector<KeyVersion> key_versions;
-    ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key,
+    ASSERT_OK(GetAllKeyVersions(db_.get(), db_->DefaultColumnFamily(), key, key,
                                 max_key_versions, &key_versions));
     ASSERT_EQ(key_versions.size(), 3);
     ASSERT_EQ(key_versions[0].type, kTypeValue);
@@ -1019,7 +1019,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
 
   // Wide-column base value
   {
-    constexpr char key[] = "key3";
+    const std::string key = "key3";
     const WideColumns columns{{kDefaultWideColumnName, foo}, {bar, baz}};
 
     ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), key,
@@ -1038,7 +1038,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
     // max_successive_merges.
     constexpr size_t max_key_versions = 8;
     std::vector<KeyVersion> key_versions;
-    ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key,
+    ASSERT_OK(GetAllKeyVersions(db_.get(), db_->DefaultColumnFamily(), key, key,
                                 max_key_versions, &key_versions));
     ASSERT_EQ(key_versions.size(), 3);
     ASSERT_EQ(key_versions[0].type, kTypeWideColumnEntity);
diff --git a/db/db_options_test.cc b/db/db_options_test.cc
index 99d390db2399..07e5d27f23e8 100644
--- a/db/db_options_test.cc
+++ b/db/db_options_test.cc
@@ -70,7 +70,8 @@ class DBOptionsTest : public DBTestBase {
     options.env = env_;
     ImmutableDBOptions db_options(options);
     test::RandomInitCFOptions(&options, options, rnd);
-    auto sanitized_options = SanitizeOptions(db_options, options);
+    auto sanitized_options =
+        SanitizeCfOptions(db_options, /*read_only*/ false, options);
     auto opt_map = GetMutableCFOptionsMap(sanitized_options);
     delete options.compaction_filter;
     return opt_map;
@@ -321,31 +322,26 @@ TEST_F(DBOptionsTest, SetWithCustomMemTableFactory) {
   }
   Options options;
   options.create_if_missing = true;
-  // Try with fail_if_options_file_error=false/true to update the options
-  for (bool on_error : {false, true}) {
-    options.fail_if_options_file_error = on_error;
-    options.env = env_;
-    options.disable_auto_compactions = false;
+  options.env = env_;
+  options.disable_auto_compactions = false;
 
-    options.memtable_factory.reset(new DummySkipListFactory());
-    Reopen(options);
+  options.memtable_factory.reset(new DummySkipListFactory());
+  Reopen(options);
 
-    ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
-    ASSERT_OK(
-        dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}}));
-    ColumnFamilyDescriptor cfd;
-    ASSERT_OK(cfh->GetDescriptor(&cfd));
-    ASSERT_STREQ(cfd.options.memtable_factory->Name(),
-                 DummySkipListFactory::kClassName());
-    ColumnFamilyHandle* test = nullptr;
-    ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test));
-    ASSERT_OK(test->GetDescriptor(&cfd));
-    ASSERT_STREQ(cfd.options.memtable_factory->Name(),
-                 DummySkipListFactory::kClassName());
-
-    ASSERT_OK(dbfull()->DropColumnFamily(test));
-    delete test;
-  }
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  ASSERT_OK(dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}}));
+  ColumnFamilyDescriptor cfd;
+  ASSERT_OK(cfh->GetDescriptor(&cfd));
+  ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+               DummySkipListFactory::kClassName());
+  ColumnFamilyHandle* test = nullptr;
+  ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test));
+  ASSERT_OK(test->GetDescriptor(&cfd));
+  ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+               DummySkipListFactory::kClassName());
+
+  ASSERT_OK(dbfull()->DropColumnFamily(test));
+  delete test;
 }
 
 TEST_F(DBOptionsTest, SetBytesPerSync) {
@@ -436,12 +432,47 @@ TEST_F(DBOptionsTest, SetWalBytesPerSync) {
   ASSERT_GT(low_bytes_per_sync, counter);
 }
 
+TEST_F(DBOptionsTest, MutableManifestOptions) {
+  // These aren't end-to-end tests, but sufficient to ensure the VersionSet
+  // receives the updates with SetDBOptions
+  for (int64_t i : {0, 1, 100, 100000, 10000000}) {
+    ASSERT_OK(
+        db_->SetDBOptions({{"max_manifest_file_size", std::to_string(i)}}));
+    ASSERT_EQ(i,
+              static_cast<int64_t>(db_->GetDBOptions().max_manifest_file_size));
+    ASSERT_EQ(i,
+              static_cast<int64_t>(
+                  dbfull()->GetVersionSet()->TEST_GetMinMaxManifestFileSize()));
+    if (i > 1) {
+      ++i;
+    }
+    ASSERT_OK(
+        db_->SetDBOptions({{"max_manifest_space_amp_pct", std::to_string(i)}}));
+    ASSERT_EQ(i, static_cast<int64_t>(
+                     db_->GetDBOptions().max_manifest_space_amp_pct));
+    ASSERT_EQ(i,
+              static_cast<int64_t>(
+                  dbfull()->GetVersionSet()->TEST_GetMaxManifestSpaceAmpPct()));
+    if (i > 1) {
+      ++i;
+    }
+    ASSERT_OK(db_->SetDBOptions(
+        {{"manifest_preallocation_size", std::to_string(i)}}));
+    ASSERT_EQ(i, static_cast<int64_t>(
+                     db_->GetDBOptions().manifest_preallocation_size));
+    ASSERT_EQ(
+        i, static_cast<int64_t>(
+               dbfull()->GetVersionSet()->TEST_GetManifestPreallocationSize()));
+  }
+}
+
 TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
   Options options;
   options.create_if_missing = true;
   options.writable_file_max_buffer_size = 1024 * 1024;
   options.level0_file_num_compaction_trigger = 3;
   options.max_manifest_file_size = 1;
+  options.max_manifest_space_amp_pct = 0;
   options.env = env_;
   int buffer_size = 1024 * 1024;
   Reopen(options);
@@ -1658,6 +1689,46 @@ TEST_F(DBOptionsTest, SetOptionsNoManifestWrite) {
   ASSERT_EQ(Get("x"), "x");
 }
 
+TEST_F(DBOptionsTest, SetOptionsMultipleColumnFamilies) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = CurrentOptions().env;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  // Create two additional column families
+  CreateColumnFamilies({"cf1", "cf2"}, options);
+  ReopenWithColumnFamilies({"default", "cf1", "cf2"}, options);
+
+  // Verify initial state - auto compaction should be disabled
+  ASSERT_TRUE(dbfull()->GetOptions(handles_[0]).disable_auto_compactions);
+  ASSERT_TRUE(dbfull()->GetOptions(handles_[1]).disable_auto_compactions);
+  ASSERT_TRUE(dbfull()->GetOptions(handles_[2]).disable_auto_compactions);
+
+  // Set options on multiple column families at once
+  ASSERT_OK(dbfull()->SetOptions({handles_[1], handles_[2]},
+                                 {{"disable_auto_compactions", "false"}}));
+
+  ASSERT_TRUE(
+      dbfull()->GetOptions(handles_[0]).disable_auto_compactions);  // unchanged
+  ASSERT_FALSE(
+      dbfull()->GetOptions(handles_[1]).disable_auto_compactions);  // changed
+  ASSERT_FALSE(
+      dbfull()->GetOptions(handles_[2]).disable_auto_compactions);  // changed
+
+  std::unordered_map<ColumnFamilyHandle*,
+                     std::unordered_map<std::string, std::string>>
+      options_map;
+  options_map[handles_[0]] = {{"disable_auto_compactions", "false"}};
+  options_map[handles_[1]] = {{"disable_auto_compactions", "true"}};
+  options_map[handles_[2]] = {{"disable_auto_compactions", "true"}};
+  ASSERT_OK(dbfull()->SetOptions(options_map));
+
+  ASSERT_FALSE(dbfull()->GetOptions(handles_[0]).disable_auto_compactions);
+  ASSERT_TRUE(dbfull()->GetOptions(handles_[1]).disable_auto_compactions);
+  ASSERT_TRUE(dbfull()->GetOptions(handles_[2]).disable_auto_compactions);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc
index 01ab37e21ebf..523abeb1cbd6 100644
--- a/db/db_properties_test.cc
+++ b/db/db_properties_test.cc
@@ -377,12 +377,14 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) {
         NewBloomFilterPolicy(kBloomBitsPerKey, false));
     table_options.block_size = 1024;
     options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    // The checks assume kTableCount number of files
+    options.disable_auto_compactions = true;
 
     DestroyAndReopen(options);
 
     // Hold open a snapshot to prevent range tombstones from being compacted
     // away.
-    ManagedSnapshot snapshot(db_);
+    ManagedSnapshot snapshot(db_.get());
 
     Random rnd(5632);
     for (int table = 1; table <= kTableCount; ++table) {
@@ -567,7 +569,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
   options.target_file_size_base = 8192;
   options.max_bytes_for_level_base = 10000;
   options.max_bytes_for_level_multiplier = 2;
-  // This ensures there no compaction happening when we call GetProperty().
+  // The checks assume kTableCount number of files
   options.disable_auto_compactions = true;
   options.merge_operator.reset(new TestPutOperator());
 
@@ -580,7 +582,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
   DestroyAndReopen(options);
 
   // Hold open a snapshot to prevent range tombstones from being compacted away.
-  ManagedSnapshot snapshot(db_);
+  ManagedSnapshot snapshot(db_.get());
 
   std::string level_tp_strings[kMaxLevel];
   std::string tp_string;
@@ -1517,16 +1519,14 @@ TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) {
 
 // Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage.
 TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) {
-  // Sampled compression requires at least one of the following four types.
-  if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() &&
-      !ZSTD_Supported()) {
-    return;
-  }
-
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
   options.table_properties_collector_factories.emplace_back(
       std::make_shared<BlockCountingTablePropertiesCollectorFactory>());
+  options.compression = kNoCompression;
+
+  bool fast_sampling_supported = Snappy_Supported() || LZ4_Supported();
+  bool slow_sampling_supported = ZSTD_Supported() || Zlib_Supported();
 
   for (bool sample_for_compression : {false, true}) {
     // For simplicity/determinism, sample 100% when enabled, or 0% when disabled
@@ -1540,10 +1540,11 @@ TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) {
     // L1_0 ["a", "b"]
     //
     // L0_0 was created by flush. L1_0 was created by compaction. Each file
-    // contains one data block.
+    // contains one data block with enough data to be compressible.
     for (int i = 0; i < 3; ++i) {
-      ASSERT_OK(Put("a", "val"));
-      ASSERT_OK(Put("b", "val"));
+      for (int j = 0; j < 50; ++j) {
+        ASSERT_OK(Put(std::to_string(j), "thisismyvalue"));
+      }
       ASSERT_OK(Flush());
       if (i == 1) {
         ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
@@ -1556,13 +1557,33 @@ TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) {
     ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
     ASSERT_EQ(2, file_to_props.size());
     for (const auto& file_and_props : file_to_props) {
-      auto& user_props = file_and_props.second->user_collected_properties;
+      auto& props = *file_and_props.second;
+      auto& user_props = props.user_collected_properties;
       ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector::
                                       kNumSampledBlocksPropertyName) !=
                   user_props.end());
       ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector::
                                   kNumSampledBlocksPropertyName),
                 std::to_string(sample_for_compression ? 1 : 0));
+      if (sample_for_compression) {
+        EXPECT_GT(props.fast_compression_estimated_data_size, 0);
+        EXPECT_GT(props.slow_compression_estimated_data_size, 0);
+        if (fast_sampling_supported) {
+          EXPECT_LT(props.fast_compression_estimated_data_size,
+                    props.data_size);
+          if (slow_sampling_supported) {
+            EXPECT_LT(props.slow_compression_estimated_data_size,
+                      props.fast_compression_estimated_data_size);
+          }
+        }
+        if (slow_sampling_supported) {
+          EXPECT_LT(props.slow_compression_estimated_data_size,
+                    props.data_size);
+        }
+      } else {
+        EXPECT_EQ(props.fast_compression_estimated_data_size, 0);
+        EXPECT_EQ(props.slow_compression_estimated_data_size, 0);
+      }
     }
   }
 }
@@ -1843,7 +1864,7 @@ TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) {
   options.listeners.push_back(listener);
   options.level0_file_num_compaction_trigger = kNumL0Files;
   DestroyAndReopen(options);
-  listener->SetDB(db_);
+  listener->SetDB(db_.get());
 
   for (int i = 0; i < kNumL0Files; ++i) {
     // Make sure they overlap in keyspace to prevent trivial move
diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc
index 5122aedc97a3..f0996ce34c94 100644
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@@ -2047,7 +2047,7 @@ TEST_F(DBRangeDelTest, IteratorReseek) {
   // Immutable memtable
   ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
                              Key(2)));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   std::string value;
   ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
                                     "rocksdb.num-immutable-mem-table", &value));
@@ -3825,6 +3825,89 @@ TEST_F(DBRangeDelTest, RowCache) {
   // and should not turn db into read-only mdoe.
   ASSERT_OK(Put(Key(5), "foo"));
 }
+
+TEST_F(DBRangeDelTest, SeekForPrevTest) {
+  // open db
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.compaction_style = kCompactionStyleUniversal;
+
+  // add SST partitioner, split sst file with prefix length 2
+  options.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(2);
+  Reopen(options);
+
+  // File uses SST partitioner, so it will be split into 3 files
+  // SST file 1: ka1, ka2
+  // SST file 2: kb1
+  // SST file 3: kc1, kc2
+  // Delete range covers from ka2 to kc2, which means record ka2 and kb1, kc1
+  // are covered by the delete range
+
+  std::vector<std::pair<std::string, std::string>> kv = {{"ka1", "value_1"},
+                                                         {"ka2", "value_2"},
+                                                         {"kb1", "value_3"},
+                                                         {"kc1", "value_4"},
+                                                         {"kc2", "value_5"}};
+  for (auto& p : kv) {
+    ASSERT_OK(Put(p.first, p.second));
+  }
+
+  ASSERT_OK(Flush());
+  // Compact to Lmax, it should have seq 0 now.
+  ASSERT_OK(CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Open an iterator and create a snapshot, so that keys are not deleted
+  // completely by delete range in SST
+  ReadOptions read_opts;
+  read_opts.snapshot = db_->GetSnapshot();
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+  iter->SeekToFirst();
+  // iterate all the keys and validate the value
+  for (int i = 0; iter->Valid(); iter->Next()) {
+    ASSERT_EQ(kv[i].first, iter->key().ToString());
+    ASSERT_EQ(kv[i].second, iter->value().ToString());
+    i++;
+  }
+
+  // use delete range to delete the record
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "ka2",
+                             "kc2"));
+  // Flush
+  ASSERT_OK(Flush());
+  // Compact to Lmax
+  ASSERT_OK(CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Close the iterator and release the snapshot.
+  ASSERT_OK(iter->status());
+  iter.reset();
+  db_->ReleaseSnapshot(read_opts.snapshot);
+
+  // create second iterator, seek each key and validate result
+  std::unique_ptr<Iterator> iter2(db_->NewIterator(ReadOptions()));
+  // Validate keys are deleted
+  iter2->SeekToFirst();
+  ASSERT_TRUE(iter2->Valid());
+  ASSERT_EQ("ka1", iter2->key().ToString());
+  iter2->Next();
+  ASSERT_TRUE(iter2->Valid());
+  ASSERT_EQ("kc2", iter2->key().ToString());
+  iter2->Next();
+  ASSERT_FALSE(iter2->Valid());
+
+  // Validate seek for prev result
+  for (auto& p : kv) {
+    iter2->SeekForPrev(p.first);
+    ASSERT_TRUE(iter2->Valid());
+    if (p.first == "kc2") {
+      ASSERT_EQ("kc2", iter2->key().ToString());
+    } else {
+      ASSERT_EQ("ka1", iter2->key().ToString());
+    }
+  }
+  ASSERT_OK(iter2->status());
+  iter2.reset();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_rate_limiter_test.cc b/db/db_rate_limiter_test.cc
index b28055225a0f..210e3c49ac32 100644
--- a/db/db_rate_limiter_test.cc
+++ b/db/db_rate_limiter_test.cc
@@ -442,6 +442,107 @@ TEST_P(DBRateLimiterOnWriteWALTest, AutoWalFlush) {
   EXPECT_EQ(actual_auto_wal_flush_request,
             options_.rate_limiter->GetTotalRequests(Env::IO_USER));
 }
+
+class DBRateLimiterOnManualWALFlushTest
+    : public DBRateLimiterOnWriteTest,
+      public ::testing::WithParamInterface<Env::IOPriority> {
+ public:
+  static std::string GetTestNameSuffix(
+      ::testing::TestParamInfo<Env::IOPriority> info) {
+    std::ostringstream oss;
+    if (info.param == Env::IO_USER) {
+      oss << "RateLimitManualWALFlush";
+    } else if (info.param == Env::IO_TOTAL) {
+      oss << "NoRateLimitManualWALFlush";
+    } else if (info.param == Env::IO_HIGH) {
+      oss << "RateLimitManualWALFlushWithHighPriority";
+    } else {
+      oss << "RateLimitManualWALFlushWithLowPriority";
+    }
+    return oss.str();
+  }
+
+  explicit DBRateLimiterOnManualWALFlushTest()
+      : rate_limiter_priority_(GetParam()) {}
+
+  void Init() {
+    options_ = GetOptions();
+    // Enable manual WAL flush mode
+    options_.manual_wal_flush = true;
+    Reopen(options_);
+  }
+
+  WriteOptions GetWriteOptions() {
+    WriteOptions write_options;
+    // WAL must be enabled for manual WAL flush to work
+    write_options.disableWAL = false;
+    // In manual WAL flush mode, WAL write rate limiting should be done through
+    // FlushWAL(), not WriteOptions::rate_limiter_priority
+    write_options.rate_limiter_priority = Env::IO_TOTAL;
+    return write_options;
+  }
+
+ protected:
+  Env::IOPriority rate_limiter_priority_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBRateLimiterOnManualWALFlushTest,
+                        DBRateLimiterOnManualWALFlushTest,
+                        ::testing::Values(Env::IO_TOTAL, Env::IO_USER,
+                                          Env::IO_HIGH, Env::IO_LOW),
+                        DBRateLimiterOnManualWALFlushTest::GetTestNameSuffix);
+
+TEST_P(DBRateLimiterOnManualWALFlushTest, ManualWALFlush) {
+  Init();
+
+  const bool no_rate_limit = (rate_limiter_priority_ == Env::IO_TOTAL);
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL));
+
+  for (bool sync : {false, true}) {
+    std::int64_t prev_total_request =
+        options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+
+    Status put_status = Put("key_" + std::to_string(sync),
+                            "value_" + std::to_string(sync), GetWriteOptions());
+
+    EXPECT_TRUE(put_status.ok());
+
+    // Since manual_wal_flush is enabled and write_options.rate_limiter_priority
+    // is IO_TOTAL, no rate limiting should have occurred for this user write
+    EXPECT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+                     prev_total_request);
+
+    // Now explicitly flush the WAL with the test's rate_limiter_priority
+    prev_total_request = options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+    std::int64_t prev_priority_request =
+        options_.rate_limiter->GetTotalRequests(rate_limiter_priority_);
+
+    FlushWALOptions flush_options;
+    flush_options.sync = sync;
+    flush_options.rate_limiter_priority = rate_limiter_priority_;
+    Status flush_status = db_->FlushWAL(flush_options);
+
+    EXPECT_TRUE(flush_status.ok());
+
+    std::int64_t manual_wal_flush_requests_total =
+        options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+        prev_total_request;
+    std::int64_t manual_wal_flush_requests_for_priority =
+        options_.rate_limiter->GetTotalRequests(rate_limiter_priority_) -
+        prev_priority_request;
+
+    if (no_rate_limit) {
+      EXPECT_EQ(0, manual_wal_flush_requests_total);
+      EXPECT_EQ(0, manual_wal_flush_requests_for_priority);
+    } else {
+      EXPECT_EQ(manual_wal_flush_requests_total,
+                manual_wal_flush_requests_for_priority);
+      EXPECT_GT(manual_wal_flush_requests_for_priority, 0);
+    }
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_readonly_with_timestamp_test.cc b/db/db_readonly_with_timestamp_test.cc
index 7a37bfec81c5..6fbc43bb2664 100644
--- a/db/db_readonly_with_timestamp_test.cc
+++ b/db/db_readonly_with_timestamp_test.cc
@@ -237,7 +237,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
          it->Next(), ++count, ++key) {
       CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                          "value" + std::to_string(i), write_timestamps[i]);
-      get_value_and_check(db_, read_opts, it->key(), it->value(),
+      get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                           write_timestamps[i]);
     }
     ASSERT_OK(it->status());
@@ -250,7 +250,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
          it->Prev(), ++count, --key) {
       CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                          "value" + std::to_string(i), write_timestamps[i]);
-      get_value_and_check(db_, read_opts, it->key(), it->value(),
+      get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                           write_timestamps[i]);
     }
     ASSERT_OK(it->status());
@@ -272,7 +272,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
            it->Valid(); it->Next(), ++key, ++count) {
         CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                            "value" + std::to_string(i), write_timestamps[i]);
-        get_value_and_check(db_, read_opts, it->key(), it->value(),
+        get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                             write_timestamps[i]);
       }
       ASSERT_OK(it->status());
@@ -282,7 +282,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
            it->Valid(); it->Prev(), --key, ++count) {
         CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
                            "value" + std::to_string(i), write_timestamps[i]);
-        get_value_and_check(db_, read_opts, it->key(), it->value(),
+        get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                             write_timestamps[i]);
       }
       ASSERT_OK(it->status());
diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc
index 060ce8644087..a5da2afacc44 100644
--- a/db/db_secondary_test.cc
+++ b/db/db_secondary_test.cc
@@ -56,12 +56,11 @@ class DBSecondaryTestBase : public DBBasicTestWithTimestampBase {
       ASSERT_OK(db_secondary_->DestroyColumnFamilyHandle(h));
     }
     handles_secondary_.clear();
-    delete db_secondary_;
-    db_secondary_ = nullptr;
+    db_secondary_.reset();
   }
 
   DBImplSecondary* db_secondary_full() {
-    return static_cast<DBImplSecondary*>(db_secondary_);
+    return static_cast<DBImplSecondary*>(db_secondary_.get());
   }
 
   void CheckFileTypeCounts(const std::string& dir, int expected_log,
@@ -69,7 +68,7 @@ class DBSecondaryTestBase : public DBBasicTestWithTimestampBase {
 
   std::string secondary_path_;
   std::vector<ColumnFamilyHandle*> handles_secondary_;
-  DB* db_secondary_;
+  std::unique_ptr<DB> db_secondary_;
 };
 
 void DBSecondaryTestBase::OpenSecondary(const Options& options) {
@@ -152,14 +151,15 @@ TEST_F(DBSecondaryTest, NonExistingDb) {
   options.env = env_;
   options.max_open_files = -1;
   const std::string dbname = "/doesnt/exist";
-  Status s =
-      DB::OpenAsSecondary(options, dbname, secondary_path_, &db_secondary_);
+  std::unique_ptr<DB> dbptr;
+  Status s = DB::OpenAsSecondary(options, dbname, secondary_path_, &dbptr);
   ASSERT_TRUE(s.IsIOError());
 }
 
 TEST_F(DBSecondaryTest, ReopenAsSecondary) {
   Options options;
   options.env = env_;
+  options.preserve_internal_time_seconds = 300;
   Reopen(options);
   ASSERT_OK(Put("foo", "foo_value"));
   ASSERT_OK(Put("bar", "bar_value"));
@@ -181,7 +181,7 @@ TEST_F(DBSecondaryTest, ReopenAsSecondary) {
 
   ReadOptions ropts;
   ropts.verify_checksums = true;
-  auto db1 = static_cast<DBImplSecondary*>(db_);
+  auto db1 = static_cast<DBImplSecondary*>(db_.get());
   ASSERT_NE(nullptr, db1);
   Iterator* iter = db1->NewIterator(ropts);
   ASSERT_NE(nullptr, iter);
@@ -507,6 +507,81 @@ TEST_F(DBSecondaryTest, OpenAsSecondary) {
   verify_db_func("new_foo_value", "new_bar_value");
 }
 
+TEST_F(DBSecondaryTest, OptionsOverrideTest) {
+  Options options;
+  options.env = env_;
+  options.preserve_internal_time_seconds = 300;
+  options.compaction_readahead_size = 200;
+  options.blob_compaction_readahead_size = 100;
+  Reopen(options);
+
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+
+  CompactionServiceInput input;
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  for (auto& file : meta.levels[0].files) {
+    ASSERT_EQ(0, meta.levels[0].level);
+    input.input_files.push_back(file.name);
+  }
+  ASSERT_EQ(input.input_files.size(), 3);
+
+  input.output_level = 1;
+  input.options_file_number = dbfull()->GetVersionSet()->options_file_number();
+  input.cf_name = kDefaultColumnFamilyName;
+  ASSERT_OK(db_->GetDbIdentity(input.db_id));
+
+  ASSERT_EQ(db_->GetOptions().compaction_readahead_size, 200);
+  ASSERT_EQ(db_->GetOptions().blob_compaction_readahead_size, 100);
+
+  Close();
+
+  std::string compaction_input_binary;
+  ASSERT_OK(input.Write(&compaction_input_binary));
+  std::string compaction_result_binary;
+
+  CompactionServiceOptionsOverride override_options;
+  override_options.env = env_;
+  override_options.table_factory.reset(
+      NewBlockBasedTableFactory(BlockBasedTableOptions()));
+
+  ASSERT_OK(
+      StringToMap("compaction_readahead_size=8388608;"
+                  "blob_compaction_readahead_size=4194304;"
+                  "some_invalid_option=ignore_me;"
+                  "env=this_should_not_fail;"
+                  "max_open_files=100;",  // this should be always overriden as
+                                          // -1 in remote compaction
+                  &override_options.options_map));
+
+  bool verified = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImplSecondary::OpenAndCompact::AfterOpenAsSecondary:0",
+      [&](void* arg) {
+        auto secondary_db = static_cast<DB*>(arg);
+        auto secondary_db_options = secondary_db->GetOptions();
+        // DBOption
+        ASSERT_EQ(secondary_db_options.compaction_readahead_size, 8388608);
+        ASSERT_EQ(secondary_db_options.max_open_files, -1);
+        // CFOption
+        ASSERT_EQ(secondary_db_options.blob_compaction_readahead_size, 4194304);
+        verified = true;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(DB::OpenAndCompact(OpenAndCompactOptions(), dbname_,
+                               secondary_path_, compaction_input_binary,
+                               &compaction_result_binary, override_options));
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  ASSERT_TRUE(verified);
+}
+
 namespace {
 class TraceFileEnv : public EnvWrapper {
  public:
@@ -529,6 +604,9 @@ class TraceFileEnv : public EnvWrapper {
                   char* scratch) const override {
         return target_->Read(offset, n, result, scratch);
       }
+      Status GetFileSize(uint64_t* file_size) override {
+        return target_->GetFileSize(file_size);
+      }
 
      private:
       std::unique_ptr<RandomAccessFile> target_;
@@ -755,7 +833,7 @@ TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) {
   options1.max_open_files = -1;
   OpenSecondary(options1);
   ASSERT_EQ(0, handles_secondary_.size());
-  ASSERT_NE(nullptr, db_secondary_);
+  ASSERT_NE(nullptr, db_secondary_.get());
 
   ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value"));
   ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value"));
@@ -1073,7 +1151,7 @@ TEST_F(DBSecondaryTest, DISABLED_SwitchWAL) {
   for (int k = 0; k != 16; ++k) {
     ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k)));
     ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
-    verify_db(dbfull(), db_secondary_);
+    verify_db(dbfull(), db_secondary_.get());
   }
 }
 
@@ -1142,7 +1220,7 @@ TEST_F(DBSecondaryTest, DISABLED_SwitchWALMultiColumnFamilies) {
     TEST_SYNC_POINT(
         "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp");
     ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
-    verify_db(dbfull(), handles_, db_secondary_, handles_secondary_);
+    verify_db(dbfull(), handles_, db_secondary_.get(), handles_secondary_);
     SyncPoint::GetInstance()->ClearTrace();
   }
 }
@@ -1215,46 +1293,6 @@ TEST_F(DBSecondaryTest, CatchUpAfterFlush) {
   ASSERT_OK(iter3->status());
 }
 
-TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) {
-  bool called = false;
-  Options options;
-  options.env = env_;
-  options.disable_auto_compactions = true;
-  Reopen(options);
-  SyncPoint::GetInstance()->DisableProcessing();
-  SyncPoint::GetInstance()->ClearAllCallBacks();
-  SyncPoint::GetInstance()->SetCallBack(
-      "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) {
-        ASSERT_NE(nullptr, arg);
-        called = true;
-        auto* s = static_cast<Status*>(arg);
-        ASSERT_NOK(*s);
-      });
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData",
-        "BackgroundCallCompaction:0"},
-       {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
-        "DBImpl::CheckConsistency:BeforeGetFileSize"}});
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  ASSERT_OK(Put("a", "value0"));
-  ASSERT_OK(Put("c", "value0"));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put("b", "value1"));
-  ASSERT_OK(Put("d", "value1"));
-  ASSERT_OK(Flush());
-  port::Thread thread([this]() {
-    Options opts;
-    opts.env = env_;
-    opts.max_open_files = -1;
-    OpenSecondary(opts);
-  });
-  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  thread.join();
-  ASSERT_TRUE(called);
-}
-
 TEST_F(DBSecondaryTest, StartFromInconsistent) {
   Options options = CurrentOptions();
   DestroyAndReopen(options);
@@ -1318,7 +1356,7 @@ TEST_F(DBSecondaryTest, OpenWithTransactionDB) {
   TransactionDBOptions txn_db_opts;
   ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
   ASSERT_NE(txn_db, nullptr);
-  db_ = txn_db;
+  db_.reset(txn_db);
 
   std::vector<std::string> cfs = {"new_CF"};
   CreateColumnFamilies(cfs, options);
@@ -1522,7 +1560,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
          it->Next(), ++count, ++key) {
       CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                          "value" + std::to_string(i), write_timestamps[i]);
-      get_value_and_check(db_, read_opts, it->key(), it->value(),
+      get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                           write_timestamps[i]);
     }
     ASSERT_OK(it->status());
@@ -1535,7 +1573,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
          it->Prev(), ++count, --key) {
       CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                          "value" + std::to_string(i), write_timestamps[i]);
-      get_value_and_check(db_, read_opts, it->key(), it->value(),
+      get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                           write_timestamps[i]);
     }
     ASSERT_OK(it->status());
@@ -1557,7 +1595,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
            it->Valid(); it->Next(), ++key, ++count) {
         CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                            "value" + std::to_string(i), write_timestamps[i]);
-        get_value_and_check(db_, read_opts, it->key(), it->value(),
+        get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                             write_timestamps[i]);
       }
       ASSERT_OK(it->status());
@@ -1567,7 +1605,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
            it->Valid(); it->Prev(), --key, ++count) {
         CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
                            "value" + std::to_string(i), write_timestamps[i]);
-        get_value_and_check(db_, read_opts, it->key(), it->value(),
+        get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                             write_timestamps[i]);
       }
       ASSERT_OK(it->status());
diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc
index 71511cee7420..d186efd8c600 100644
--- a/db/db_sst_test.cc
+++ b/db/db_sst_test.cc
@@ -135,21 +135,6 @@ TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
   Destroy(options);
 }
 
-// Check that we don't crash when opening DB with
-// DBOptions::skip_checking_sst_file_sizes_on_db_open = true.
-TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) {
-  ASSERT_OK(Put("pika", "choo"));
-  ASSERT_OK(Flush());
-
-  // Just open the DB with the option set to true and check that we don't crash.
-  Options options;
-  options.env = env_;
-  options.skip_checking_sst_file_sizes_on_db_open = true;
-  Reopen(options);
-
-  ASSERT_EQ("choo", Get("pika"));
-}
-
 TEST_F(DBSSTTest, DontDeleteMovedFile) {
   // This test triggers move compaction and verifies that the file is not
   // deleted when it's part of move compaction
@@ -1748,45 +1733,6 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 
-TEST_F(DBSSTTest, OpenDBWithoutGetFileSizeInvocations) {
-  Options options = CurrentOptions();
-  std::unique_ptr<MockEnv> env{MockEnv::Create(Env::Default())};
-  options.env = env.get();
-  options.disable_auto_compactions = true;
-  options.compression = kNoCompression;
-  options.enable_blob_files = true;
-  options.blob_file_size = 32;  // create one blob per file
-  options.skip_checking_sst_file_sizes_on_db_open = true;
-
-  DestroyAndReopen(options);
-  // Generate 5 files in L0
-  for (int i = 0; i < 5; i++) {
-    for (int j = 0; j < 10; j++) {
-      std::string val = "val_file_" + std::to_string(i);
-      ASSERT_OK(Put(Key(j), val));
-    }
-    ASSERT_OK(Flush());
-  }
-  Close();
-
-  bool is_get_file_size_called = false;
-  SyncPoint::GetInstance()->SetCallBack(
-      "MockFileSystem::GetFileSize:CheckFileType", [&](void* arg) {
-        std::string* filename = static_cast<std::string*>(arg);
-        if (filename->find(".blob") != std::string::npos) {
-          is_get_file_size_called = true;
-        }
-      });
-
-  SyncPoint::GetInstance()->EnableProcessing();
-  Reopen(options);
-  ASSERT_FALSE(is_get_file_size_called);
-  SyncPoint::GetInstance()->DisableProcessing();
-  SyncPoint::GetInstance()->ClearAllCallBacks();
-
-  Destroy(options);
-}
-
 TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) {
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
@@ -1991,6 +1937,70 @@ TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 }
 
+TEST_F(DBSSTTest, SstGetFileSizeFails) {
+  // Build an SST file
+  ASSERT_OK(Put("x", "zaphod"));
+  ASSERT_OK(Flush());
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(1U, metadata.size());
+  std::string filename = dbname_ + metadata[0].name;
+
+  // Prepare for fault injection
+  std::shared_ptr<FaultInjectionTestFS> fault_fs =
+      std::make_shared<FaultInjectionTestFS>(
+          CurrentOptions().env->GetFileSystem());
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.env = fault_fs_env.get();
+  options.paranoid_checks = false;  // don't check file sizes on open
+
+  for (int i = 0; i < 4; i++) {
+    SCOPED_TRACE("Iteration = " + std::to_string(i));
+    fault_fs->SetFailRandomAccessGetFileSizeSst(false);
+    fault_fs->SetFailFilesystemGetFileSizeSst(false);
+    Close();
+
+    if (i == 1) {
+      // Just FSRandomAccessFile::GetFileSize fails, which should be worked
+      // around
+      fault_fs->SetFailRandomAccessGetFileSizeSst(true);
+    } else if (i == 2) {
+      // FileSystem::GetFileSize fails, which should be worked around if
+      // FSRandomAccessFile::GetFileSize is supported
+      fault_fs->SetFailFilesystemGetFileSizeSst(true);
+    } else if (i == 3) {
+      // Both GetFileSize APIs fail with an IOError
+      fault_fs->SetFailRandomAccessGetFileSizeSst(true);
+      fault_fs->SetFailFilesystemGetFileSizeSst(true);
+    }
+
+    ASSERT_OK(TryReopen(options));
+    std::string value;
+    Status get_status = db_->Get({}, "x", &value);
+    if (i < 2) {
+      ASSERT_OK(get_status);
+    } else if (i == 2) {
+      if (encrypted_env_) {
+        // Can't recover because RandomAccessFile::GetFileSize is not supported
+        // on EncryptedEnv
+        // Fail with propagated IOError. (Not Corruption nor NotSupported!)
+        ASSERT_EQ(get_status.code(), Status::Code::kIOError);
+        ASSERT_STREQ(get_status.getState(), "FileSystem::GetFileSize failed");
+      } else {
+        // Never sees the FileSystem::GetFileSize failure
+        ASSERT_OK(get_status);
+      }
+    } else {
+      ASSERT_EQ(i, 3);
+      // Fail with propagated IOError. (Not Corruption nor NotSupported!)
+      ASSERT_EQ(get_status.code(), Status::Code::kIOError);
+      ASSERT_STREQ(get_status.getState(), "FileSystem::GetFileSize failed");
+    }
+  }
+  Close();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_statistics_test.cc b/db/db_statistics_test.cc
index 4fe3032e901c..91f9df57e92b 100644
--- a/db/db_statistics_test.cc
+++ b/db/db_statistics_test.cc
@@ -321,7 +321,7 @@ TEST_F(DBStatisticsTest, BytesWrittenStats) {
     options.enable_pipelined_write = enable_pipelined_write;
     ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
     ASSERT_NE(txn_db, nullptr);
-    db_ = txn_db->GetBaseDB();
+    db_.reset(txn_db);
 
     WriteOptions wopts;
     TransactionOptions txn_opts;
@@ -351,8 +351,7 @@ TEST_F(DBStatisticsTest, BytesWrittenStats) {
                   WriteBatchInternal::kHeader);
 
     // Cleanup
-    db_ = nullptr;
-    delete txn_db;
+    db_.reset();
   }
 }
 
diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc
index a899c03e2935..0f9e1327825c 100644
--- a/db/db_table_properties_test.cc
+++ b/db/db_table_properties_test.cc
@@ -69,14 +69,6 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
 
   // Create 4 tables
   for (int table = 0; table < 4; ++table) {
-    // Use old meta name for table properties for one file
-    if (table == 3) {
-      SyncPoint::GetInstance()->SetCallBack(
-          "BlockBasedTableBuilder::WritePropertiesBlock:Meta", [&](void* meta) {
-            *static_cast<const std::string**>(meta) = &kPropertiesBlockOldName;
-          });
-      SyncPoint::GetInstance()->EnableProcessing();
-    }
     // Build file
     for (int i = 0; i < 10 + table; ++i) {
       ASSERT_OK(
@@ -84,7 +76,6 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
     }
     ASSERT_OK(db_->Flush(FlushOptions()));
   }
-  SyncPoint::GetInstance()->DisableProcessing();
   std::string original_session_id;
   ASSERT_OK(db_->GetDbSessionId(original_session_id));
 
@@ -99,7 +90,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
   // Clear out auto-opened files
   dbfull()->TEST_table_cache()->EraseUnRefEntries();
   ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
-  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+  VerifyTableProperties(db_.get(), 10 + 11 + 12 + 13);
 
   // 2. Put two tables to table cache and
   Reopen(options);
@@ -112,7 +103,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
     Get(std::to_string(i * 100 + 0));
   }
 
-  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+  VerifyTableProperties(db_.get(), 10 + 11 + 12 + 13);
 
   // 3. Put all tables to table cache
   Reopen(options);
@@ -120,7 +111,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
   for (int i = 0; i < 4; ++i) {
     Get(std::to_string(i * 100 + 0));
   }
-  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+  VerifyTableProperties(db_.get(), 10 + 11 + 12 + 13);
 
   // 4. Try to read CORRUPT properties (a) directly from file, and (b)
   // through reader on Get
@@ -169,10 +160,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
   SyncPoint::GetInstance()->DisableProcessing();
 }
 
-TEST_F(DBTablePropertiesTest, InvalidIgnored) {
-  // RocksDB versions 2.5 - 2.7 generate some properties that Block considers
-  // invalid in some way. This approximates that.
-
+TEST_F(DBTablePropertiesTest, InvalidReportedAsCorruption) {
   // Inject properties block data that Block considers invalid
   SyncPoint::GetInstance()->SetCallBack(
       "BlockBasedTableBuilder::WritePropertiesBlock:BlockData",
@@ -189,13 +177,10 @@ TEST_F(DBTablePropertiesTest, InvalidIgnored) {
   for (int i = 0; i < 10; ++i) {
     ASSERT_OK(db_->Put(WriteOptions(), std::to_string(i), "val"));
   }
-  ASSERT_OK(db_->Flush(FlushOptions()));
+  // Corrupted properties block should be detected and reported as corruption
+  ASSERT_TRUE(db_->Flush(FlushOptions()).IsCorruption());
 
   SyncPoint::GetInstance()->DisableProcessing();
-
-  // Not crashing is good enough
-  TablePropertiesCollection props;
-  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
 }
 
 TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) {
@@ -229,6 +214,56 @@ TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) {
   ASSERT_EQ(0.5, del_factory->GetDeletionRatio());
 }
 
+TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesByLevelTest) {
+  Random rnd(202);
+  Options options;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 2048;
+  options.max_bytes_for_level_base = 40960;
+  options.max_bytes_for_level_multiplier = 4;
+  options.hard_pending_compaction_bytes_limit = 16 * 1024;
+  options.num_levels = 8;
+  options.env = env_;
+
+  DestroyAndReopen(options);
+
+  // build a decent LSM
+  for (int i = 0; i < 10000; i++) {
+    EXPECT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  if (NumTableFilesAtLevel(0) == 0) {
+    EXPECT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102)));
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(db_->PauseBackgroundWork());
+
+  // Ensure that we have at least L0, L1 and L2
+  ASSERT_GT(NumTableFilesAtLevel(0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+  ASSERT_GT(NumTableFilesAtLevel(2), 0);
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+  std::vector<std::unique_ptr<TablePropertiesCollection>> levels_props;
+  ASSERT_OK(db_->GetPropertiesOfTablesByLevel(db_->DefaultColumnFamily(),
+                                              &levels_props));
+  for (int i = 0; i < 8; i++) {
+    const std::unique_ptr<TablePropertiesCollection>& level_props =
+        levels_props[i];
+    ASSERT_EQ(level_props->size(), cf_meta.levels[i].files.size());
+  }
+
+  Close();
+}
+
 // Test params:
 // 1) whether to enable user-defined timestamps
 class DBTablePropertiesInRangeTest : public DBTestBase,
@@ -292,7 +327,7 @@ class DBTablePropertiesInRangeTest : public DBTestBase,
     keys.reserve(range_size * 2);
     for (auto& r : ranges) {
       auto [start, limit] = MaybeAddTimestampsToRange(
-          &r.start, &r.limit, ts_sz, &keys.emplace_back(), &keys.emplace_back(),
+          r.start, r.limit, ts_sz, &keys.emplace_back(), &keys.emplace_back(),
           /*exclusive_end=*/false);
       EXPECT_TRUE(start.has_value());
       EXPECT_TRUE(limit.has_value());
@@ -737,6 +772,46 @@ TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) {
   }
 }
 
+TEST_F(DBTablePropertiesTest, KeyLargestSmallestSeqno) {
+  ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2"));
+  ASSERT_OK(db_->Put(WriteOptions(), "key3", "value3"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  {
+    TablePropertiesCollection props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+    ASSERT_EQ(1U, props.size());
+
+    auto table_props = props.begin()->second;
+
+    ASSERT_TRUE(table_props->HasKeyLargestSeqno());
+    ASSERT_TRUE(table_props->HasKeySmallestSeqno());
+
+    ASSERT_EQ(table_props->key_largest_seqno,
+              table_props->key_smallest_seqno + 2);
+    ASSERT_GT(table_props->key_largest_seqno, 0U);
+    ASSERT_GT(table_props->key_smallest_seqno, 0U);
+  }
+
+  // Becomes zero after compaction
+  {
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    TablePropertiesCollection props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+    ASSERT_EQ(1U, props.size());
+
+    auto table_props = props.begin()->second;
+    ASSERT_TRUE(table_props->HasKeyLargestSeqno());
+    ASSERT_TRUE(table_props->HasKeySmallestSeqno());
+
+    ASSERT_EQ(table_props->key_largest_seqno, table_props->key_smallest_seqno);
+    ASSERT_EQ(table_props->key_largest_seqno, 0U);
+  }
+}
+
 INSTANTIATE_TEST_CASE_P(DBTablePropertiesTest, DBTablePropertiesTest,
                         ::testing::Values("kCompactionStyleLevel",
                                           "kCompactionStyleUniversal"));
diff --git a/db/db_test.cc b/db/db_test.cc
index e141e562afbd..9c0dc9fe326b 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -59,11 +59,13 @@
 #include "rocksdb/utilities/checkpoint.h"
 #include "rocksdb/utilities/optimistic_transaction_db.h"
 #include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
 #include "table/mock_table.h"
 #include "test_util/sync_point.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
 #include "util/compression.h"
+#include "util/defer.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
 #include "util/rate_limiter_impl.h"
@@ -102,7 +104,7 @@ TEST_F(DBTest, MockEnvTest) {
   Options options;
   options.create_if_missing = true;
   options.env = env.get();
-  DB* db;
+  std::unique_ptr<DB> db;
 
   const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
   const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
@@ -130,7 +132,7 @@ TEST_F(DBTest, MockEnvTest) {
   ASSERT_OK(iterator->status());
   delete iterator;
 
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db);
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db.get());
   ASSERT_OK(dbi->TEST_FlushMemTable());
 
   for (size_t i = 0; i < 3; ++i) {
@@ -139,7 +141,122 @@ TEST_F(DBTest, MockEnvTest) {
     ASSERT_TRUE(res == vals[i]);
   }
 
-  delete db;
+  db.reset();
+}
+
+TEST_F(DBTest, RequestIdPlumbingTest) {
+  // test that request_id is passed to the filesystem, from
+  // ReadOptions to IODebugContext
+  Options options = CurrentOptions();
+  options.env = env_;
+
+  // Create a mock environment to capture IODebugContext during reads
+  IODebugContext dbgCopy;
+  const std::string* captured_request_id_dbg;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "RandomAccessFileReader::Read:IODebugContext", [&](void* arg) {
+        IODebugContext* dbg = static_cast<IODebugContext*>(arg);
+        if (dbg == nullptr) {
+          captured_request_id_dbg = nullptr;
+        } else {
+          captured_request_id_dbg = dbg->request_id;
+          // Test IODebugContext assignment operator
+          dbgCopy = *dbg;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("k1", "v1"));
+  ASSERT_OK(Flush());
+
+  // test request_id plumbing during a get
+  {
+    const std::string test_request_id = "test_request_id_123";
+    ReadOptions read_opts;
+    read_opts.request_id = &test_request_id;
+    std::string value;
+    ASSERT_OK(db_->Get(read_opts, "k1", &value));
+
+    // Verify the request_id was propagated to the file system
+    ASSERT_NE(captured_request_id_dbg, nullptr);
+    ASSERT_EQ(*captured_request_id_dbg, test_request_id);
+
+    ASSERT_NE(dbgCopy.request_id, nullptr);
+    ASSERT_NE(dbgCopy.request_id, captured_request_id_dbg);
+    ASSERT_EQ(*dbgCopy.request_id, test_request_id);
+  }
+
+  captured_request_id_dbg = nullptr;
+
+  // test request_id plumbing during iterator seek
+  ASSERT_OK(Put("k2", "v2"));
+  ASSERT_OK(Flush());
+  {
+    ReadOptions read_opts;
+    const std::string request_id = "test_request_id_456";
+    read_opts.request_id = &request_id;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    iter->Seek("k2");
+    ASSERT_TRUE(iter->Valid());
+
+    // Verify the request_id was propagated to the file system
+    ASSERT_NE(captured_request_id_dbg, nullptr);
+    ASSERT_EQ(*captured_request_id_dbg, request_id);
+
+    ASSERT_NE(dbgCopy.request_id, nullptr);
+    ASSERT_NE(dbgCopy.request_id, captured_request_id_dbg);
+    ASSERT_EQ(*dbgCopy.request_id, request_id);
+
+    // Test IODebugContext copy constructor
+    IODebugContext dbgCopy2(dbgCopy);
+    ASSERT_NE(dbgCopy2.request_id, nullptr);
+    ASSERT_NE(dbgCopy2.request_id, captured_request_id_dbg);
+    ASSERT_NE(dbgCopy2.request_id, dbgCopy.request_id);
+    ASSERT_EQ(*dbgCopy2.request_id, request_id);
+  }
+
+  // test request_id plumbing during multiget
+  captured_request_id_dbg = nullptr;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "RandomAccessFileReader::MultiRead:IODebugContext", [&](void* arg) {
+        IODebugContext* dbg = static_cast<IODebugContext*>(arg);
+        if (dbg == nullptr) {
+          captured_request_id_dbg = nullptr;
+        } else {
+          captured_request_id_dbg = dbg->request_id;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("k3", "v3"));
+  ASSERT_OK(Put("k4", "v4"));
+  ASSERT_OK(Flush());
+
+  {
+    ReadOptions read_opts;
+    const std::string multiget_request_id = "test_request_id_789";
+    read_opts.request_id = &multiget_request_id;
+
+    std::vector<std::string> values;
+    std::vector<Slice> keys = {Slice("k3"), Slice("k4")};
+
+    values.resize(keys.size());
+
+    std::vector<ColumnFamilyHandle*> cfhs(keys.size(),
+                                          db_->DefaultColumnFamily());
+    db_->MultiGet(read_opts, cfhs, keys, &values);
+
+    ASSERT_NE(captured_request_id_dbg, nullptr);
+    ASSERT_EQ(*captured_request_id_dbg, multiget_request_id);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 }
 
 TEST_F(DBTest, MemEnvTest) {
@@ -147,7 +264,7 @@ TEST_F(DBTest, MemEnvTest) {
   Options options;
   options.create_if_missing = true;
   options.env = env.get();
-  DB* db;
+  std::unique_ptr<DB> db;
 
   const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
   const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
@@ -175,7 +292,7 @@ TEST_F(DBTest, MemEnvTest) {
   ASSERT_OK(iterator->status());
   delete iterator;
 
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db);
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db.get());
   ASSERT_OK(dbi->TEST_FlushMemTable());
 
   for (size_t i = 0; i < 3; ++i) {
@@ -184,7 +301,7 @@ TEST_F(DBTest, MemEnvTest) {
     ASSERT_TRUE(res == vals[i]);
   }
 
-  delete db;
+  db.reset();
 
   options.create_if_missing = false;
   ASSERT_OK(DB::Open(options, "/dir/db", &db));
@@ -193,7 +310,7 @@ TEST_F(DBTest, MemEnvTest) {
     ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
     ASSERT_TRUE(res == vals[i]);
   }
-  delete db;
+  db.reset();
 }
 
 TEST_F(DBTest, WriteEmptyBatch) {
@@ -961,7 +1078,9 @@ TEST_F(DBTest, WrongLevel0Config) {
   options.level0_stop_writes_trigger = 1;
   options.level0_slowdown_writes_trigger = 2;
   options.level0_file_num_compaction_trigger = 3;
-  ASSERT_OK(DB::Open(options, dbname_, &db_));
+  {
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+  }
 }
 
 TEST_F(DBTest, GetOrderedByLevels) {
@@ -1090,8 +1209,10 @@ TEST_F(DBTest, FlushSchedule) {
     t.join();
   }
 
-  auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default");
-  auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu");
+  auto default_tables =
+      GetNumberOfSstFilesForColumnFamily(db_.get(), "default");
+  auto pikachu_tables =
+      GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu");
   ASSERT_LE(default_tables, static_cast<uint64_t>(10));
   ASSERT_GT(default_tables, static_cast<uint64_t>(0));
   ASSERT_LE(pikachu_tables, static_cast<uint64_t>(10));
@@ -1161,12 +1282,6 @@ class DelayFilterFactory : public CompactionFilterFactory {
 };
 }  // anonymous namespace
 
-static std::string CompressibleString(Random* rnd, int len) {
-  std::string r;
-  test::CompressibleString(rnd, 0.8, len, &r);
-  return r;
-}
-
 TEST_F(DBTest, FailMoreDbPaths) {
   Options options = CurrentOptions();
   options.db_paths.emplace_back(dbname_, 10000000);
@@ -1381,6 +1496,246 @@ TEST_F(DBTest, MetaDataTest) {
   CheckLiveFilesMeta(live_file_meta, files_by_level);
 }
 
+TEST_F(DBTest, GetColumnFamilyMetaDataWithKeyRangeAndLevel) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+
+  int64_t temp_time = 0;
+  ASSERT_OK(options.env->GetCurrentTime(&temp_time));
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_index = 0;
+  for (int i = 0; i < 100; ++i) {
+    // Add a single blob reference to each file
+    std::string blob_index;
+    BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000,
+                          /* offset */ 1234, /* size */ 5678, kNoCompression);
+
+    WriteBatch batch;
+    ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, Key(key_index),
+                                               blob_index));
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+    ++key_index;
+
+    // Fill up the rest of the file with random values.
+    GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+
+    ASSERT_OK(Flush());
+  }
+
+  std::vector<std::vector<FileMetaData>> files_by_level;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level);
+
+  ASSERT_OK(options.env->GetCurrentTime(&temp_time));
+
+  ColumnFamilyMetaData cf_meta;
+  // Keys in the SST files are distributed
+  // (key000000, key000100) ->File 1
+  // (key000101, key000201) -> File 2
+  // (key000202, key000302) -> File 3
+  // (key009999, key010099) -> File 100
+
+  // With keySlice (key000050, key000150) => should only pick 2 files(instead of
+  // default 100 that is in the level)
+  auto startKey = Slice("key000050");
+  auto endKey = Slice("key000150");
+  GetColumnFamilyMetaDataOptions cf_options(startKey, endKey, 0);
+  db_->GetColumnFamilyMetaData(cf_options, &cf_meta);
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  const auto& level_meta_from_cf = cf_meta.levels[0];
+  ASSERT_EQ(level_meta_from_cf.files.size(), 2);
+  ASSERT_LT(level_meta_from_cf.files[1].smallestkey,
+            std::string(startKey.data()));
+  ASSERT_GT(level_meta_from_cf.files[0].largestkey, std::string(endKey.data()));
+
+  GetColumnFamilyMetaDataOptions cf_option_default;
+  db_->GetColumnFamilyMetaData(cf_option_default, &cf_meta);
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  ASSERT_EQ(cf_meta.levels[0].files.size(), 100);
+
+  // Test with start key valid and end key unbounded
+  // This should get all files from key000150 onwards (99 files)
+  auto startKeyUnbounded = Slice("key000150");
+  GetColumnFamilyMetaDataOptions cf_options_unbounded_end(startKeyUnbounded,
+                                                          OptSlice(), 0);
+  db_->GetColumnFamilyMetaData(cf_options_unbounded_end, &cf_meta);
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  ASSERT_EQ(cf_meta.levels[0].files.size(), 99);
+
+  // Test with end key valid and start key unbounded
+  // This should get all files from beginning to key000250 ( 3 files)
+  auto endKeyUnbounded = Slice("key000250");
+  GetColumnFamilyMetaDataOptions cf_options_unbounded_start(OptSlice(),
+                                                            endKeyUnbounded, 0);
+  db_->GetColumnFamilyMetaData(cf_options_unbounded_start, &cf_meta);
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  ASSERT_EQ(cf_meta.levels[0].files.size(), 3);
+}
+
+TEST_F(DBTest, GetColumnFamilyMetaDataBottommostLevel) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_index = 0;
+
+  for (int i = 0; i < 100; ++i) {
+    GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+    ASSERT_OK(Flush());
+  }
+
+  CompactRangeOptions compact_options;
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForce;
+  compact_options.change_level = true;
+  compact_options.target_level = 6;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+  // Nothing on Level 0 after compaction
+  ColumnFamilyMetaData cf_meta;
+  GetColumnFamilyMetaDataOptions cf_options_0(OptSlice(), OptSlice(), 0);
+  db_->GetColumnFamilyMetaData(cf_options_0, &cf_meta);
+
+  ASSERT_EQ(cf_meta.levels.size(), 0);
+  ASSERT_EQ(cf_meta.file_count, 0);
+
+  // Data should be in Level 6
+  GetColumnFamilyMetaDataOptions cf_options(OptSlice(), OptSlice(), 6);
+  db_->GetColumnFamilyMetaData(cf_options, &cf_meta);
+
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  ASSERT_EQ(cf_meta.levels[0].level, 6);
+  ASSERT_GT(cf_meta.levels[0].files.size(), 0);
+  size_t all_files = cf_meta.levels[0].files.size();
+
+  // Keys in the SST files are distributed across level 6
+  // Test with key range - should only return files within the range
+  auto startKey = Slice("key000050");
+  auto endKey = Slice("key000150");
+  GetColumnFamilyMetaDataOptions cf_options_range(startKey, endKey, 6);
+  db_->GetColumnFamilyMetaData(cf_options_range, &cf_meta);
+
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  ASSERT_EQ(cf_meta.levels[0].level, 6);
+  ASSERT_GT(cf_meta.levels[0].files.size(), 0);
+  size_t files_in_range = cf_meta.levels[0].files.size();
+
+  // Files in range should be less than or equal to all files
+  ASSERT_LE(files_in_range, all_files);
+}
+
+TEST_F(DBTest, GetColumnFamilyMetaDataMultipleLevels) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_index = 0;
+
+  for (int i = 0; i < 50; ++i) {
+    GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+    ASSERT_OK(Flush());
+  }
+
+  CompactRangeOptions compact_options;
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForce;
+  compact_options.change_level = true;
+  compact_options.target_level = 6;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+  for (int i = 0; i < 30; ++i) {
+    GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+    ASSERT_OK(Flush());
+  }
+
+  // First verify both levels have files without key range filter
+  ColumnFamilyMetaData cf_meta_all_no_range;
+  GetColumnFamilyMetaDataOptions cf_options_all_no_range;
+  db_->GetColumnFamilyMetaData(cf_options_all_no_range, &cf_meta_all_no_range);
+
+  bool has_level_0 = false;
+  bool has_level_6 = false;
+  for (const auto& level : cf_meta_all_no_range.levels) {
+    if (level.level == 0 && level.files.size() > 0) {
+      has_level_0 = true;
+    }
+    if (level.level == 6 && level.files.size() > 0) {
+      has_level_6 = true;
+    }
+  }
+
+  ASSERT_TRUE(has_level_0);
+  ASSERT_TRUE(has_level_6);
+
+  // Test querying bottommost level only with key range
+  // Use a range that should be in the first set of files (now in level 6)
+  auto startKey = Slice("key000050");
+  auto endKey = Slice("key000150");
+  ColumnFamilyMetaData cf_meta_bottommost;
+  GetColumnFamilyMetaDataOptions cf_options_bottommost(startKey, endKey, 6);
+  db_->GetColumnFamilyMetaData(cf_options_bottommost, &cf_meta_bottommost);
+
+  ASSERT_EQ(cf_meta_bottommost.levels.size(), 1);
+  ASSERT_EQ(cf_meta_bottommost.levels[0].level, 6);
+  ASSERT_GT(cf_meta_bottommost.levels[0].files.size(), 0);
+  size_t level_6_files_in_range = cf_meta_bottommost.levels[0].files.size();
+
+  // Test querying all levels with same key range
+  ColumnFamilyMetaData cf_meta_all;
+  GetColumnFamilyMetaDataOptions cf_options_all(startKey, endKey);
+  db_->GetColumnFamilyMetaData(cf_options_all, &cf_meta_all);
+
+  size_t level_6_files_in_range_from_all = 0;
+  for (const auto& level : cf_meta_all.levels) {
+    if (level.level == 6) {
+      level_6_files_in_range_from_all = level.files.size();
+    }
+  }
+
+  ASSERT_GT(level_6_files_in_range_from_all, 0);
+  ASSERT_EQ(level_6_files_in_range, level_6_files_in_range_from_all);
+}
+
+TEST_F(DBTest, GetColumnFamilyMetaDataEmptyDB) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.num_levels = 7;
+
+  DestroyAndReopen(options);
+
+  // Test on empty database
+  ColumnFamilyMetaData cf_meta_empty_db;
+  GetColumnFamilyMetaDataOptions cf_options_empty_db;
+  db_->GetColumnFamilyMetaData(cf_options_empty_db, &cf_meta_empty_db);
+
+  ASSERT_EQ(cf_meta_empty_db.levels.size(), 0);
+  ASSERT_EQ(cf_meta_empty_db.file_count, 0);
+  ASSERT_EQ(cf_meta_empty_db.size, 0);
+
+  // Test on empty database with key range
+  auto startKey = Slice("key000050");
+  auto endKey = Slice("key000150");
+  ColumnFamilyMetaData cf_meta_empty_range;
+  GetColumnFamilyMetaDataOptions cf_options_empty_range(startKey, endKey);
+  db_->GetColumnFamilyMetaData(cf_options_empty_range, &cf_meta_empty_range);
+
+  ASSERT_EQ(cf_meta_empty_range.levels.size(), 0);
+  ASSERT_EQ(cf_meta_empty_range.file_count, 0);
+  ASSERT_EQ(cf_meta_empty_range.size, 0);
+}
+
 TEST_F(DBTest, AllMetaDataTest) {
   Options options = CurrentOptions();
   options.create_if_missing = true;
@@ -2017,7 +2372,7 @@ TEST_F(DBTest, Snapshot) {
     ASSERT_OK(Put(1, "foo", "1v3"));
 
     {
-      ManagedSnapshot s3(db_);
+      ManagedSnapshot s3(db_.get());
       ASSERT_EQ(3U, GetNumSnapshots());
       ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
       ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
@@ -2374,37 +2729,43 @@ TEST_F(DBTest, DBOpen_Options) {
   ASSERT_OK(DestroyDB(dbname, options));
 
   // Does not exist, and create_if_missing == false: error
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   options.create_if_missing = false;
-  Status s = DB::Open(options, dbname, &db);
-  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+  {
+    Status s = DB::Open(options, dbname, &db);
+    ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+  }
   ASSERT_TRUE(db == nullptr);
 
   // Does not exist, and create_if_missing == true: OK
   options.create_if_missing = true;
-  s = DB::Open(options, dbname, &db);
-  ASSERT_OK(s);
+  {
+    Status s = DB::Open(options, dbname, &db);
+    ASSERT_OK(s);
+  }
   ASSERT_TRUE(db != nullptr);
 
-  delete db;
-  db = nullptr;
+  db.reset();
 
   // Does exist, and error_if_exists == true: error
   options.create_if_missing = false;
   options.error_if_exists = true;
-  s = DB::Open(options, dbname, &db);
-  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+  {
+    Status s = DB::Open(options, dbname, &db);
+    ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+  }
   ASSERT_TRUE(db == nullptr);
 
   // Does exist, and error_if_exists == false: OK
   options.create_if_missing = true;
   options.error_if_exists = false;
-  s = DB::Open(options, dbname, &db);
-  ASSERT_OK(s);
+  {
+    Status s = DB::Open(options, dbname, &db);
+    ASSERT_OK(s);
+  }
   ASSERT_TRUE(db != nullptr);
 
-  delete db;
-  db = nullptr;
+  db.reset();
 }
 
 TEST_F(DBTest, DBOpen_Change_NumLevels) {
@@ -2442,25 +2803,36 @@ TEST_F(DBTest, DestroyDBMetaDatabase) {
   ASSERT_OK(DestroyDB(dbname, options));
 
   // Setup databases
-  DB* db = nullptr;
-  ASSERT_OK(DB::Open(options, dbname, &db));
-  delete db;
-  db = nullptr;
-  ASSERT_OK(DB::Open(options, metadbname, &db));
-  delete db;
-  db = nullptr;
-  ASSERT_OK(DB::Open(options, metametadbname, &db));
-  delete db;
-  db = nullptr;
+  {
+    std::unique_ptr<DB> db;
+    ASSERT_OK(DB::Open(options, dbname, &db));
+  }
+  {
+    std::unique_ptr<DB> db;
+    ASSERT_OK(DB::Open(options, metadbname, &db));
+  }
+  {
+    std::unique_ptr<DB> db;
+    ASSERT_OK(DB::Open(options, metametadbname, &db));
+  }
 
   // Delete databases
   ASSERT_OK(DestroyDB(dbname, options));
 
   // Check if deletion worked.
   options.create_if_missing = false;
-  ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
-  ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
-  ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
+  {
+    std::unique_ptr<DB> dbptr;
+    ASSERT_TRUE(!(DB::Open(options, dbname, &dbptr)).ok());
+  }
+  {
+    std::unique_ptr<DB> dbptr;
+    ASSERT_TRUE(!(DB::Open(options, metadbname, &dbptr)).ok());
+  }
+  {
+    std::unique_ptr<DB> dbptr;
+    ASSERT_TRUE(!(DB::Open(options, metametadbname, &dbptr)).ok());
+  }
 }
 
 TEST_F(DBTest, SnapshotFiles) {
@@ -2539,13 +2911,11 @@ TEST_F(DBTest, SnapshotFiles) {
     column_families.emplace_back("default", ColumnFamilyOptions());
     column_families.emplace_back("pikachu", ColumnFamilyOptions());
     std::vector<ColumnFamilyHandle*> cf_handles;
-    DB* snapdb;
+    std::unique_ptr<DB> snapdb;
     DBOptions opts;
     opts.env = env_;
     opts.create_if_missing = false;
-    Status stat =
-        DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
-    ASSERT_OK(stat);
+    ASSERT_OK(DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb));
 
     ReadOptions roptions;
     std::string val;
@@ -2556,7 +2926,7 @@ TEST_F(DBTest, SnapshotFiles) {
     for (auto cfh : cf_handles) {
       delete cfh;
     }
-    delete snapdb;
+    snapdb.reset();
 
     // look at the new live files after we added an 'extra' key
     // and after we took the first snapshot.
@@ -2758,7 +3128,7 @@ struct MTThread {
 static void MTThreadBody(void* arg) {
   MTThread* t = static_cast<MTThread*>(arg);
   int id = t->id;
-  DB* db = t->state->test->db_;
+  DB* db = t->state->test->db_.get();
   int counter = 0;
   std::shared_ptr<SystemClock> clock = SystemClock::Default();
   auto end_micros = clock->NowMicros() + kTestSeconds * 1000000U;
@@ -2973,7 +3343,7 @@ TEST_F(DBTest, GroupCommitTest) {
     GCThread thread[kGCNumThreads];
     for (int id = 0; id < kGCNumThreads; id++) {
       thread[id].id = id;
-      thread[id].db = db_;
+      thread[id].db = db_.get();
       thread[id].done = false;
       env_->StartThread(GCThreadBody, &thread[id]);
     }
@@ -3180,6 +3550,15 @@ class ModelDB : public DB {
     return Status();
   }
 
+  using DB::GetPropertiesOfTablesByLevel;
+  Status GetPropertiesOfTablesByLevel(
+      ColumnFamilyHandle* /* column_family */,
+      std::vector<
+          std::unique_ptr<TablePropertiesCollection>>* /* props_by_level */)
+      override {
+    return Status();
+  }
+
   using DB::KeyMayExist;
   bool KeyMayExist(const ReadOptions& /*options*/,
                    ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
@@ -3331,6 +3710,8 @@ class ModelDB : public DB {
   void EnableManualCompaction() override {}
 
   void DisableManualCompaction() override {}
+  void AbortAllCompactions() override {}
+  void ResumeAllCompactions() override {}
 
   Status WaitForCompact(
       const WaitForCompactOptions& /* wait_for_compact_options */) override {
@@ -3340,11 +3721,6 @@ class ModelDB : public DB {
   using DB::NumberLevels;
   int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; }
 
-  using DB::MaxMemCompactionLevel;
-  int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override {
-    return 1;
-  }
-
   using DB::Level0StopWriteTrigger;
   int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override {
     return -1;
@@ -3401,7 +3777,7 @@ class ModelDB : public DB {
   }
 
   Status GetCurrentWalFile(
-      std::unique_ptr<LogFile>* /*current_log_file*/) override {
+      std::unique_ptr<LogFile>* /*current_wal_file*/) override {
     return Status::OK();
   }
 
@@ -3420,6 +3796,11 @@ class ModelDB : public DB {
   void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
                                ColumnFamilyMetaData* /*metadata*/) override {}
 
+  void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* /*column_family*/,
+      const GetColumnFamilyMetaDataOptions& /*options*/,
+      ColumnFamilyMetaData* /*metadata*/) override {}
+
   Status GetDbIdentity(std::string& /*identity*/) const override {
     return Status::OK();
   }
@@ -3440,6 +3821,11 @@ class ModelDB : public DB {
     return Status::OK();
   }
 
+  Status GetNewestUserDefinedTimestamp(
+      ColumnFamilyHandle* /*cf*/, std::string* /*newest_timestamp*/) override {
+    return Status::OK();
+  }
+
   ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
 
  private:
@@ -3629,8 +4015,10 @@ TEST_P(DBTestRandomized, Randomized) {
       // than return a key that is close to it.
       if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
           option_config_ != kBlockBasedTableWithPrefixHashIndex) {
-        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
-        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+        ASSERT_TRUE(
+            CompareIterators(step, &model, db_.get(), nullptr, nullptr));
+        ASSERT_TRUE(
+            CompareIterators(step, &model, db_.get(), model_snap, db_snap));
       }
 
       // Save a snapshot from each DB this time that we'll use next
@@ -3644,7 +4032,7 @@ TEST_P(DBTestRandomized, Randomized) {
       }
 
       Reopen(options);
-      ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+      ASSERT_TRUE(CompareIterators(step, &model, db_.get(), nullptr, nullptr));
 
       model_snap = model.GetSnapshot();
       db_snap = db_->GetSnapshot();
@@ -4814,7 +5202,7 @@ TEST_F(DBTest, DynamicMemtableOptions) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 namespace {
 bool VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
                           int expected_count) {
@@ -5070,7 +5458,7 @@ TEST_P(DBTestWithParam, PreShutdownManualCompaction) {
     // Compact all
     MakeTables(1, "a", "z", 1);
     ASSERT_EQ("1,0,2", FilesPerLevel(1));
-    CancelAllBackgroundWork(db_);
+    CancelAllBackgroundWork(db_.get());
     ASSERT_TRUE(
         db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)
             .IsShutdownInProgress());
@@ -5090,7 +5478,7 @@ TEST_F(DBTest, PreShutdownFlush) {
   Options options = CurrentOptions();
   CreateAndReopenWithCF({"pikachu"}, options);
   ASSERT_OK(Put(1, "key", "value"));
-  CancelAllBackgroundWork(db_);
+  CancelAllBackgroundWork(db_.get());
   Status s =
       db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
   ASSERT_TRUE(s.IsShutdownInProgress());
@@ -5171,7 +5559,7 @@ TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) {
 
   TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
   ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
-  CancelAllBackgroundWork(db_);
+  CancelAllBackgroundWork(db_.get());
   TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown");
   ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
   // Record the number of compactions at a time.
@@ -5257,7 +5645,7 @@ TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) {
   }
 
   ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
-  CancelAllBackgroundWork(db_);
+  CancelAllBackgroundWork(db_.get());
   TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
   TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
   ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
@@ -5272,279 +5660,13 @@ TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) {
   ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
 }
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 
 TEST_F(DBTest, FlushOnDestroy) {
   WriteOptions wo;
   wo.disableWAL = true;
   ASSERT_OK(Put("foo", "v1", wo));
-  CancelAllBackgroundWork(db_);
-}
-
-TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
-  if (!Snappy_Supported()) {
-    return;
-  }
-  const int kNKeys = 120;
-  int keys[kNKeys];
-  for (int i = 0; i < kNKeys; i++) {
-    keys[i] = i;
-  }
-
-  Random rnd(301);
-  Options options;
-  options.env = env_;
-  options.create_if_missing = true;
-  options.db_write_buffer_size = 20480;
-  options.write_buffer_size = 20480;
-  options.max_write_buffer_number = 2;
-  options.level0_file_num_compaction_trigger = 2;
-  options.level0_slowdown_writes_trigger = 2;
-  options.level0_stop_writes_trigger = 2;
-  options.target_file_size_base = 20480;
-  options.level_compaction_dynamic_level_bytes = true;
-  options.max_bytes_for_level_base = 102400;
-  options.max_bytes_for_level_multiplier = 4;
-  options.max_background_compactions = 1;
-  options.num_levels = 5;
-  options.statistics = CreateDBStatistics();
-
-  options.compression_per_level.resize(3);
-  // No compression for L0
-  options.compression_per_level[0] = kNoCompression;
-  // No compression for the Ln whre L0 is compacted to
-  options.compression_per_level[1] = kNoCompression;
-  // Snappy compression for Ln+1
-  options.compression_per_level[2] = kSnappyCompression;
-
-  OnFileDeletionListener* listener = new OnFileDeletionListener();
-  options.listeners.emplace_back(listener);
-
-  DestroyAndReopen(options);
-
-  // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
-  // be compressed, so there shouldn't be any compression.
-  for (int i = 0; i < 20; i++) {
-    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
-    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
-  }
-  ASSERT_OK(Flush());
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
-  ASSERT_TRUE(NumTableFilesAtLevel(0) > 0 || NumTableFilesAtLevel(4) > 0);
-
-  // Verify there was no compression
-  auto num_block_compressed =
-      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
-  ASSERT_EQ(num_block_compressed, 0);
-
-  // Insert 400KB and there will be some files end up in L3. According to the
-  // above compression settings for each level, there will be some compression.
-  ASSERT_OK(options.statistics->Reset());
-  ASSERT_EQ(num_block_compressed, 0);
-  for (int i = 20; i < 120; i++) {
-    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
-    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
-  }
-  ASSERT_OK(Flush());
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_GE(NumTableFilesAtLevel(3), 1);
-  ASSERT_GE(NumTableFilesAtLevel(4), 1);
-
-  // Verify there was compression
-  num_block_compressed =
-      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
-  ASSERT_GT(num_block_compressed, 0);
-
-  // Make sure data in files in L3 is not compacted by removing all files
-  // in L4 and calculate number of rows
-  ASSERT_OK(dbfull()->SetOptions({
-      {"disable_auto_compactions", "true"},
-  }));
-  ColumnFamilyMetaData cf_meta;
-  db_->GetColumnFamilyMetaData(&cf_meta);
-
-  // Ensure that L1+ files are non-overlapping and together with L0 encompass
-  // full key range between smallestkey and largestkey from CF file metadata.
-  int largestkey_in_prev_level = -1;
-  int keys_found = 0;
-  for (int level = (int)cf_meta.levels.size() - 1; level >= 0; level--) {
-    int files_in_level = (int)cf_meta.levels[level].files.size();
-    int largestkey_in_prev_file = -1;
-    for (int j = 0; j < files_in_level; j++) {
-      int smallestkey = IdFromKey(cf_meta.levels[level].files[j].smallestkey);
-      int largestkey = IdFromKey(cf_meta.levels[level].files[j].largestkey);
-      int num_entries = (int)cf_meta.levels[level].files[j].num_entries;
-      ASSERT_EQ(num_entries, largestkey - smallestkey + 1);
-      keys_found += num_entries;
-      if (level > 0) {
-        if (j == 0) {
-          ASSERT_GT(smallestkey, largestkey_in_prev_level);
-        }
-        if (j > 0) {
-          ASSERT_GT(smallestkey, largestkey_in_prev_file);
-        }
-        if (j == files_in_level - 1) {
-          largestkey_in_prev_level = largestkey;
-        }
-      }
-      largestkey_in_prev_file = largestkey;
-    }
-  }
-  ASSERT_EQ(keys_found, kNKeys);
-
-  for (const auto& file : cf_meta.levels[4].files) {
-    listener->SetExpectedFileName(dbname_ + file.name);
-    Slice start(file.smallestkey), limit(file.largestkey);
-    const RangePtr ranges(&start, &limit);
-    // Given verification from above, we're guaranteed that by deleting all the
-    // files in [<smallestkey>, <largestkey>] range, we're effectively deleting
-    // that very single file and nothing more.
-    EXPECT_OK(dbfull()->DeleteFilesInRanges(dbfull()->DefaultColumnFamily(),
-                                            &ranges, true /* include_end */));
-  }
-  listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
-
-  int num_keys = 0;
-  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    num_keys++;
-  }
-  ASSERT_OK(iter->status());
-
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_GE(NumTableFilesAtLevel(3), 1);
-  ASSERT_EQ(NumTableFilesAtLevel(4), 0);
-
-  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
-}
-
-TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
-  if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
-    return;
-  }
-  const int kNKeys = 500;
-  int keys[kNKeys];
-  for (int i = 0; i < kNKeys; i++) {
-    keys[i] = i;
-  }
-  RandomShuffle(std::begin(keys), std::end(keys));
-
-  Random rnd(301);
-  Options options;
-  options.create_if_missing = true;
-  options.db_write_buffer_size = 6000000;
-  options.write_buffer_size = 600000;
-  options.max_write_buffer_number = 2;
-  options.level0_file_num_compaction_trigger = 2;
-  options.level0_slowdown_writes_trigger = 2;
-  options.level0_stop_writes_trigger = 2;
-  options.soft_pending_compaction_bytes_limit = 1024 * 1024;
-  options.target_file_size_base = 20;
-  options.env = env_;
-  options.level_compaction_dynamic_level_bytes = true;
-  options.max_bytes_for_level_base = 200;
-  options.max_bytes_for_level_multiplier = 8;
-  options.max_background_compactions = 1;
-  options.num_levels = 5;
-  std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
-  options.table_factory = mtf;
-
-  options.compression_per_level.resize(3);
-  options.compression_per_level[0] = kNoCompression;
-  options.compression_per_level[1] = kLZ4Compression;
-  options.compression_per_level[2] = kZlibCompression;
-
-  DestroyAndReopen(options);
-  // When base level is L4, L4 is LZ4.
-  std::atomic<int> num_zlib(0);
-  std::atomic<int> num_lz4(0);
-  std::atomic<int> num_no(0);
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
-        Compaction* compaction = static_cast<Compaction*>(arg);
-        if (compaction->output_level() == 4) {
-          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
-          num_lz4.fetch_add(1);
-        }
-      });
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
-        auto* compression = static_cast<CompressionType*>(arg);
-        ASSERT_TRUE(*compression == kNoCompression);
-        num_no.fetch_add(1);
-      });
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
-
-  for (int i = 0; i < 100; i++) {
-    std::string value = rnd.RandomString(200);
-    ASSERT_OK(Put(Key(keys[i]), value));
-    if (i % 25 == 24) {
-      ASSERT_OK(Flush());
-      ASSERT_OK(dbfull()->TEST_WaitForCompact());
-    }
-  }
-
-  ASSERT_OK(Flush());
-  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
-
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
-  ASSERT_GT(NumTableFilesAtLevel(4), 0);
-  ASSERT_GT(num_no.load(), 2);
-  ASSERT_GT(num_lz4.load(), 0);
-  int prev_num_files_l4 = NumTableFilesAtLevel(4);
-
-  // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib
-  num_lz4.store(0);
-  num_no.store(0);
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
-        Compaction* compaction = static_cast<Compaction*>(arg);
-        if (compaction->output_level() == 4 && compaction->start_level() == 3) {
-          ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
-          num_zlib.fetch_add(1);
-        } else {
-          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
-          num_lz4.fetch_add(1);
-        }
-      });
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
-        auto* compression = static_cast<CompressionType*>(arg);
-        ASSERT_TRUE(*compression == kNoCompression);
-        num_no.fetch_add(1);
-      });
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
-
-  for (int i = 101; i < 500; i++) {
-    std::string value = rnd.RandomString(200);
-    ASSERT_OK(Put(Key(keys[i]), value));
-    if (i % 100 == 99) {
-      ASSERT_OK(Flush());
-      ASSERT_OK(dbfull()->TEST_WaitForCompact());
-    }
-  }
-
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_GT(NumTableFilesAtLevel(3), 0);
-  ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
-  ASSERT_GT(num_no.load(), 2);
-  ASSERT_GT(num_lz4.load(), 0);
-  ASSERT_GT(num_zlib.load(), 0);
+  CancelAllBackgroundWork(db_.get());
 }
 
 TEST_F(DBTest, DynamicCompactionOptions) {
@@ -6083,53 +6205,6 @@ TEST_F(DBTest, L0L1L2AndUpHitCounter) {
                                TestGetTickerCount(options, GET_HIT_L2_AND_UP));
 }
 
-TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
-  // iter 0 -- zlib
-  // iter 1 -- bzip2
-  // iter 2 -- lz4
-  // iter 3 -- lz4HC
-  // iter 4 -- xpress
-  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
-                                    kLZ4Compression, kLZ4HCCompression,
-                                    kXpressCompression};
-  for (auto comp : compressions) {
-    if (!CompressionTypeSupported(comp)) {
-      continue;
-    }
-    // first_table_version 1 -- generate with table_version == 1, read with
-    // table_version == 2
-    // first_table_version 2 -- generate with table_version == 2, read with
-    // table_version == 1
-    for (int first_table_version = 1; first_table_version <= 2;
-         ++first_table_version) {
-      BlockBasedTableOptions table_options;
-      table_options.format_version = first_table_version;
-      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
-      Options options = CurrentOptions();
-      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-      options.create_if_missing = true;
-      options.compression = comp;
-      DestroyAndReopen(options);
-
-      int kNumKeysWritten = 1000;
-
-      Random rnd(301);
-      for (int i = 0; i < kNumKeysWritten; ++i) {
-        // compressible string
-        ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
-      }
-
-      table_options.format_version = first_table_version == 1 ? 2 : 1;
-      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-      Reopen(options);
-      for (int i = 0; i < kNumKeysWritten; ++i) {
-        auto r = Get(Key(i));
-        ASSERT_EQ(r.substr(128), std::string(128, 'a'));
-      }
-    }
-  }
-}
-
 TEST_F(DBTest, CloseSpeedup) {
   Options options = CurrentOptions();
   options.compaction_style = kCompactionStyleLevel;
@@ -6254,9 +6329,9 @@ TEST_F(DBTest, MergeTestTime) {
 
   ASSERT_EQ(1, count);
   ASSERT_EQ(4000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0);
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 }
 
 TEST_P(DBTestWithParam, MergeCompactionTimeTest) {
@@ -6366,7 +6441,8 @@ TEST_P(DBTestWithParam, CompactionTotalTimeTest) {
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
 
   // Hard-coded number in CompactionJob::ProcessKeyValueCompaction().
-  const int kRecordStatsEvery = 1000;
+  // Uses 1024 (power of 2) for efficient bitwise check.
+  const int kRecordStatsEvery = 1024;
   // The stat COMPACTION_CPU_TOTAL_TIME should be recorded
   // during compaction and once more after compaction.
   ASSERT_EQ(n / kRecordStatsEvery + 1, record_count);
@@ -6389,7 +6465,7 @@ TEST_F(DBTest, TestLogCleanup) {
 
   for (int i = 0; i < 100000; ++i) {
     ASSERT_OK(Put(Key(i), "val"));
-    // only 2 memtables will be alive, so logs_to_free needs to always be below
+    // only 2 memtables will be alive, so wals_to_free needs to always be below
     // 2
     ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
   }
@@ -6458,7 +6534,7 @@ TEST_F(DBTest, SuggestCompactRangeTest) {
 
   // compact it three times
   for (int i = 0; i < 3; ++i) {
-    ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+    ASSERT_OK(experimental::SuggestCompactRange(db_.get(), nullptr, nullptr));
     ASSERT_OK(dbfull()->TEST_WaitForCompact());
   }
 
@@ -6471,7 +6547,7 @@ TEST_F(DBTest, SuggestCompactRangeTest) {
 
   // nonoverlapping with the file on level 0
   Slice start("a"), end("b");
-  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
   // should not compact the level 0 file
@@ -6479,7 +6555,7 @@ TEST_F(DBTest, SuggestCompactRangeTest) {
 
   start = Slice("j");
   end = Slice("m");
-  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
   // SuggestCompactRange() is not going to be reported as manual compaction
   ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
@@ -6530,7 +6606,7 @@ TEST_F(DBTest, SuggestCompactRangeUniversal) {
 
   // nonoverlapping with the file on level 0
   Slice start("a"), end("b");
-  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
   // should not compact the level 0 file
@@ -6538,7 +6614,7 @@ TEST_F(DBTest, SuggestCompactRangeUniversal) {
 
   start = Slice("j");
   end = Slice("m");
-  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
   // now it should compact the level 0 file to the last level
@@ -6575,7 +6651,7 @@ TEST_F(DBTest, PromoteL0) {
   ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // No files in L1
 
   // Promote L0 level to L2.
-  ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
+  ASSERT_OK(experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily(), 2));
   // We expect that all the files were trivially moved from L0 to L2
   ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
   ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
@@ -6600,7 +6676,7 @@ TEST_F(DBTest, PromoteL0Failure) {
 
   Status status;
   // Fails because L0 has overlapping files.
-  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  status = experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily());
   ASSERT_TRUE(status.IsInvalidArgument());
 
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
@@ -6610,7 +6686,7 @@ TEST_F(DBTest, PromoteL0Failure) {
   ASSERT_OK(Put(Key(5), ""));
   ASSERT_OK(Flush());
   // Fails because L1 is non-empty.
-  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  status = experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily());
   ASSERT_TRUE(status.IsInvalidArgument());
 }
 
@@ -7205,27 +7281,6 @@ TEST_F(DBTest, LastWriteBufferDelay) {
 }
 #endif  // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
 
-TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
-  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
-                                    kLZ4Compression, kLZ4HCCompression,
-                                    kXpressCompression};
-  for (auto comp : compressions) {
-    if (!CompressionTypeSupported(comp)) {
-      // not supported, we should fail the Open()
-      Options options = CurrentOptions();
-      options.compression = comp;
-      ASSERT_TRUE(!TryReopen(options).ok());
-      // Try if CreateColumnFamily also fails
-      options.compression = kNoCompression;
-      ASSERT_OK(TryReopen(options));
-      ColumnFamilyOptions cf_options(options);
-      cf_options.compression = comp;
-      ColumnFamilyHandle* handle;
-      ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
-    }
-  }
-}
-
 TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) {
   Options options = CurrentOptions();
   options.max_open_files = 100;
@@ -7702,7 +7757,7 @@ TEST_F(DBTest, ShuttingDownNotBlockStalledWrites) {
   });
 
   TEST_SYNC_POINT("DBTest::ShuttingDownNotBlockStalledWrites");
-  CancelAllBackgroundWork(db_, true);
+  CancelAllBackgroundWork(db_.get(), true);
 
   thd.join();
 }
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 6c4f6243719d..6129e2d923b8 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -10,7 +10,6 @@
 #include <atomic>
 #include <cstdlib>
 #include <functional>
-#include <iostream>
 #include <memory>
 
 #include "db/db_test_util.h"
@@ -40,7 +39,7 @@ class DBTest2 : public DBTestBase {
 };
 
 TEST_F(DBTest2, OpenForReadOnly) {
-  DB* db_ptr = nullptr;
+  std::unique_ptr<DB> db_ptr;
   std::string dbname = test::PerThreadDBPath("db_readonly");
   Options options = CurrentOptions();
   options.create_if_missing = true;
@@ -64,7 +63,7 @@ TEST_F(DBTest2, OpenForReadOnly) {
 }
 
 TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) {
-  DB* db_ptr = nullptr;
+  std::unique_ptr<DB> db_ptr;
   std::string dbname = test::PerThreadDBPath("db_readonly");
   Options options = CurrentOptions();
   options.create_if_missing = true;
@@ -350,9 +349,9 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
   ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
   ASSERT_OK(Flush(0));
-  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
             static_cast<uint64_t>(1));
-  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
             static_cast<uint64_t>(1));
 
   flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
@@ -372,13 +371,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   // No flush should trigger
   wait_flush();
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(1));
   }
 
@@ -388,13 +387,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
   wait_flush();
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(2));
   }
 
@@ -406,13 +405,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
   wait_flush();
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(2));
   }
 
@@ -429,13 +428,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
   wait_flush();
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(2));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(2));
   }
 
@@ -451,13 +450,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   wait_flush();
 
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(2));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(2));
   }
   if (cost_cache_) {
@@ -507,7 +506,7 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
   CreateAndReopenWithCF({"cf1", "cf2"}, options);
 
   ASSERT_OK(DestroyDB(dbname2, options));
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   ASSERT_OK(DB::Open(options, dbname2, &db2));
 
   WriteOptions wo;
@@ -517,12 +516,12 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
     ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
     ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
     ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
-    ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+    ASSERT_OK(static_cast<DBImpl*>(db2.get())->TEST_WaitForFlushMemTable());
     // Ensure background work is fully finished including listener callbacks
     // before accessing listener state.
     ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
-    ASSERT_OK(
-        static_cast_with_check<DBImpl>(db2)->TEST_WaitForBackgroundWork());
+    ASSERT_OK(static_cast_with_check<DBImpl>(db2.get())
+                  ->TEST_WaitForBackgroundWork());
   };
 
   // Trigger a flush on cf2
@@ -538,13 +537,13 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
 
   ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
   wait_flush();
-  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+  ASSERT_OK(static_cast<DBImpl*>(db2.get())->TEST_WaitForFlushMemTable());
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
-                  GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
-                  GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default") +
+                  GetNumberOfSstFilesForColumnFamily(db_.get(), "cf1") +
+                  GetNumberOfSstFilesForColumnFamily(db_.get(), "cf2"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2.get(), "default"),
               static_cast<uint64_t>(0));
   }
 
@@ -554,13 +553,13 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
   ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
   wait_flush();
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf1"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf2"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2.get(), "default"),
               static_cast<uint64_t>(0));
   }
 
@@ -569,19 +568,19 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
   wait_flush();
   ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
   wait_flush();
-  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+  ASSERT_OK(static_cast<DBImpl*>(db2.get())->TEST_WaitForFlushMemTable());
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf1"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf2"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2.get(), "default"),
               static_cast<uint64_t>(1));
   }
 
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
@@ -786,7 +785,7 @@ TEST_F(DBTest2, WalFilterTest) {
     while (true) {
       // Ensure that expected keys exists
       // and not expected keys don't exist after recovery
-      ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+      ValidateKeyExistence(db_.get(), keys_must_exist, keys_must_not_exist);
 
       if (checked_after_reopen) {
         break;
@@ -923,7 +922,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
   while (true) {
     // Ensure that expected keys exists
     // and not expected keys don't exist after recovery
-    ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+    ValidateKeyExistence(db_.get(), keys_must_exist, keys_must_not_exist);
 
     if (checked_after_reopen) {
       break;
@@ -1005,7 +1004,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
     }
   }
 
-  ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+  ValidateKeyExistence(db_.get(), keys_must_exist, keys_must_not_exist);
 }
 
 TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
@@ -1186,705 +1185,6 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
   ASSERT_EQ(index, keys_cf.size());
 }
 
-TEST_F(DBTest2, PresetCompressionDict) {
-  // Verifies that compression ratio improves when dictionary is enabled, and
-  // improves even further when the dictionary is trained by ZSTD.
-  const size_t kBlockSizeBytes = 4 << 10;
-  const size_t kL0FileBytes = 128 << 10;
-  const size_t kApproxPerBlockOverheadBytes = 50;
-  const int kNumL0Files = 5;
-
-  Options options;
-  // Make sure to use any custom env that the test is configured with.
-  options.env = CurrentOptions().env;
-  options.allow_concurrent_memtable_write = false;
-  options.arena_block_size = kBlockSizeBytes;
-  options.create_if_missing = true;
-  options.disable_auto_compactions = true;
-  options.level0_file_num_compaction_trigger = kNumL0Files;
-  options.memtable_factory.reset(
-      test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
-  options.num_levels = 2;
-  options.target_file_size_base = kL0FileBytes;
-  options.target_file_size_multiplier = 2;
-  options.write_buffer_size = kL0FileBytes;
-  BlockBasedTableOptions table_options;
-  table_options.block_size = kBlockSizeBytes;
-  std::vector<CompressionType> compression_types;
-  if (Zlib_Supported()) {
-    compression_types.push_back(kZlibCompression);
-  }
-#if LZ4_VERSION_NUMBER >= 10400  // r124+
-  compression_types.push_back(kLZ4Compression);
-  compression_types.push_back(kLZ4HCCompression);
-#endif  // LZ4_VERSION_NUMBER >= 10400
-  if (ZSTD_Supported()) {
-    compression_types.push_back(kZSTD);
-  }
-
-  enum DictionaryTypes : int {
-    kWithoutDict,
-    kWithDict,
-    kWithZSTDfinalizeDict,
-    kWithZSTDTrainedDict,
-    kDictEnd,
-  };
-
-  for (auto compression_type : compression_types) {
-    options.compression = compression_type;
-    size_t bytes_without_dict = 0;
-    size_t bytes_with_dict = 0;
-    size_t bytes_with_zstd_finalize_dict = 0;
-    size_t bytes_with_zstd_trained_dict = 0;
-    for (int i = kWithoutDict; i < kDictEnd; i++) {
-      // First iteration: compress without preset dictionary
-      // Second iteration: compress with preset dictionary
-      // Third iteration (zstd only): compress with zstd-trained dictionary
-      //
-      // To make sure the compression dictionary has the intended effect, we
-      // verify the compressed size is smaller in successive iterations. Also in
-      // the non-first iterations, verify the data we get out is the same data
-      // we put in.
-      switch (i) {
-        case kWithoutDict:
-          options.compression_opts.max_dict_bytes = 0;
-          options.compression_opts.zstd_max_train_bytes = 0;
-          break;
-        case kWithDict:
-          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
-          options.compression_opts.zstd_max_train_bytes = 0;
-          break;
-        case kWithZSTDfinalizeDict:
-          if (compression_type != kZSTD ||
-              !ZSTD_FinalizeDictionarySupported()) {
-            continue;
-          }
-          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
-          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
-          options.compression_opts.use_zstd_dict_trainer = false;
-          break;
-        case kWithZSTDTrainedDict:
-          if (compression_type != kZSTD || !ZSTD_TrainDictionarySupported()) {
-            continue;
-          }
-          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
-          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
-          options.compression_opts.use_zstd_dict_trainer = true;
-          break;
-        default:
-          assert(false);
-      }
-
-      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-      CreateAndReopenWithCF({"pikachu"}, options);
-      Random rnd(301);
-      std::string seq_datas[10];
-      for (int j = 0; j < 10; ++j) {
-        seq_datas[j] =
-            rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
-      }
-
-      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
-      for (int j = 0; j < kNumL0Files; ++j) {
-        for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
-          auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
-          ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
-                        seq_datas[(key_num / 10) % 10]));
-        }
-        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
-        ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
-      }
-      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
-                                            true /* disallow_trivial_move */));
-      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
-      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
-
-      // Get the live sst files size
-      size_t total_sst_bytes = TotalSize(1);
-      if (i == kWithoutDict) {
-        bytes_without_dict = total_sst_bytes;
-      } else if (i == kWithDict) {
-        bytes_with_dict = total_sst_bytes;
-      } else if (i == kWithZSTDfinalizeDict) {
-        bytes_with_zstd_finalize_dict = total_sst_bytes;
-      } else if (i == kWithZSTDTrainedDict) {
-        bytes_with_zstd_trained_dict = total_sst_bytes;
-      }
-
-      for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
-           j++) {
-        ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
-      }
-      if (i == kWithDict) {
-        ASSERT_GT(bytes_without_dict, bytes_with_dict);
-      } else if (i == kWithZSTDTrainedDict) {
-        // In zstd compression, it is sometimes possible that using a finalized
-        // dictionary does not get as good a compression ratio as raw content
-        // dictionary. But using a dictionary should always get better
-        // compression ratio than not using one.
-        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict ||
-                    bytes_without_dict > bytes_with_zstd_finalize_dict);
-      } else if (i == kWithZSTDTrainedDict) {
-        // In zstd compression, it is sometimes possible that using a trained
-        // dictionary does not get as good a compression ratio as without
-        // training.
-        // But using a dictionary (with or without training) should always get
-        // better compression ratio than not using one.
-        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
-                    bytes_without_dict > bytes_with_zstd_trained_dict);
-      }
-
-      DestroyAndReopen(options);
-    }
-  }
-}
-
-TEST_F(DBTest2, PresetCompressionDictLocality) {
-  if (!ZSTD_Supported()) {
-    return;
-  }
-  // Verifies that compression dictionary is generated from local data. The
-  // verification simply checks all output SSTs have different compression
-  // dictionaries. We do not verify effectiveness as that'd likely be flaky in
-  // the future.
-  const int kNumEntriesPerFile = 1 << 10;  // 1KB
-  const int kNumBytesPerEntry = 1 << 10;   // 1KB
-  const int kNumFiles = 4;
-  Options options = CurrentOptions();
-  options.compression = kZSTD;
-  options.compression_opts.max_dict_bytes = 1 << 14;        // 16KB
-  options.compression_opts.zstd_max_train_bytes = 1 << 18;  // 256KB
-  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-  options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
-  BlockBasedTableOptions table_options;
-  table_options.cache_index_and_filter_blocks = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  Reopen(options);
-
-  Random rnd(301);
-  for (int i = 0; i < kNumFiles; ++i) {
-    for (int j = 0; j < kNumEntriesPerFile; ++j) {
-      ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
-                    rnd.RandomString(kNumBytesPerEntry)));
-    }
-    ASSERT_OK(Flush());
-    MoveFilesToLevel(1);
-    ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
-  }
-
-  // Store all the dictionaries generated during a full compaction.
-  std::vector<std::string> compression_dicts;
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
-      [&](void* arg) {
-        compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
-      });
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
-  CompactRangeOptions compact_range_opts;
-  compact_range_opts.bottommost_level_compaction =
-      BottommostLevelCompaction::kForceOptimized;
-  ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
-
-  // Dictionary compression should not be so good as to compress four totally
-  // random files into one. If it does then there's probably something wrong
-  // with the test.
-  ASSERT_GT(NumTableFilesAtLevel(1), 1);
-
-  // Furthermore, there should be one compression dictionary generated per file.
-  // And they should all be different from each other.
-  ASSERT_EQ(NumTableFilesAtLevel(1),
-            static_cast<int>(compression_dicts.size()));
-  for (size_t i = 1; i < compression_dicts.size(); ++i) {
-    std::string& a = compression_dicts[i - 1];
-    std::string& b = compression_dicts[i];
-    size_t alen = a.size();
-    size_t blen = b.size();
-    ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
-  }
-}
-
-class PresetCompressionDictTest
-    : public DBTestBase,
-      public testing::WithParamInterface<std::tuple<CompressionType, bool>> {
- public:
-  PresetCompressionDictTest()
-      : DBTestBase("db_test2", false /* env_do_fsync */),
-        compression_type_(std::get<0>(GetParam())),
-        bottommost_(std::get<1>(GetParam())) {}
-
- protected:
-  const CompressionType compression_type_;
-  const bool bottommost_;
-};
-
-INSTANTIATE_TEST_CASE_P(
-    DBTest2, PresetCompressionDictTest,
-    ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()),
-                       ::testing::Bool()));
-
-TEST_P(PresetCompressionDictTest, Flush) {
-  // Verifies that dictionary is generated and written during flush only when
-  // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the
-  // size of the dictionary is within expectations according to the limit on
-  // buffering set by `CompressionOptions::max_dict_buffer_bytes`.
-  const size_t kValueLen = 256;
-  const size_t kKeysPerFile = 1 << 10;
-  const size_t kDictLen = 16 << 10;
-  const size_t kBlockLen = 4 << 10;
-
-  Options options = CurrentOptions();
-  if (bottommost_) {
-    options.bottommost_compression = compression_type_;
-    options.bottommost_compression_opts.enabled = true;
-    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
-    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
-  } else {
-    options.compression = compression_type_;
-    options.compression_opts.max_dict_bytes = kDictLen;
-    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
-  }
-  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile));
-  options.statistics = CreateDBStatistics();
-  BlockBasedTableOptions bbto;
-  bbto.block_size = kBlockLen;
-  bbto.cache_index_and_filter_blocks = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  Reopen(options);
-
-  Random rnd(301);
-  for (size_t i = 0; i <= kKeysPerFile; ++i) {
-    ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(kValueLen)));
-  }
-  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
-
-  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
-  // compression dictionary exists since dictionaries would be preloaded when
-  // the flush finishes.
-  if (bottommost_) {
-    // Flush is never considered bottommost. This should change in the future
-    // since flushed files may have nothing underneath them, like the one in
-    // this test case.
-    ASSERT_EQ(
-        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        0);
-  } else {
-    ASSERT_GT(
-        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        0);
-    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
-    // number of bytes needs to be adjusted in case the cached block is in
-    // ZSTD's digested dictionary format.
-    if (compression_type_ != kZSTD) {
-      // Although we limited buffering to `kBlockLen`, there may be up to two
-      // blocks of data included in the dictionary since we only check limit
-      // after each block is built.
-      ASSERT_LE(TestGetTickerCount(options,
-                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-                2 * kBlockLen);
-    }
-  }
-}
-
-TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
-  // Verifies that dictionary is generated and written during compaction to
-  // non-bottommost level only when `ColumnFamilyOptions::compression` enables
-  // dictionary. Also verifies the size of the dictionary is within expectations
-  // according to the limit on buffering set by
-  // `CompressionOptions::max_dict_buffer_bytes`.
-  const size_t kValueLen = 256;
-  const size_t kKeysPerFile = 1 << 10;
-  const size_t kDictLen = 16 << 10;
-  const size_t kBlockLen = 4 << 10;
-
-  Options options = CurrentOptions();
-  if (bottommost_) {
-    options.bottommost_compression = compression_type_;
-    options.bottommost_compression_opts.enabled = true;
-    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
-    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
-  } else {
-    options.compression = compression_type_;
-    options.compression_opts.max_dict_bytes = kDictLen;
-    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
-  }
-  options.disable_auto_compactions = true;
-  options.statistics = CreateDBStatistics();
-  BlockBasedTableOptions bbto;
-  bbto.block_size = kBlockLen;
-  bbto.cache_index_and_filter_blocks = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  Reopen(options);
-
-  Random rnd(301);
-  for (size_t j = 0; j <= kKeysPerFile; ++j) {
-    ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
-  }
-  ASSERT_OK(Flush());
-  MoveFilesToLevel(2);
-
-  for (int i = 0; i < 2; ++i) {
-    for (size_t j = 0; j <= kKeysPerFile; ++j) {
-      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
-    }
-    ASSERT_OK(Flush());
-  }
-  ASSERT_EQ("2,0,1", FilesPerLevel(0));
-
-  uint64_t prev_compression_dict_bytes_inserted =
-      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
-  // This L0->L1 compaction merges the two L0 files into L1. The produced L1
-  // file is not bottommost due to the existing L2 file covering the same key-
-  // range.
-  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
-  ASSERT_EQ("0,1,1", FilesPerLevel(0));
-  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
-  // compression dictionary exists since dictionaries would be preloaded when
-  // the compaction finishes.
-  if (bottommost_) {
-    ASSERT_EQ(
-        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        prev_compression_dict_bytes_inserted);
-  } else {
-    ASSERT_GT(
-        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        prev_compression_dict_bytes_inserted);
-    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
-    // number of bytes needs to be adjusted in case the cached block is in
-    // ZSTD's digested dictionary format.
-    if (compression_type_ != kZSTD) {
-      // Although we limited buffering to `kBlockLen`, there may be up to two
-      // blocks of data included in the dictionary since we only check limit
-      // after each block is built.
-      ASSERT_LE(TestGetTickerCount(options,
-                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-                prev_compression_dict_bytes_inserted + 2 * kBlockLen);
-    }
-  }
-}
-
-TEST_P(PresetCompressionDictTest, CompactBottommost) {
-  // Verifies that dictionary is generated and written during compaction to
-  // non-bottommost level only when either `ColumnFamilyOptions::compression` or
-  // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also
-  // verifies the size of the dictionary is within expectations according to the
-  // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`.
-  const size_t kValueLen = 256;
-  const size_t kKeysPerFile = 1 << 10;
-  const size_t kDictLen = 16 << 10;
-  const size_t kBlockLen = 4 << 10;
-
-  Options options = CurrentOptions();
-  if (bottommost_) {
-    options.bottommost_compression = compression_type_;
-    options.bottommost_compression_opts.enabled = true;
-    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
-    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
-  } else {
-    options.compression = compression_type_;
-    options.compression_opts.max_dict_bytes = kDictLen;
-    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
-  }
-  options.disable_auto_compactions = true;
-  options.statistics = CreateDBStatistics();
-  BlockBasedTableOptions bbto;
-  bbto.block_size = kBlockLen;
-  bbto.cache_index_and_filter_blocks = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  Reopen(options);
-
-  Random rnd(301);
-  for (int i = 0; i < 2; ++i) {
-    for (size_t j = 0; j <= kKeysPerFile; ++j) {
-      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
-    }
-    ASSERT_OK(Flush());
-  }
-  ASSERT_EQ("2", FilesPerLevel(0));
-
-  uint64_t prev_compression_dict_bytes_inserted =
-      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
-  CompactRangeOptions cro;
-  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
-  ASSERT_EQ("0,1", FilesPerLevel(0));
-  ASSERT_GT(
-      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-      prev_compression_dict_bytes_inserted);
-  // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
-  // number of bytes needs to be adjusted in case the cached block is in ZSTD's
-  // digested dictionary format.
-  if (compression_type_ != kZSTD) {
-    // Although we limited buffering to `kBlockLen`, there may be up to two
-    // blocks of data included in the dictionary since we only check limit after
-    // each block is built.
-    ASSERT_LE(
-        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        prev_compression_dict_bytes_inserted + 2 * kBlockLen);
-  }
-}
-
-class CompactionCompressionListener : public EventListener {
- public:
-  explicit CompactionCompressionListener(Options* db_options)
-      : db_options_(db_options) {}
-
-  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
-    // Figure out last level with files
-    int bottommost_level = 0;
-    for (int level = 0; level < db->NumberLevels(); level++) {
-      std::string files_at_level;
-      ASSERT_TRUE(
-          db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level),
-                          &files_at_level));
-      if (files_at_level != "0") {
-        bottommost_level = level;
-      }
-    }
-
-    if (db_options_->bottommost_compression != kDisableCompressionOption &&
-        ci.output_level == bottommost_level) {
-      ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
-    } else if (db_options_->compression_per_level.size() != 0) {
-      ASSERT_EQ(ci.compression,
-                db_options_->compression_per_level[ci.output_level]);
-    } else {
-      ASSERT_EQ(ci.compression, db_options_->compression);
-    }
-    max_level_checked = std::max(max_level_checked, ci.output_level);
-  }
-
-  int max_level_checked = 0;
-  const Options* db_options_;
-};
-
-enum CompressionFailureType {
-  kTestCompressionFail,
-  kTestDecompressionFail,
-  kTestDecompressionCorruption
-};
-
-class CompressionFailuresTest
-    : public DBTest2,
-      public testing::WithParamInterface<std::tuple<
-          CompressionFailureType, CompressionType, uint32_t, uint32_t>> {
- public:
-  CompressionFailuresTest() {
-    std::tie(compression_failure_type_, compression_type_,
-             compression_max_dict_bytes_, compression_parallel_threads_) =
-        GetParam();
-  }
-
-  CompressionFailureType compression_failure_type_ = kTestCompressionFail;
-  CompressionType compression_type_ = kNoCompression;
-  uint32_t compression_max_dict_bytes_ = 0;
-  uint32_t compression_parallel_threads_ = 0;
-};
-
-INSTANTIATE_TEST_CASE_P(
-    DBTest2, CompressionFailuresTest,
-    ::testing::Combine(::testing::Values(kTestCompressionFail,
-                                         kTestDecompressionFail,
-                                         kTestDecompressionCorruption),
-                       ::testing::ValuesIn(GetSupportedCompressions()),
-                       ::testing::Values(0, 10), ::testing::Values(1, 4)));
-
-TEST_P(CompressionFailuresTest, CompressionFailures) {
-  if (compression_type_ == kNoCompression) {
-    return;
-  }
-
-  Options options = CurrentOptions();
-  options.level0_file_num_compaction_trigger = 2;
-  options.max_bytes_for_level_base = 1024;
-  options.max_bytes_for_level_multiplier = 2;
-  options.num_levels = 7;
-  options.max_background_compactions = 1;
-  options.target_file_size_base = 512;
-
-  BlockBasedTableOptions table_options;
-  table_options.block_size = 512;
-  table_options.verify_compression = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-
-  options.compression = compression_type_;
-  options.compression_opts.parallel_threads = compression_parallel_threads_;
-  options.compression_opts.max_dict_bytes = compression_max_dict_bytes_;
-  options.bottommost_compression_opts.parallel_threads =
-      compression_parallel_threads_;
-  options.bottommost_compression_opts.max_dict_bytes =
-      compression_max_dict_bytes_;
-
-  if (compression_failure_type_ == kTestCompressionFail) {
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-        "CompressData:TamperWithReturnValue", [](void* arg) {
-          bool* ret = static_cast<bool*>(arg);
-          *ret = false;
-        });
-  } else if (compression_failure_type_ == kTestDecompressionFail) {
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-        "UncompressBlockData:TamperWithReturnValue", [](void* arg) {
-          Status* ret = static_cast<Status*>(arg);
-          ASSERT_OK(*ret);
-          *ret = Status::Corruption("kTestDecompressionFail");
-        });
-  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-        "UncompressBlockData:"
-        "TamperWithDecompressionOutput",
-        [](void* arg) {
-          BlockContents* contents = static_cast<BlockContents*>(arg);
-          // Ensure uncompressed data != original data
-          const size_t len = contents->data.size() + 1;
-          std::unique_ptr<char[]> fake_data(new char[len]());
-          *contents = BlockContents(std::move(fake_data), len);
-        });
-  }
-
-  std::map<std::string, std::string> key_value_written;
-
-  const int kKeySize = 5;
-  const int kValUnitSize = 16;
-  const int kValSize = 256;
-  Random rnd(405);
-
-  Status s = Status::OK();
-
-  DestroyAndReopen(options);
-  // Write 10 random files
-  for (int i = 0; i < 10; i++) {
-    for (int j = 0; j < 5; j++) {
-      std::string key = rnd.RandomString(kKeySize);
-      // Ensure good compression ratio
-      std::string valueUnit = rnd.RandomString(kValUnitSize);
-      std::string value;
-      for (int k = 0; k < kValSize; k += kValUnitSize) {
-        value += valueUnit;
-      }
-      s = Put(key, value);
-      if (compression_failure_type_ == kTestCompressionFail) {
-        key_value_written[key] = value;
-        ASSERT_OK(s);
-      }
-    }
-    s = Flush();
-    if (compression_failure_type_ == kTestCompressionFail) {
-      ASSERT_OK(s);
-    }
-    s = dbfull()->TEST_WaitForCompact();
-    if (compression_failure_type_ == kTestCompressionFail) {
-      ASSERT_OK(s);
-    }
-    if (i == 4) {
-      // Make compression fail at the mid of table building
-      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
-    }
-  }
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
-
-  if (compression_failure_type_ == kTestCompressionFail) {
-    // Should be kNoCompression, check content consistency
-    std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
-    for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
-      std::string key = db_iter->key().ToString();
-      std::string value = db_iter->value().ToString();
-      ASSERT_NE(key_value_written.find(key), key_value_written.end());
-      ASSERT_EQ(key_value_written[key], value);
-      key_value_written.erase(key);
-    }
-    ASSERT_OK(db_iter->status());
-    ASSERT_EQ(0, key_value_written.size());
-  } else if (compression_failure_type_ == kTestDecompressionFail) {
-    ASSERT_EQ(std::string(s.getState()),
-              "Could not decompress: kTestDecompressionFail");
-  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
-    ASSERT_EQ(std::string(s.getState()),
-              "Decompressed block did not match pre-compression block");
-  }
-}
-
-TEST_F(DBTest2, CompressionOptions) {
-  if (!Zlib_Supported() || !Snappy_Supported()) {
-    return;
-  }
-
-  Options options = CurrentOptions();
-  options.level0_file_num_compaction_trigger = 2;
-  options.max_bytes_for_level_base = 100;
-  options.max_bytes_for_level_multiplier = 2;
-  options.num_levels = 7;
-  options.max_background_compactions = 1;
-
-  CompactionCompressionListener* listener =
-      new CompactionCompressionListener(&options);
-  options.listeners.emplace_back(listener);
-
-  const int kKeySize = 5;
-  const int kValSize = 20;
-  Random rnd(301);
-
-  std::vector<uint32_t> compression_parallel_threads = {1, 4};
-
-  std::map<std::string, std::string> key_value_written;
-
-  for (int iter = 0; iter <= 2; iter++) {
-    listener->max_level_checked = 0;
-
-    if (iter == 0) {
-      // Use different compression algorithms for different levels but
-      // always use Zlib for bottommost level
-      options.compression_per_level = {kNoCompression,     kNoCompression,
-                                       kNoCompression,     kSnappyCompression,
-                                       kSnappyCompression, kSnappyCompression,
-                                       kZlibCompression};
-      options.compression = kNoCompression;
-      options.bottommost_compression = kZlibCompression;
-    } else if (iter == 1) {
-      // Use Snappy except for bottommost level use ZLib
-      options.compression_per_level = {};
-      options.compression = kSnappyCompression;
-      options.bottommost_compression = kZlibCompression;
-    } else if (iter == 2) {
-      // Use Snappy everywhere
-      options.compression_per_level = {};
-      options.compression = kSnappyCompression;
-      options.bottommost_compression = kDisableCompressionOption;
-    }
-
-    for (auto num_threads : compression_parallel_threads) {
-      options.compression_opts.parallel_threads = num_threads;
-      options.bottommost_compression_opts.parallel_threads = num_threads;
-
-      DestroyAndReopen(options);
-      // Write 10 random files
-      for (int i = 0; i < 10; i++) {
-        for (int j = 0; j < 5; j++) {
-          std::string key = rnd.RandomString(kKeySize);
-          std::string value = rnd.RandomString(kValSize);
-          key_value_written[key] = value;
-          ASSERT_OK(Put(key, value));
-        }
-        ASSERT_OK(Flush());
-        ASSERT_OK(dbfull()->TEST_WaitForCompact());
-      }
-
-      // Make sure that we wrote enough to check all 7 levels
-      ASSERT_EQ(listener->max_level_checked, 6);
-
-      // Make sure database content is the same as key_value_written
-      std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
-      for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
-        std::string key = db_iter->key().ToString();
-        std::string value = db_iter->value().ToString();
-        ASSERT_NE(key_value_written.find(key), key_value_written.end());
-        ASSERT_EQ(key_value_written[key], value);
-        key_value_written.erase(key);
-      }
-      ASSERT_OK(db_iter->status());
-      ASSERT_EQ(0, key_value_written.size());
-    }
-  }
-}
-
 class CompactionStallTestListener : public EventListener {
  public:
   CompactionStallTestListener()
@@ -1992,7 +1292,7 @@ TEST_F(DBTest2, DuplicateSnapshot) {
   Options options;
   options = CurrentOptions(options);
   std::vector<const Snapshot*> snapshots;
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
   SequenceNumber oldest_ww_snap, first_ww_snap;
 
   ASSERT_OK(Put("k", "v"));  // inc seq
@@ -3010,7 +2310,7 @@ TEST_F(DBTest2, PausingManualCompaction1) {
       "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
         auto paused = static_cast<std::atomic<int>*>(arg);
         // CompactFiles() relies on manual_compactions_paused to
-        // determine if thie compaction should be paused or not
+        // determine if this compaction should be paused or not
         ASSERT_EQ(0, paused->load(std::memory_order_acquire));
         paused->fetch_add(1, std::memory_order_release);
       });
@@ -3122,6 +2422,7 @@ TEST_F(DBTest2, PausingManualCompaction3) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
 
   dbfull()->DisableManualCompaction();
+
   ASSERT_TRUE(dbfull()
                   ->CompactRange(compact_options, nullptr, nullptr)
                   .IsManualCompactionPaused());
@@ -4393,16 +3694,16 @@ TEST_F(DBTest2, TraceAndReplay) {
 
   // Using a different name than db2, to pacify infer's use-after-lifetime
   // warnings (http://fbinfer.com).
-  DB* db2_init = nullptr;
+  std::unique_ptr<DB> db2_init;
   options.create_if_missing = true;
   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
   ColumnFamilyHandle* cf;
   ASSERT_OK(
       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
   delete cf;
-  delete db2_init;
+  db2_init.reset();
 
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   std::vector<ColumnFamilyDescriptor> column_families;
   ColumnFamilyOptions cf_options;
   cf_options.merge_operator = MergeOperators::CreatePutOperator();
@@ -4489,7 +3790,7 @@ TEST_F(DBTest2, TraceAndReplay) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 }
 
@@ -4584,16 +3885,16 @@ TEST_F(DBTest2, TraceAndManualReplay) {
 
   // Using a different name than db2, to pacify infer's use-after-lifetime
   // warnings (http://fbinfer.com).
-  DB* db2_init = nullptr;
+  std::unique_ptr<DB> db2_init;
   options.create_if_missing = true;
   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
   ColumnFamilyHandle* cf;
   ASSERT_OK(
       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
   delete cf;
-  delete db2_init;
+  db2_init.reset();
 
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   std::vector<ColumnFamilyDescriptor> column_families;
   ColumnFamilyOptions cf_options;
   cf_options.merge_operator = MergeOperators::CreatePutOperator();
@@ -4829,7 +4130,7 @@ TEST_F(DBTest2, TraceAndManualReplay) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 }
 
@@ -4860,16 +4161,16 @@ TEST_F(DBTest2, TraceWithLimit) {
 
   // Using a different name than db2, to pacify infer's use-after-lifetime
   // warnings (http://fbinfer.com).
-  DB* db2_init = nullptr;
+  std::unique_ptr<DB> db2_init;
   options.create_if_missing = true;
   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
   ColumnFamilyHandle* cf;
   ASSERT_OK(
       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
   delete cf;
-  delete db2_init;
+  db2_init.reset();
 
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   std::vector<ColumnFamilyDescriptor> column_families;
   ColumnFamilyOptions cf_options;
   cf_options.merge_operator = MergeOperators::CreatePutOperator();
@@ -4902,7 +4203,7 @@ TEST_F(DBTest2, TraceWithLimit) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 }
 
@@ -4934,16 +4235,16 @@ TEST_F(DBTest2, TraceWithSampling) {
 
   // Using a different name than db2, to pacify infer's use-after-lifetime
   // warnings (http://fbinfer.com).
-  DB* db2_init = nullptr;
+  std::unique_ptr<DB> db2_init;
   options.create_if_missing = true;
   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
   ColumnFamilyHandle* cf;
   ASSERT_OK(
       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
   delete cf;
-  delete db2_init;
+  db2_init.reset();
 
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   std::vector<ColumnFamilyDescriptor> column_families;
   ColumnFamilyOptions cf_options;
   column_families.emplace_back("default", cf_options);
@@ -4978,7 +4279,7 @@ TEST_F(DBTest2, TraceWithSampling) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 }
 
@@ -5038,16 +4339,16 @@ TEST_F(DBTest2, TraceWithFilter) {
 
   // Using a different name than db2, to pacify infer's use-after-lifetime
   // warnings (http://fbinfer.com).
-  DB* db2_init = nullptr;
+  std::unique_ptr<DB> db2_init;
   options.create_if_missing = true;
   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
   ColumnFamilyHandle* cf;
   ASSERT_OK(
       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
   delete cf;
-  delete db2_init;
+  db2_init.reset();
 
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   std::vector<ColumnFamilyDescriptor> column_families;
   ColumnFamilyOptions cf_options;
   cf_options.merge_operator = MergeOperators::CreatePutOperator();
@@ -5083,28 +4384,28 @@ TEST_F(DBTest2, TraceWithFilter) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 
   // Set up a new db.
   std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read");
   ASSERT_OK(DestroyDB(dbname3, options));
 
-  DB* db3_init = nullptr;
+  std::unique_ptr<DB> db3_init;
   options.create_if_missing = true;
   ColumnFamilyHandle* cf3;
   ASSERT_OK(DB::Open(options, dbname3, &db3_init));
   ASSERT_OK(
       db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
   delete cf3;
-  delete db3_init;
+  db3_init.reset();
 
   column_families.clear();
   column_families.emplace_back("default", cf_options);
   column_families.emplace_back("pikachu", ColumnFamilyOptions());
   handles.clear();
 
-  DB* db3 = nullptr;
+  std::unique_ptr<DB> db3;
   ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3));
 
   env_->SleepForMicroseconds(100);
@@ -5134,7 +4435,7 @@ TEST_F(DBTest2, TraceWithFilter) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db3;
+  db3.reset();
   ASSERT_OK(DestroyDB(dbname3, options));
 
   std::unique_ptr<TraceReader> trace_reader3;
@@ -5325,7 +4626,7 @@ TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
   CreateColumnFamilies({"test1", "test2"}, Options());
   ASSERT_EQ(handles_.size(), 2);
 
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
   port::Thread user_thread1([&]() {
     auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
     ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
@@ -5421,6 +4722,103 @@ TEST_F(DBTest2, TestCompactFiles) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 }
 
+TEST_F(DBTest2, TestCancelCompactFiles) {
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.env = env_;
+  options.num_levels = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  auto* handle = db_->DefaultColumnFamily();
+  ASSERT_EQ(db_->NumberLevels(handle), 2);
+
+  ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
+      ROCKSDB_NAMESPACE::EnvOptions(), options};
+
+  // ingest large SST files
+  std::vector<std::string> external_sst_file_names;
+  int key_counter = 0;
+  const int num_keys_per_file = 100000;
+  const int num_files = 10;
+  for (int i = 0; i < num_files; ++i) {
+    std::string file_name =
+        dbname_ + "/test_compact_files" + std::to_string(i) + ".sst_t";
+    external_sst_file_names.push_back(file_name);
+    ASSERT_OK(sst_file_writer.Open(file_name));
+    for (int j = 0; j < num_keys_per_file; ++j) {
+      ASSERT_OK(sst_file_writer.Put(Key(j + num_keys_per_file * key_counter),
+                                    std::to_string(j)));
+    }
+    key_counter += 1;
+    ASSERT_OK(sst_file_writer.Finish());
+  }
+
+  ASSERT_OK(db_->IngestExternalFile(handle, external_sst_file_names,
+                                    IngestExternalFileOptions()));
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files);
+  std::vector<std::string> files;
+  GetSstFiles(env_, dbname_, &files);
+  ASSERT_EQ(files.size(), num_files);
+
+  // Test that 0 compactions happen - canceled is set to True initially
+  CompactionOptions compaction_options;
+  std::atomic<bool> canceled(true);
+  compaction_options.canceled = &canceled;
+
+  ASSERT_TRUE(db_->CompactFiles(compaction_options, handle, files, 1)
+                  .IsManualCompactionPaused());
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files);
+
+  // Test cancellation before the check to cancel compaction happens -
+  // compaction should not occur
+  bool disable_compaction = false;
+  compaction_options.canceled->store(false, std::memory_order_release);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TestCancelCompactFiles:SuccessfulCompaction", [&](void* arg) {
+        auto paused = static_cast<std::atomic<int>*>(arg);
+        if (disable_compaction) {
+          db_->DisableManualCompaction();
+          ASSERT_EQ(1, paused->load(std::memory_order_acquire));
+        } else {
+          compaction_options.canceled->store(true, std::memory_order_release);
+          ASSERT_EQ(0, paused->load(std::memory_order_acquire));
+        }
+      });
+
+  ASSERT_TRUE(db_->CompactFiles(compaction_options, handle, files, 1)
+                  .IsManualCompactionPaused());
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files);
+
+  // DisableManualCompaction() should successfully cancel compaction
+  disable_compaction = true;
+  compaction_options.canceled->store(false, std::memory_order_release);
+  ASSERT_TRUE(db_->CompactFiles(compaction_options, handle, files, 1)
+                  .IsManualCompactionPaused());
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files);
+  // unlike CompactRange, value of compaction_options.canceled will be
+  // unaffected by calling DisableManualCompactions()
+  ASSERT_FALSE(compaction_options.canceled->load());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  db_->EnableManualCompaction();
+
+  // Test cancelation after the check to cancel compaction - compaction should
+  // occur, leaving only 1 file
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactFilesImpl:0", [&](void* /*arg*/) {
+        compaction_options.canceled->store(true, std::memory_order_release);
+      });
+
+  compaction_options.canceled->store(false, std::memory_order_release);
+  ASSERT_OK(db_->CompactFiles(compaction_options, handle, files, 1));
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
 TEST_F(DBTest2, MultiDBParallelOpenTest) {
   const int kNumDbs = 2;
   Options options = CurrentOptions();
@@ -5432,7 +4830,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) {
 
   // Verify empty DBs can be created in parallel
   std::vector<std::thread> open_threads;
-  std::vector<DB*> dbs{static_cast<unsigned int>(kNumDbs), nullptr};
+  std::vector<std::unique_ptr<DB>> dbs(kNumDbs);
   options.create_if_missing = true;
   for (int i = 0; i < kNumDbs; ++i) {
     open_threads.emplace_back(
@@ -5447,7 +4845,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) {
   for (int i = 0; i < kNumDbs; ++i) {
     open_threads[i].join();
     ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
-    delete dbs[i];
+    dbs[i].reset();
   }
 
   // Verify non-empty DBs can be recovered in parallel
@@ -5463,7 +4861,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) {
   // Wait and cleanup
   for (int i = 0; i < kNumDbs; ++i) {
     open_threads[i].join();
-    delete dbs[i];
+    dbs[i].reset();
     ASSERT_OK(DestroyDB(dbnames[i], options));
   }
 }
@@ -5524,8 +4922,7 @@ TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
   ASSERT_NOK(db_->Close());
   db_->ReleaseSnapshot(ss);
   ASSERT_OK(db_->Close());
-  delete db_;
-  db_ = nullptr;
+  db_.reset();
 }
 
 TEST_F(DBTest2, PrefixBloomReseek) {
@@ -5807,6 +5204,7 @@ TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
   Options options = CurrentOptions();
   DestroyAndReopen(options);
   options.max_manifest_file_size = 10;
+  options.max_manifest_space_amp_pct = 0;
   options.create_if_missing = true;
   CreateAndReopenWithCF({"pikachu"}, options);
   ASSERT_EQ(2, handles_.size());
@@ -6498,6 +5896,7 @@ TEST_P(RenameCurrentTest, Flush) {
   Destroy(last_options_);
   Options options = GetDefaultOptions();
   options.max_manifest_file_size = 1;
+  options.max_manifest_space_amp_pct = 0;
   options.create_if_missing = true;
   Reopen(options);
   ASSERT_OK(Put("key", "value"));
@@ -6517,6 +5916,7 @@ TEST_P(RenameCurrentTest, Compaction) {
   Destroy(last_options_);
   Options options = GetDefaultOptions();
   options.max_manifest_file_size = 1;
+  options.max_manifest_space_amp_pct = 0;
   options.create_if_missing = true;
   Reopen(options);
   ASSERT_OK(Put("a", "a_value"));
@@ -6665,15 +6065,9 @@ TEST_F(DBTest2, VariousFileTemperatures) {
   };
 
   // We don't have enough non-unknown temps to confidently distinguish that
-  // a specific setting caused a specific outcome, in a single run. This is a
-  // reasonable work-around without blowing up test time. Only returns
-  // non-unknown temperatures.
-  auto RandomTemp = [] {
-    static std::vector<Temperature> temps = {
-        Temperature::kHot, Temperature::kWarm, Temperature::kCold};
-    return temps[Random::GetTLSInstance()->Uniform(
-        static_cast<int>(temps.size()))];
-  };
+  // a specific setting caused a specific outcome, in a single run. Using
+  // RandomKnownTemperature() is a reasonable work-around without blowing up
+  // test time.
 
   auto test_fs = std::make_shared<MyTestFS>(env_->GetFileSystem());
   std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
@@ -6689,22 +6083,22 @@ TEST_F(DBTest2, VariousFileTemperatures) {
       options.env = env.get();
       test_fs->Reset();
       if (use_optimize) {
-        test_fs->optimize_manifest_temperature = RandomTemp();
+        test_fs->optimize_manifest_temperature = RandomKnownTemperature();
         test_fs->expected_manifest_temperature =
             test_fs->optimize_manifest_temperature;
-        test_fs->optimize_wal_temperature = RandomTemp();
+        test_fs->optimize_wal_temperature = RandomKnownTemperature();
         test_fs->expected_wal_temperature = test_fs->optimize_wal_temperature;
       }
       if (use_temp_options) {
-        options.metadata_write_temperature = RandomTemp();
+        options.metadata_write_temperature = RandomKnownTemperature();
         test_fs->expected_manifest_temperature =
             options.metadata_write_temperature;
         test_fs->expected_other_metadata_temperature =
             options.metadata_write_temperature;
-        options.wal_write_temperature = RandomTemp();
+        options.wal_write_temperature = RandomKnownTemperature();
         test_fs->expected_wal_temperature = options.wal_write_temperature;
-        options.last_level_temperature = RandomTemp();
-        options.default_write_temperature = RandomTemp();
+        options.last_level_temperature = RandomKnownTemperature();
+        options.default_write_temperature = RandomKnownTemperature();
       }
 
       DestroyAndReopen(options);
@@ -7149,6 +6543,9 @@ TEST_F(DBTest2, LastLevelStatistics) {
 
     DestroyAndReopen(options);
 
+    get_iostats_context()->Reset();
+    IOStatsContext* iostats = get_iostats_context();
+
     // generate 1 sst on level 0
     ASSERT_OK(Put("foo1", "bar"));
     ASSERT_OK(Put("bar", "bar"));
@@ -7249,9 +6646,87 @@ TEST_F(DBTest2, LastLevelStatistics) {
     // Control
     ASSERT_NE(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
               options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT));
+
+    // Control: unknown temperature iostats should be zero since files have
+    // explicit temperatures (mapped or written)
+    EXPECT_EQ(
+        iostats->file_io_stats_by_temperature.unknown_non_last_level_bytes_read,
+        0);
+    EXPECT_EQ(
+        iostats->file_io_stats_by_temperature.unknown_non_last_level_read_count,
+        0);
+    EXPECT_EQ(
+        iostats->file_io_stats_by_temperature.unknown_last_level_bytes_read, 0);
+    EXPECT_EQ(
+        iostats->file_io_stats_by_temperature.unknown_last_level_read_count, 0);
   }
 }
 
+// Test the iostats for files with Temperature::kUnknown that is not mapped
+// to another temperature. These stats are used to indicate which non-tiered
+// workloads are most promising for tiering (so this test doesn't set
+// temperatures).
+TEST_F(DBTest2, UnknownLastLevelStatistics) {
+  Options options = CurrentOptions();
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  DestroyAndReopen(options);
+
+  get_iostats_context()->Reset();
+  IOStatsContext* iostats = get_iostats_context();
+
+  // Generate 1 sst file on level 0 with kUnknown temperature
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  // Read from the kUnknown file on non-last level
+  ASSERT_EQ("bar", Get("foo"));
+
+  // Verify unknown_non_last_level stats are populated
+  EXPECT_GT(
+      iostats->file_io_stats_by_temperature.unknown_non_last_level_bytes_read,
+      0);
+  EXPECT_GT(
+      iostats->file_io_stats_by_temperature.unknown_non_last_level_read_count,
+      0);
+  // No reads from last level yet
+  EXPECT_EQ(iostats->file_io_stats_by_temperature.unknown_last_level_bytes_read,
+            0);
+  EXPECT_EQ(iostats->file_io_stats_by_temperature.unknown_last_level_read_count,
+            0);
+
+  // Compact to the last level (level 6) explicitly using MoveFilesToLevel
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  MoveFilesToLevel(6);
+
+  // Reopen DB to ensure table cache is cleared and files are re-opened
+  // with correct is_last_level flag
+  Reopen(options);
+
+  // Reset iostats to measure only the following reads
+  get_iostats_context()->Reset();
+
+  // Read from the file now on last level (still kUnknown since
+  // last_level_temperature is not set)
+  ASSERT_EQ("bar", Get("foo"));
+
+  // Verify unknown_last_level stats are populated
+  EXPECT_GT(iostats->file_io_stats_by_temperature.unknown_last_level_bytes_read,
+            0);
+  EXPECT_GT(iostats->file_io_stats_by_temperature.unknown_last_level_read_count,
+            0);
+  // No new reads from non-last level
+  EXPECT_EQ(
+      iostats->file_io_stats_by_temperature.unknown_non_last_level_bytes_read,
+      0);
+  EXPECT_EQ(
+      iostats->file_io_stats_by_temperature.unknown_non_last_level_read_count,
+      0);
+}
+
 TEST_F(DBTest2, CheckpointFileTemperature) {
   class NoLinkTestFS : public FileTemperatureTestFS {
     using FileTemperatureTestFS::FileTemperatureTestFS;
@@ -7298,7 +6773,7 @@ TEST_F(DBTest2, CheckpointFileTemperature) {
 
   test_fs->PopRequestedSstFileTemperatures();
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(
       checkpoint->CreateCheckpoint(dbname_ + kFilePathSeparator + "tempcp"));
 
@@ -8060,7 +7535,7 @@ TEST_F(DBTest2, GetFileChecksumsFromCurrentManifest_CRC32) {
   opts.level0_file_num_compaction_trigger = 10;
 
   // Bootstrap the test database.
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath("file_chksum");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -8068,18 +7543,33 @@ TEST_F(DBTest2, GetFileChecksumsFromCurrentManifest_CRC32) {
   FlushOptions fopts;
   fopts.wait = true;
   Random rnd(test::RandomSeed());
+
+  // Write 4 files into the default column family.
   for (int i = 0; i < 4; i++) {
     ASSERT_OK(db->Put(wopts, Key(i), rnd.RandomString(100)));
     ASSERT_OK(db->Flush(fopts));
   }
 
+  // Create a new column family, write 1 file into it and drop it.
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db->CreateColumnFamily(ColumnFamilyOptions(), "soon_to_be_deleted", &cf));
+  ASSERT_OK(db->Put(wopts, cf, "some_key", "some_value"));
+  ASSERT_OK(db->Flush(fopts, cf));
+
+  // Drop column family should generate corresponding version edit
+  // in manifest, which we expect to be correctly interpreted by
+  // GetFileChecksumsFromCurrentManifest API after db close.
+  ASSERT_OK(db->DropColumnFamily(cf));
+  delete cf;
+  cf = nullptr;
+
   // Obtain rich files metadata for source of truth.
   std::vector<LiveFileMetaData> live_files;
   db->GetLiveFilesMetaData(&live_files);
 
   ASSERT_OK(db->Close());
-  delete db;
-  db = nullptr;
+  db.reset();
 
   // Process current MANIFEST file and build internal file checksum mappings.
   std::unique_ptr<FileChecksumList> checksum_list(NewFileChecksumList());
diff --git a/db/db_test_util.cc b/db/db_test_util.cc
index 3944e92a0dc0..d62807d265c4 100644
--- a/db/db_test_util.cc
+++ b/db/db_test_util.cc
@@ -11,6 +11,7 @@
 
 #include "cache/cache_reservation_manager.h"
 #include "db/forward_iterator.h"
+#include "env/fs_readonly.h"
 #include "env/mock_env.h"
 #include "port/lang.h"
 #include "rocksdb/cache.h"
@@ -70,9 +71,9 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync)
   if (getenv("MEM_ENV")) {
     mem_env_ = MockEnv::Create(base_env, base_env->GetSystemClock());
   }
-  if (getenv("ENCRYPTED_ENV")) {
+  if (auto ee = getenv("ENCRYPTED_ENV")) {
     std::shared_ptr<EncryptionProvider> provider;
-    std::string provider_id = getenv("ENCRYPTED_ENV");
+    std::string provider_id = ee;
     if (provider_id.find('=') == std::string::npos &&
         !EndsWith(provider_id, "://test")) {
       provider_id = provider_id + "://test";
@@ -96,7 +97,7 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync)
   EXPECT_OK(DestroyDB(dbname_, delete_options));
   // Destroy it for not alternative WAL dir is used.
   EXPECT_OK(DestroyDB(dbname_, options));
-  db_ = nullptr;
+  db_.reset();
   Reopen(options);
   Random::GetTLSInstance()->Reset(0xdeadbeef);
 }
@@ -365,11 +366,6 @@ Options DBTestBase::GetOptions(
     table_options.block_cache = NewLRUCache(/* too small */ 1);
   }
 
-  // Test anticipated new default as much as reasonably possible (and remove
-  // this code when obsolete)
-  assert(!table_options.decouple_partitioned_filters);
-  table_options.decouple_partitioned_filters = true;
-
   bool can_allow_mmap = IsMemoryMappedAccessSupported();
   switch (option_config) {
     case kHashSkipList:
@@ -458,7 +454,8 @@ Options DBTestBase::GetOptions(
       options.allow_mmap_reads = can_allow_mmap;
       break;
     case kManifestFileSize:
-      options.max_manifest_file_size = 50;  // 50 bytes
+      options.max_manifest_file_size = 50;     // 50 bytes
+      options.max_manifest_space_amp_pct = 0;  // old behavior
       break;
     case kPerfOptions:
       options.delayed_write_rate = 8 * 1024 * 1024;
@@ -523,7 +520,7 @@ Options DBTestBase::GetOptions(
     }
     case kBlockBasedTableWithLatestFormat: {
       // In case different from default
-      table_options.format_version = kLatestFormatVersion;
+      table_options.format_version = kLatestBbtFormatVersion;
       break;
     }
     case kOptimizeFiltersForHits: {
@@ -591,7 +588,6 @@ Options DBTestBase::GetOptions(
       options_override.level_compaction_dynamic_level_bytes;
   options.env = env_;
   options.create_if_missing = true;
-  options.fail_if_options_file_error = true;
   return options;
 }
 
@@ -668,7 +664,8 @@ Status DBTestBase::TryReopenWithColumnFamilies(
   DBOptions db_opts = DBOptions(options[0]);
   last_options_ = options[0];
   MaybeInstallTimeElapseOnlySleep(db_opts);
-  return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  Status s = DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  return s;
 }
 
 Status DBTestBase::TryReopenWithColumnFamilies(
@@ -687,8 +684,7 @@ void DBTestBase::Close() {
     EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
   }
   handles_.clear();
-  delete db_;
-  db_ = nullptr;
+  db_.reset();
 }
 
 void DBTestBase::DestroyAndReopen(const Options& options) {
@@ -713,7 +709,20 @@ void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) {
 Status DBTestBase::ReadOnlyReopen(const Options& options) {
   Close();
   MaybeInstallTimeElapseOnlySleep(options);
-  return DB::OpenForReadOnly(options, dbname_, &db_);
+  Status s = DB::OpenForReadOnly(options, dbname_, &db_);
+  return s;
+}
+
+Status DBTestBase::EnforcedReadOnlyReopen(const Options& options) {
+  Close();
+  Options options_copy = options;
+  MaybeInstallTimeElapseOnlySleep(options_copy);
+  auto fs_read_only =
+      std::make_shared<ReadOnlyFileSystem>(env_->GetFileSystem());
+  env_read_only_ = std::make_shared<CompositeEnvWrapper>(env_, fs_read_only);
+  options_copy.env = env_read_only_.get();
+  Status s = DB::OpenForReadOnly(options_copy, dbname_, &db_);
+  return s;
 }
 
 Status DBTestBase::TryReopen(const Options& options) {
@@ -728,7 +737,8 @@ Status DBTestBase::TryReopen(const Options& options) {
   // clears the block cache.
   last_options_ = options;
   MaybeInstallTimeElapseOnlySleep(options);
-  return DB::Open(options, dbname_, &db_);
+  Status s = DB::Open(options, dbname_, &db_);
+  return s;
 }
 
 bool DBTestBase::IsDirectIOSupported() {
@@ -1148,16 +1158,18 @@ size_t DBTestBase::CountLiveFiles() {
 }
 
 int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
-  std::string property;
-  if (cf == 0) {
-    // default cfd
-    EXPECT_TRUE(db_->GetProperty(
-        "rocksdb.num-files-at-level" + std::to_string(level), &property));
-  } else {
-    EXPECT_TRUE(db_->GetProperty(
-        handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level),
-        &property));
+  return NumTableFilesAtLevel(level,
+                              cf ? handles_[cf] : db_->DefaultColumnFamily());
+}
+
+int DBTestBase::NumTableFilesAtLevel(int level, ColumnFamilyHandle* cfh,
+                                     DB* db) {
+  if (!db) {
+    db = db_.get();
   }
+  std::string property;
+  EXPECT_TRUE(db->GetProperty(
+      cfh, "rocksdb.num-files-at-level" + std::to_string(level), &property));
   return atoi(property.c_str());
 }
 
@@ -1190,12 +1202,22 @@ int DBTestBase::TotalTableFiles(int cf, int levels) {
 
 // Return spread of files per level
 std::string DBTestBase::FilesPerLevel(int cf) {
-  int num_levels =
-      (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[cf]);
+  if (cf == 0) {
+    return FilesPerLevel(db_->DefaultColumnFamily());
+  } else {
+    return FilesPerLevel(handles_[cf]);
+  }
+}
+
+std::string DBTestBase::FilesPerLevel(ColumnFamilyHandle* cfh, DB* db) {
+  if (!db) {
+    db = db_.get();
+  }
+  int num_levels = db->NumberLevels(cfh);
   std::string result;
   size_t last_non_zero_offset = 0;
   for (int level = 0; level < num_levels; level++) {
-    int f = NumTableFilesAtLevel(level, cf);
+    int f = NumTableFilesAtLevel(level, cfh, db);
     char buf[100];
     snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
     result += buf;
@@ -1328,12 +1350,14 @@ void DBTestBase::FillLevels(const std::string& smallest,
 }
 
 void DBTestBase::MoveFilesToLevel(int level, int cf) {
+  MoveFilesToLevel(level, cf ? handles_[cf] : db_->DefaultColumnFamily());
+}
+
+void DBTestBase::MoveFilesToLevel(int level, ColumnFamilyHandle* column_family,
+                                  DB* db) {
+  DBImpl* db_impl = db ? static_cast<DBImpl*>(db) : dbfull();
   for (int l = 0; l < level; ++l) {
-    if (cf > 0) {
-      EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]));
-    } else {
-      EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr));
-    }
+    EXPECT_OK(db_impl->TEST_CompactRange(l, nullptr, nullptr, column_family));
   }
 }
 
@@ -1852,4 +1876,13 @@ template class TargetCacheChargeTrackingCache<
     CacheEntryRole::kBlockBasedTableReader>;
 template class TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>;
 
+const std::vector<Temperature> kKnownTemperatures = {
+    Temperature::kHot, Temperature::kWarm, Temperature::kCool,
+    Temperature::kCold, Temperature::kIce};
+
+Temperature RandomKnownTemperature() {
+  return kKnownTemperatures[Random::GetTLSInstance()->Uniform(
+      static_cast<int>(kKnownTemperatures.size()))];
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_test_util.h b/db/db_test_util.h
index 1ddb4faef169..44768f1d1c33 100644
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -452,6 +452,10 @@ class SpecialEnv : public EnvWrapper {
         return s;
       }
 
+      Status GetFileSize(uint64_t* s) override {
+        return target_->GetFileSize(s);
+      }
+
      private:
       std::unique_ptr<RandomAccessFile> target_;
       anon::AtomicCounter* counter_;
@@ -478,6 +482,10 @@ class SpecialEnv : public EnvWrapper {
         return target_->Prefetch(offset, n);
       }
 
+      Status GetFileSize(uint64_t* s) override {
+        return target_->GetFileSize(s);
+      }
+
      private:
       std::unique_ptr<RandomAccessFile> target_;
       std::atomic<uint64_t>* fail_cnt_;
@@ -1062,8 +1070,9 @@ class DBTestBase : public testing::Test {
   MockEnv* mem_env_;
   Env* encrypted_env_;
   SpecialEnv* env_;
+  std::shared_ptr<Env> env_read_only_;
   std::shared_ptr<Env> env_guard_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
   std::vector<ColumnFamilyHandle*> handles_;
 
   int option_config_;
@@ -1148,7 +1157,7 @@ class DBTestBase : public testing::Test {
                      const anon::OptionsOverride& options_override =
                          anon::OptionsOverride()) const;
 
-  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   void CreateColumnFamilies(const std::vector<std::string>& cfs,
                             const Options& options);
@@ -1178,6 +1187,9 @@ class DBTestBase : public testing::Test {
 
   Status ReadOnlyReopen(const Options& options);
 
+  // With a filesystem wrapper that fails on attempted write
+  Status EnforcedReadOnlyReopen(const Options& options);
+
   Status TryReopen(const Options& options);
 
   bool IsDirectIOSupported();
@@ -1268,6 +1280,9 @@ class DBTestBase : public testing::Test {
 
   int NumTableFilesAtLevel(int level, int cf = 0);
 
+  int NumTableFilesAtLevel(int level, ColumnFamilyHandle* column_family,
+                           DB* db = nullptr);
+
   double CompressionRatioAtLevel(int level, int cf = 0);
 
   int TotalTableFiles(int cf = 0, int levels = -1);
@@ -1277,6 +1292,8 @@ class DBTestBase : public testing::Test {
   // Return spread of files per level
   std::string FilesPerLevel(int cf = 0);
 
+  std::string FilesPerLevel(ColumnFamilyHandle* cfh, DB* db = nullptr);
+
   size_t CountFiles();
 
   Status CountFiles(size_t* count);
@@ -1308,6 +1325,9 @@ class DBTestBase : public testing::Test {
 
   void MoveFilesToLevel(int level, int cf = 0);
 
+  void MoveFilesToLevel(int level, ColumnFamilyHandle* column_family,
+                        DB* db = nullptr);
+
   void DumpFileCounts(const char* label);
 
   std::string DumpSSTableList();
@@ -1418,20 +1438,23 @@ class DBTestBase : public testing::Test {
     std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
     std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
     ResetTableProperties(tp);
-    sscanf(tp_string.c_str(),
-           "# data blocks %" SCNu64 " # entries %" SCNu64
-           " # deletions %" SCNu64 " # merge operands %" SCNu64
-           " # range deletions %" SCNu64 " raw key size %" SCNu64
-           " raw average key size %lf "
-           " raw value size %" SCNu64
-           " raw average value size %lf "
-           " data block size %" SCNu64 " index block size (user-key? %" SCNu64
-           ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64,
-           &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions,
-           &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size,
-           &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
-           &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded,
-           &tp->index_size, &tp->filter_size);
+    int count = sscanf(
+        tp_string.c_str(),
+        "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64
+        " # merge operands %" SCNu64 " # range deletions %" SCNu64
+        " raw key size %" SCNu64
+        " raw average key size %lf "
+        " raw value size %" SCNu64
+        " raw average value size %lf "
+        " data block size %" SCNu64 " data uncompressed size %" SCNu64
+        " index block size (user-key? %" SCNu64 ", delta-value? %" SCNu64
+        ") %" SCNu64 " filter block size %" SCNu64,
+        &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions,
+        &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size,
+        &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
+        &tp->uncompressed_data_size, &tp->index_key_is_user_key,
+        &tp->index_value_is_delta_encoded, &tp->index_size, &tp->filter_size);
+    ASSERT_EQ(count, 15);
   }
 
  private:  // Prone to error on direct use
@@ -1444,4 +1467,8 @@ class DBTestBase : public testing::Test {
 // unique ids.
 void VerifySstUniqueIds(const TablePropertiesCollection& props);
 
+// Excludes kUnknown
+extern const std::vector<Temperature> kKnownTemperatures;
+Temperature RandomKnownTemperature();
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc
index 5a540e4d3321..465f5d0c9632 100644
--- a/db/db_universal_compaction_test.cc
+++ b/db/db_universal_compaction_test.cc
@@ -1672,55 +1672,75 @@ TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) {
   }
   const int kNumFilesTrigger = 3;
   Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
-  Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.max_background_compactions = 2;
-  options.num_levels = num_levels_;
-  options.write_buffer_size = 100 << 10;     // 100KB
-  options.target_file_size_base = 32 << 10;  // 32KB
-  options.level0_file_num_compaction_trigger = kNumFilesTrigger;
-  // Trigger compaction if size amplification exceeds 110%
-  options.compaction_options_universal.max_size_amplification_percent = 110;
-  DestroyAndReopen(options);
-
-  // Need to get a token to enable compaction parallelism up to
-  // `max_background_compactions` jobs.
-  auto pressure_token =
-      dbfull()->TEST_write_controler().GetCompactionPressureToken();
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
-      {// wait for the full compaction to be picked before adding files intended
-       // for the second one.
-       {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
-        "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"},
-       // the full (bottom-pri) compaction waits until a partial (low-pri)
-       // compaction has started to verify they can run in parallel.
-       {"DBImpl::BackgroundCompaction:NonTrivial",
-        "DBImpl::BGWorkBottomCompaction"}});
-  SyncPoint::GetInstance()->EnableProcessing();
 
-  Random rnd(301);
-  for (int i = 0; i < 2; ++i) {
-    for (int num = 0; num < kNumFilesTrigger; num++) {
-      int key_idx = 0;
-      GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
-      // use no_wait above because that one waits for flush and compaction. We
-      // don't want to wait for compaction because the full compaction is
-      // intentionally blocked while more files are flushed.
-      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  for (bool universal_reduce_file_locking : {true, false}) {
+    Options options = CurrentOptions();
+    options.compaction_style = kCompactionStyleUniversal;
+    options.compaction_options_universal.reduce_file_locking =
+        universal_reduce_file_locking;
+    options.max_background_compactions = 2;
+    options.num_levels = num_levels_;
+    options.write_buffer_size = 100 << 10;     // 100KB
+    options.target_file_size_base = 32 << 10;  // 32KB
+    options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+    // Trigger compaction if size amplification exceeds 110%
+    options.compaction_options_universal.max_size_amplification_percent = 110;
+    DestroyAndReopen(options);
+
+    // Need to get a token to enable compaction parallelism up to
+    // `max_background_compactions` jobs.
+    auto pressure_token =
+        dbfull()->TEST_write_controler().GetCompactionPressureToken();
+    if (universal_reduce_file_locking) {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {// Wait for the full compaction to be repicked before adding files
+           // intended for the second compaction.
+           {"DBImpl::BackgroundCompaction():AfterPickCompactionBottomPri",
+            "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"},
+           // Wait for the second compaction to run before running the full
+           // compaction to verify they can run in parallel
+           {"DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
+            "DBImpl::BackgroundCompaction:NonTrivial:BeforeRunBottomPri"}});
+    } else {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {// Wait for the full compaction to be forwarded before adding files
+           // intended for the second compaction.
+           {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+            "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"},
+           // Wait for the second compaction to run before running the full
+           // compaction to verify they can run in parallel
+           {"DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
+            "DBImpl::BackgroundCompaction:NonTrivial:BeforeRunBottomPri"}});
     }
-    if (i == 0) {
-      TEST_SYNC_POINT(
-          "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0");
+
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    Random rnd(301);
+    for (int i = 0; i < 2; ++i) {
+      for (int num = 0; num < kNumFilesTrigger; num++) {
+        int key_idx = 0;
+        GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
+        // use no_wait above because that one waits for flush and compaction. We
+        // don't want to wait for compaction because the full compaction is
+        // intentionally blocked while more files are flushed.
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      }
+      if (i == 0) {
+        TEST_SYNC_POINT(
+            "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0");
+      }
     }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    // First compaction should output to bottom level. Second should output to
+    // L0 since older L0 files pending compaction prevent it from being placed
+    // lower.
+    ASSERT_EQ(NumSortedRuns(), 2);
+    ASSERT_GT(NumTableFilesAtLevel(0), 0);
+    ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
   }
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-  // First compaction should output to bottom level. Second should output to L0
-  // since older L0 files pending compaction prevent it from being placed lower.
-  ASSERT_EQ(NumSortedRuns(), 2);
-  ASSERT_GT(NumTableFilesAtLevel(0), 0);
-  ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0);
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
   Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
 }
 
@@ -2086,46 +2106,79 @@ TEST_F(DBTestUniversalCompaction2, OverlappingL0) {
 }
 
 TEST_F(DBTestUniversalCompaction2, IngestBehind) {
-  const int kNumKeys = 3000;
-  const int kWindowSize = 100;
-  const int kNumDelsTrigger = 90;
-
-  Options opts = CurrentOptions();
-  opts.table_properties_collector_factories.emplace_back(
-      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
-  opts.compaction_style = kCompactionStyleUniversal;
-  opts.level0_file_num_compaction_trigger = 2;
-  opts.compression = kNoCompression;
-  opts.allow_ingest_behind = true;
-  opts.compaction_options_universal.size_ratio = 10;
-  opts.compaction_options_universal.min_merge_width = 2;
-  opts.compaction_options_universal.max_size_amplification_percent = 200;
-  Reopen(opts);
-
-  // add an L1 file to prevent tombstones from dropping due to obsolescence
-  // during flush
-  int i;
-  for (i = 0; i < 2000; ++i) {
-    ASSERT_OK(Put(Key(i), "val"));
-  }
-  ASSERT_OK(Flush());
-  //  MoveFilesToLevel(6);
-  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-
-  for (i = 1999; i < kNumKeys; ++i) {
-    if (i >= kNumKeys - kWindowSize &&
-        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
-      ASSERT_OK(Delete(Key(i)));
+  for (bool cf_option : {false, true}) {
+    SCOPED_TRACE("cf_option = " + std::to_string(cf_option));
+    const int kNumKeys = 3000;
+    const int kWindowSize = 100;
+    const int kNumDelsTrigger = 90;
+
+    Options opts = CurrentOptions();
+    opts.table_properties_collector_factories.emplace_back(
+        NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+    opts.compaction_style = kCompactionStyleUniversal;
+    opts.level0_file_num_compaction_trigger = 2;
+    opts.compression = kNoCompression;
+    if (cf_option) {
+      opts.cf_allow_ingest_behind = true;
     } else {
+      opts.allow_ingest_behind = true;
+    }
+    opts.compaction_options_universal.size_ratio = 10;
+    opts.compaction_options_universal.min_merge_width = 2;
+    opts.compaction_options_universal.max_size_amplification_percent = 200;
+    Reopen(opts);
+
+    // add an L1 file to prevent tombstones from dropping due to obsolescence
+    // during flush
+    int i;
+    for (i = 0; i < 2000; ++i) {
       ASSERT_OK(Put(Key(i), "val"));
     }
-  }
-  ASSERT_OK(Flush());
+    ASSERT_OK(Flush());
+    //  MoveFilesToLevel(6);
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    for (i = 1999; i < kNumKeys; ++i) {
+      if (i >= kNumKeys - kWindowSize &&
+          i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+        ASSERT_OK(Delete(Key(i)));
+      } else {
+        ASSERT_OK(Put(Key(i), "val"));
+      }
+    }
+    ASSERT_OK(Flush());
 
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  ASSERT_EQ(0, NumTableFilesAtLevel(0));
-  ASSERT_EQ(0, NumTableFilesAtLevel(6));
-  ASSERT_GT(NumTableFilesAtLevel(5), 0);
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_EQ(0, NumTableFilesAtLevel(6));
+    ASSERT_GT(NumTableFilesAtLevel(5), 0);
+
+    if (cf_option) {
+      // Test that another CF does not allow ingest behind
+      ColumnFamilyHandle* new_cfh;
+      Options new_cf_option;
+      new_cf_option.compaction_style = kCompactionStyleUniversal;
+      new_cf_option.num_levels = 7;
+      // CreateColumnFamilies({"new_cf"}, new_cf_option);
+      ASSERT_OK(db_->CreateColumnFamily(new_cf_option, "new_cf", &new_cfh));
+      // handles_.push_back(new_cfh);
+      for (i = 0; i < 10; ++i) {
+        // ASSERT_OK(Put(1, Key(i), "val"));
+        ASSERT_OK(db_->Put(WriteOptions(), new_cfh, Key(i), "val"));
+      }
+      ASSERT_OK(
+          db_->CompactRange(CompactRangeOptions(), new_cfh, nullptr, nullptr));
+      // This CF can use the last leve.
+      std::string property;
+      EXPECT_TRUE(db_->GetProperty(
+          new_cfh, "rocksdb.num-files-at-level" + std::to_string(6),
+          &property));
+      ASSERT_EQ(1, atoi(property.c_str()));
+
+      ASSERT_OK(db_->DropColumnFamily(new_cfh));
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(new_cfh));
+    }
+  }
 }
 
 TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) {
diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc
index f89cfe59463b..1e9270db0dee 100644
--- a/db/db_wal_test.cc
+++ b/db/db_wal_test.cc
@@ -395,13 +395,13 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndNoFlush) {
     read_opts.timestamp = &ts_slice;
     ASSERT_OK(CreateAndReopenWithTs({"pikachu"}, ts_options, persist_udt,
                                     avoid_flush_during_recovery));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U);
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U);
     ASSERT_OK(Put(1, "foo", ts1, "v1"));
     ASSERT_OK(Put(1, "baz", ts1, "v5"));
 
     ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt,
                                          avoid_flush_during_recovery));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U);
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U);
     // Do a timestamped read with ts1 after second reopen.
     CheckGet(read_opts, 1, "foo", "v1", ts1);
     CheckGet(read_opts, 1, "baz", "v5", ts1);
@@ -415,7 +415,7 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndNoFlush) {
 
     ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt,
                                          avoid_flush_during_recovery));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U);
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U);
     std::string ts3;
     PutFixed64(&ts3, 3);
     ASSERT_OK(Put(1, "foo", ts3, "v4"));
@@ -466,14 +466,14 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndFlush) {
 
   ASSERT_OK(CreateAndReopenWithTs({"pikachu"}, ts_options, persist_udt));
   // No flush, no sst files, because of no data.
-  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U);
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U);
   ASSERT_OK(Put(1, largest_ukey_without_ts, write_ts, "v1"));
   ASSERT_OK(Put(1, smallest_ukey_without_ts, write_ts, "v5"));
 
   ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt));
   // Memtable recovered from WAL flushed because `avoid_flush_during_recovery`
   // defaults to false, created one L0 file.
-  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 1U);
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 1U);
 
   std::vector<std::vector<FileMetaData>> level_to_files;
   dbfull()->TEST_GetFilesMetaData(handles_[1], &level_to_files);
@@ -1347,7 +1347,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
     auto tables = ListTableFiles(env_, dbname_);
     ASSERT_EQ(tables.size(), static_cast<size_t>(1));
     // Make sure 'dobrynia' was flushed: check sst files amount
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(1));
   }
   // New WAL file
@@ -1363,16 +1363,16 @@ TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
                            options);
   {
     // No inserts => default is empty
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(0));
     // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(5));
     // 1 SST for big key + 1 SST for small one
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(2));
     // 1 SST for all keys
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(1));
   }
 }
@@ -1401,7 +1401,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) {
   {
     auto tables = ListTableFiles(env_, dbname_);
     ASSERT_EQ(tables.size(), static_cast<size_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(1));
   }
   // Memtable for 'nikitich' has flushed, new WAL file has opened
@@ -1425,7 +1425,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) {
   {
     auto tables = ListTableFiles(env_, dbname_);
     ASSERT_EQ(tables.size(), static_cast<size_t>(2));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(2));
   }
 
@@ -1437,13 +1437,13 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) {
     // first, second and third WALs  went to the same SST.
     // So, there is 6 SSTs: three  for 'nikitich', one for 'default', one for
     // 'dobrynia', one for 'pikachu'
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(3));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(1));
   }
 }
@@ -1521,9 +1521,9 @@ TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) {
   // from an old incarnation of the WAL on recovery
   ASSERT_OK(db_->PauseBackgroundWork());
   ASSERT_OK(Put("ignore1", Random::GetTLSInstance()->RandomString(500)));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   ASSERT_OK(Put("ignore2", Random::GetTLSInstance()->RandomString(500)));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   ASSERT_OK(db_->ContinueBackgroundWork());
   ASSERT_OK(Flush());
   ASSERT_OK(Put("ignore3", Random::GetTLSInstance()->RandomString(500)));
@@ -1545,13 +1545,13 @@ TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) {
   // gap in sequence numbers to interfere with recovery
   ASSERT_OK(db_->PauseBackgroundWork());
   ASSERT_OK(Put("key1", "val1"));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   ASSERT_OK(Put("key2", "val2"));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   // Need a gap in sequence numbers, so e.g. ingest external file
   // with an open snapshot
   {
-    ManagedSnapshot snapshot(db_);
+    ManagedSnapshot snapshot(db_.get());
     ASSERT_OK(
         db_->IngestExternalFile({external_file1}, IngestExternalFileOptions()));
   }
@@ -1560,7 +1560,7 @@ TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) {
   // Need an SST file that is logically after that WAL, so that dropping WAL
   // data is not a valid point in time.
   {
-    ManagedSnapshot snapshot(db_);
+    ManagedSnapshot snapshot(db_.get());
     ASSERT_OK(
         db_->IngestExternalFile({external_file2}, IngestExternalFileOptions()));
   }
@@ -1613,7 +1613,7 @@ TEST_F(DBWALTest, SyncWalPartialFailure) {
       return s;
     }
 
-    AcqRelAtomic<uint32_t> syncs_before_failure_{UINT32_MAX};
+    Atomic<uint32_t> syncs_before_failure_{UINT32_MAX};
 
    protected:
     class MyTestWritableFile : public FSWritableFileOwnerWrapper {
@@ -1655,10 +1655,10 @@ TEST_F(DBWALTest, SyncWalPartialFailure) {
   // with a single thread, to exercise as much logic as we reasonably can.
   ASSERT_OK(db_->PauseBackgroundWork());
   ASSERT_OK(Put("key1", "val1"));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   ASSERT_OK(db_->SyncWAL());
   ASSERT_OK(Put("key2", "val2"));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   ASSERT_OK(Put("key3", "val3"));
 
   // Allow 1 of the WALs to sync, but another won't
@@ -1746,8 +1746,8 @@ class RecoveryTestHelper {
     WriteController write_controller;
 
     versions.reset(new VersionSet(
-        test->dbname_, &db_options, file_options, table_cache.get(),
-        &write_buffer_manager, &write_controller,
+        test->dbname_, &db_options, MutableDBOptions{options}, file_options,
+        table_cache.get(), &write_buffer_manager, &write_controller,
         /*block_cache_tracer=*/nullptr,
         /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"",
         options.daily_offpeak_time_utc,
@@ -1879,9 +1879,11 @@ TEST_F(DBWALTest, TrackAndVerifyWALsRecycleWAL) {
   // Drop `Put("key1", "old_value")` in the first WAL
   ASSERT_OK(test::TruncateFile(options.env, log_name, 0 /* new_length */));
 
-  Status s = DB::Open(options, dbname_, &db_);
+  {
+    Status s = DB::Open(options, dbname_, &db_);
 
-  ASSERT_OK(s);
+    ASSERT_OK(s);
+  }
 
   ASSERT_EQ("wal_to_recycle", Get("key_ignore2"));
   ASSERT_EQ("NOT_FOUND", Get("key1"));
@@ -1979,7 +1981,10 @@ TEST_P(DBWALTrackAndVerifyWALsWithParamsTest, Basic) {
       ASSERT_OK(options.env->DeleteFile(second_log_name));
     }
 
-    Status s = DB::Open(options, dbname_, &db_);
+    Status s;
+    {
+      s = DB::Open(options, dbname_, &db_);
+    }
 
     if (i == 0) {
       ASSERT_OK(s);
@@ -2266,17 +2271,17 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
   SyncPoint::GetInstance()->DisableProcessing();
   SyncPoint::GetInstance()->ClearAllCallBacks();
 
-  DB* db1 = nullptr;
+  std::unique_ptr<DB> db1;
   Status s = DB::OpenForReadOnly(options, dbname_, &db1);
   ASSERT_OK(s);
   assert(db1);
-  delete db1;
 }
 
 TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) {
   Options options = CurrentOptions();
   // Small size to force manifest creation
   options.max_manifest_file_size = 1;
+  options.max_manifest_space_amp_pct = 0;
   options.track_and_verify_wals_in_manifest = true;
   DestroyAndReopen(options);
 
@@ -3024,13 +3029,13 @@ TEST_F(DBWALTest, GetCompressedWalsAfterSync) {
   options.wal_compression = kZSTD;
   DestroyAndReopen(options);
 
-  // Write something to memtable and WAL so that log_empty_ will be false after
+  // Write something to memtable and WAL so that wal_empty_ will be false after
   // next DB::Open().
   ASSERT_OK(Put("a", "v"));
 
   Reopen(options);
 
-  // New WAL is created, thanks to !log_empty_.
+  // New WAL is created, thanks to !wal_empty_.
   ASSERT_OK(dbfull()->TEST_SwitchWAL());
 
   ASSERT_OK(Put("b", "v"));
diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc
index af328707aac7..d4728e9811af 100644
--- a/db/db_with_timestamp_basic_test.cc
+++ b/db/db_with_timestamp_basic_test.cc
@@ -19,6 +19,13 @@
 #include "utilities/merge_operators/string_append/stringappend2.h"
 
 namespace ROCKSDB_NAMESPACE {
+namespace {
+std::string EncodeAsUint64(uint64_t v) {
+  std::string dst;
+  PutFixed64(&dst, v);
+  return dst;
+}
+}  // namespace
 class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase {
  public:
   DBBasicTestWithTimestamp()
@@ -655,7 +662,7 @@ TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) {
   ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(4, 0), "v2"));
   ASSERT_OK(db_->Delete(WriteOptions(), "k1", Timestamp(5, 0)));
   ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(6, 0), "v3"));
-  check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v3",
+  check_value_by_ts(db_.get(), "k1", Timestamp(7, 0), Status::OK(), "v3",
                     Timestamp(6, 0));
   ASSERT_OK(Flush());
   Close();
@@ -668,27 +675,27 @@ TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) {
   // Trim data whose version > Timestamp(5, 0), read(k1, ts(7)) <- NOT_FOUND.
   ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
                                    &handles_, &db_, Timestamp(5, 0)));
-  check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::NotFound(), "",
+  check_value_by_ts(db_.get(), "k1", Timestamp(7, 0), Status::NotFound(), "",
                     Timestamp(5, 0));
   Close();
 
   // Trim data whose timestamp > Timestamp(4, 0), read(k1, ts(7)) <- v2
   ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
                                    &handles_, &db_, Timestamp(4, 0)));
-  check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v2",
+  check_value_by_ts(db_.get(), "k1", Timestamp(7, 0), Status::OK(), "v2",
                     Timestamp(4, 0));
   Close();
 
   Reopen(options);
   ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "k1",
                              "k3", Timestamp(7, 0)));
-  check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::NotFound(), "",
+  check_value_by_ts(db_.get(), "k1", Timestamp(8, 0), Status::NotFound(), "",
                     Timestamp(7, 0));
   Close();
   // Trim data whose timestamp > Timestamp(6, 0), read(k1, ts(8)) <- v2
   ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
                                    &handles_, &db_, Timestamp(6, 0)));
-  check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::OK(), "v2",
+  check_value_by_ts(db_.get(), "k1", Timestamp(8, 0), Status::OK(), "v2",
                     Timestamp(4, 0));
   Close();
 }
@@ -1420,8 +1427,12 @@ TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) {
   {
     std::string ts_str = Timestamp(static_cast<uint64_t>(kNumKeys + 1), 0);
     WriteBatch batch(0, 0, 0, kTimestampSize);
-    { ASSERT_OK(batch.Put("a", "new_value")); }
-    { ASSERT_OK(batch.Put("b", "new_value")); }
+    {
+      ASSERT_OK(batch.Put("a", "new_value"));
+    }
+    {
+      ASSERT_OK(batch.Put("b", "new_value"));
+    }
     s = batch.UpdateTimestamps(
         ts_str, [kTimestampSize](uint32_t) { return kTimestampSize; });
     ASSERT_OK(s);
@@ -1480,13 +1491,24 @@ TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) {
   Close();
 }
 
-TEST_F(DBBasicTestWithTimestamp,
-       FIXME_ReverseIterationWithBlobAndUnpreparedValue) {
+class ReverseIterationWithUnpreparedBlobTest
+    : public DBBasicTestWithTimestampBase,
+      public testing::WithParamInterface<std::tuple<bool, uint64_t>> {
+ public:
+  ReverseIterationWithUnpreparedBlobTest()
+      : DBBasicTestWithTimestampBase(
+            "db_basic_test_with_timestamp_reverse_with_unprepare") {}
+};
+INSTANTIATE_TEST_CASE_P(ReverseIterationWithUnpreparedBlobTest,
+                        ReverseIterationWithUnpreparedBlobTest,
+                        ::testing::Combine(::testing::Values(true, false),
+                                           ::testing::Values(0, 2)));
+TEST_P(ReverseIterationWithUnpreparedBlobTest, Basic) {
   Options options = CurrentOptions();
   options.create_if_missing = true;
   options.env = env_;
   options.enable_blob_files = true;
-  options.max_sequential_skip_in_iterations = 0;
+  options.max_sequential_skip_in_iterations = std::get<1>(GetParam());
 
   const size_t kTimestampSize = Timestamp(0, 0).size();
   TestComparator test_cmp(kTimestampSize);
@@ -1501,7 +1523,7 @@ TEST_F(DBBasicTestWithTimestamp,
   for (uint64_t key = 0; key <= kMaxKey; ++key) {
     for (size_t i = 0; i < write_timestamps.size(); ++i) {
       ASSERT_OK(db_->Put(WriteOptions(), Key1(key), write_timestamps[i],
-                         "value" + std::to_string(i)));
+                         Key1(key) + "value" + std::to_string(i)));
     }
   }
 
@@ -1513,17 +1535,28 @@ TEST_F(DBBasicTestWithTimestamp,
 
     ReadOptions read_opts;
     read_opts.timestamp = &read_timestamp;
-    read_opts.allow_unprepared_value = true;
+    read_opts.allow_unprepared_value = std::get<0>(GetParam());
 
     std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
 
     it->SeekForPrev(Key1(kMaxKey));
-    ASSERT_TRUE(it->Valid());
-    ASSERT_OK(it->status());
+    uint64_t key = kMaxKey;
+    int count = 0;
+    while (it->Valid()) {
+      ASSERT_OK(it->status());
 
-    // FIXME: PrepareValue() should succeed and status() should remain OK
-    ASSERT_FALSE(it->PrepareValue());
-    ASSERT_TRUE(it->status().IsCorruption());
+      ASSERT_TRUE(it->PrepareValue());
+      ASSERT_TRUE(it->Valid());
+      ASSERT_OK(it->status());
+      ASSERT_EQ(it->key(), Key1(key));
+      ASSERT_EQ(it->timestamp(), Timestamp(3, 0));
+      ASSERT_EQ(it->value(), Key1(key) + "value" + std::to_string(1));
+      key--;
+      count++;
+      it->Prev();
+    }
+    ASSERT_OK(it->status());
+    ASSERT_EQ(kMaxKey + 1, count);
   }
 
   Close();
@@ -2371,7 +2404,6 @@ class DataVisibilityTest : public DBBasicTestWithTimestampBase {
     }
   }
 };
-constexpr int DataVisibilityTest::kTestDataSize;
 
 // Application specifies timestamp but not snapshot.
 //           reader              writer
@@ -3746,17 +3778,42 @@ INSTANTIATE_TEST_CASE_P(
         test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp,
         test::UserDefinedTimestampTestMode::kNormal));
 
-TEST_F(DBBasicTestWithTimestamp, EnableDisableUDT) {
+// Test params:
+// 1) whether to flush before close
+class EnableDisableUDTTest : public DBBasicTestWithTimestampBase,
+                             public testing::WithParamInterface<bool> {
+ public:
+  EnableDisableUDTTest()
+      : DBBasicTestWithTimestampBase("/enable_disable_udt") {}
+};
+
+INSTANTIATE_TEST_CASE_P(EnableDisableUDTTest, EnableDisableUDTTest,
+                        ::testing::Values(true, false));
+
+TEST_P(EnableDisableUDTTest, Basic) {
   Options options = CurrentOptions();
+  // Un-flushed data before close will involve a WAL replay on DB reopen.
+  bool flush_before_close = GetParam();
   options.env = env_;
-  // Create a column family without user-defined timestamps.
   options.comparator = BytewiseComparator();
   options.persist_user_defined_timestamps = true;
   DestroyAndReopen(options);
 
+  ReadOptions ropts;
+  std::string read_ts;
+  std::string value;
+  std::string key_ts;
+
   // Create one SST file, its user keys have no user-defined timestamps.
-  ASSERT_OK(db_->Put(WriteOptions(), "foo", "val1"));
-  ASSERT_OK(Flush(0));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "val0"));
+  ASSERT_OK(db_->Put(WriteOptions(), "bar", "val0"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), "bar", "baz"));
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &value));
+  ASSERT_EQ("val0", value);
+  ASSERT_TRUE(db_->Get(ReadOptions(), "bar", &value).IsNotFound());
+  if (flush_before_close) {
+    ASSERT_OK(Flush(0));
+  }
   Close();
 
   // Reopen the existing column family and enable user-defined timestamps
@@ -3765,47 +3822,63 @@ TEST_F(DBBasicTestWithTimestamp, EnableDisableUDT) {
   options.persist_user_defined_timestamps = false;
   options.allow_concurrent_memtable_write = false;
   Reopen(options);
-
-  std::string value;
-  ASSERT_TRUE(db_->Get(ReadOptions(), "foo", &value).IsInvalidArgument());
-  std::string read_ts;
-  PutFixed64(&read_ts, 0);
-  ReadOptions ropts;
+  // Read data from previous session before and after compaction.
+  read_ts = EncodeAsUint64(1);
   Slice read_ts_slice = read_ts;
   ropts.timestamp = &read_ts_slice;
-  std::string key_ts;
-  // Entries in pre-existing SST files are treated as if they have minimum
-  // user-defined timestamps.
-  ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts));
-  ASSERT_EQ("val1", value);
-  ASSERT_EQ(read_ts, key_ts);
+  for (int i = 0; i < 2; i++) {
+    ASSERT_TRUE(db_->Get(ReadOptions(), "foo", &value).IsInvalidArgument());
+    // Entries in pre-existing SST files are treated as if they have minimum
+    // user-defined timestamps.
+    ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts));
+    ASSERT_EQ("val0", value);
+    ASSERT_EQ(EncodeAsUint64(0), key_ts);
+    ASSERT_TRUE(db_->Get(ropts, "bar", &value, &key_ts).IsNotFound());
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  }
 
   // Do timestamped read / write.
-  std::string write_ts;
-  PutFixed64(&write_ts, 1);
-  ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val2"));
-  read_ts.clear();
-  PutFixed64(&read_ts, 1);
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", EncodeAsUint64(1), "val1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "bar", EncodeAsUint64(1), "val1"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), "bar", "baz", EncodeAsUint64(2)));
   ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts));
-  ASSERT_EQ("val2", value);
-  ASSERT_EQ(write_ts, key_ts);
+  ASSERT_EQ("val1", value);
+  ASSERT_EQ(EncodeAsUint64(1), key_ts);
+  ASSERT_OK(db_->Get(ropts, "bar", &value, &key_ts));
+  ASSERT_EQ("val1", value);
+  ASSERT_EQ(EncodeAsUint64(1), key_ts);
+  read_ts = EncodeAsUint64(2);
+  ASSERT_TRUE(db_->Get(ropts, "bar", &value, &key_ts).IsNotFound());
   // The user keys in this SST file don't have user-defined timestamps either,
   // because `persist_user_defined_timestamps` flag is set to false.
-  ASSERT_OK(Flush(0));
+  if (flush_before_close) {
+    ASSERT_OK(Flush(0));
+  }
   Close();
 
   // Reopen the existing column family while disabling user-defined timestamps.
   options.comparator = BytewiseComparator();
   Reopen(options);
 
-  ASSERT_TRUE(db_->Get(ropts, "foo", &value).IsInvalidArgument());
-  ASSERT_OK(db_->Get(ReadOptions(), "foo", &value));
-  ASSERT_EQ("val2", value);
+  // Read data from previous session before and after compaction.
+  for (int i = 0; i < 2; i++) {
+    ASSERT_TRUE(db_->Get(ropts, "foo", &value).IsInvalidArgument());
+    ASSERT_OK(db_->Get(ReadOptions(), "foo", &value));
+    ASSERT_EQ("val1", value);
+    ASSERT_TRUE(db_->Get(ReadOptions(), "bar", &value).IsNotFound());
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  }
 
   // Continue to write / read the column family without user-defined timestamps.
-  ASSERT_OK(db_->Put(WriteOptions(), "foo", "val3"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "val2"));
+  ASSERT_OK(db_->Put(WriteOptions(), "bar", "val2"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), "bar", "baz"));
   ASSERT_OK(db_->Get(ReadOptions(), "foo", &value));
-  ASSERT_EQ("val3", value);
+  ASSERT_EQ("val2", value);
+  ASSERT_TRUE(db_->Get(ReadOptions(), "bar", &value).IsNotFound());
+  if (flush_before_close) {
+    ASSERT_OK(Flush(0));
+  }
   Close();
 }
 
@@ -4844,6 +4917,117 @@ TEST_F(DBBasicTestWithTimestamp, TimestampFilterTableReadOnGet) {
   Close();
 }
 
+class GetNewestUserDefinedTimestampTest : public DBBasicTestWithTimestampBase {
+ public:
+  explicit GetNewestUserDefinedTimestampTest()
+      : DBBasicTestWithTimestampBase("get_newest_udt_test") {}
+};
+
+TEST_F(GetNewestUserDefinedTimestampTest, Basic) {
+  std::string newest_timestamp;
+  // UDT disabled, get InvalidArgument.
+  ASSERT_TRUE(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)
+                  .IsInvalidArgument());
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_write_buffer_number = 5;
+  options.min_write_buffer_number_to_merge = 4;
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+
+  DestroyAndReopen(options);
+  // UDT persisted, get NotSupported.
+  ASSERT_TRUE(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)
+                  .IsNotSupported());
+
+  options.persist_user_defined_timestamps = false;
+  options.allow_concurrent_memtable_write = false;
+
+  DestroyAndReopen(options);
+  ASSERT_TRUE(
+      db_->GetNewestUserDefinedTimestamp(nullptr, nullptr).IsInvalidArgument());
+
+  ColumnFamilyHandleImpl* cfh = static_cast_with_check<ColumnFamilyHandleImpl>(
+      db_->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+  // The column family hasn't seen any user defined timestamp
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_TRUE(newest_timestamp.empty());
+
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(1), "val1"));
+  // Testing get newest timestamp from mutable memtable.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(1), newest_timestamp);
+
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(2), "val2"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfd));
+  // Testing get the newest timestamp from immutable memtable because the
+  // mutable one is empty.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(2), newest_timestamp);
+
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(3), "val3"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(4), "val4"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfd));
+  // Testing get the newest timestamp from the more recent immutable memtable
+  // when there are multiple immutable memtables.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(4), newest_timestamp);
+
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(5), "val5"));
+  // Testing get newest timestamp from mutable memtable when it has data, in the
+  // presence of immutable memtables.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(5), newest_timestamp);
+
+  ASSERT_OK(Flush());
+  // After flushing and all the user defined timestamp are flushed. User defined
+  // timestamp info for SST files is available from MANIFEST.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(5), newest_timestamp);
+
+  Reopen(options);
+  // Similar after flush, when there is no memtables, but some SST files,
+  // if MANIFEST records the upperbound of flushed timestamps because timestamps
+  // are not persisted in SST files, this info can be found.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(5), newest_timestamp);
+
+  Close();
+}
+
+TEST_F(GetNewestUserDefinedTimestampTest, ConcurrentWrites) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  options.persist_user_defined_timestamps = false;
+  options.allow_concurrent_memtable_write = false;
+
+  DestroyAndReopen(options);
+
+  std::vector<std::thread> threads;
+  threads.reserve(10);
+  std::atomic<uint64_t> current_ts{0};
+  for (int i = 0; i < 10; i++) {
+    threads.emplace_back([this, i, &current_ts]() {
+      if (i % 2 == 0) {
+        std::string newest_timestamp;
+        ASSERT_OK(
+            db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+      } else {
+        uint64_t write_ts = current_ts.fetch_add(1);
+        ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(write_ts),
+                           "val" + std::to_string(i)));
+      }
+    });
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+  Close();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_with_timestamp_compaction_test.cc b/db/db_with_timestamp_compaction_test.cc
index 783140cbf7d9..1e35d43f829c 100644
--- a/db/db_with_timestamp_compaction_test.cc
+++ b/db/db_with_timestamp_compaction_test.cc
@@ -7,9 +7,13 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include <set>
+
+#include "db/column_family.h"
 #include "db/compaction/compaction.h"
 #include "db/db_test_util.h"
 #include "port/stack_trace.h"
+#include "rocksdb/sst_file_reader.h"
 #include "test_util/testutil.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -48,6 +52,122 @@ class TimestampCompatibleCompactionTest : public DBTestBase {
     }
     return value;
   }
+
+  // Helper to get all files with their level and timestamps
+  std::vector<std::tuple<int, std::string, std::string>>
+  GetAllFileTimestamps() {
+    std::vector<std::tuple<int, std::string, std::string>> results;
+    ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+    auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+    auto* vstorage = cfd->current()->storage_info();
+
+    for (int level = 0; level < cfd->NumberLevels(); level++) {
+      for (auto* file : vstorage->LevelFiles(level)) {
+        results.emplace_back(level, file->min_timestamp, file->max_timestamp);
+      }
+    }
+    return results;
+  }
+
+  // Helper to compute overall min/max timestamps across all files
+  // Returns {min_ts, max_ts} as uint64_t values
+  // Asserts that all files have non-empty timestamps
+  std::pair<uint64_t, uint64_t> GetOverallTimestampRange() {
+    auto files = GetAllFileTimestamps();
+    EXPECT_GE(files.size(), 1U);
+
+    uint64_t overall_min = UINT64_MAX;
+    uint64_t overall_max = 0;
+    for (const auto& [level, min_ts, max_ts] : files) {
+      EXPECT_FALSE(min_ts.empty()) << "min_timestamp empty at level " << level;
+      EXPECT_FALSE(max_ts.empty()) << "max_timestamp empty at level " << level;
+
+      if (!min_ts.empty() && !max_ts.empty()) {
+        uint64_t file_min = DecodeFixed64(min_ts.data());
+        uint64_t file_max = DecodeFixed64(max_ts.data());
+        overall_min = std::min(overall_min, file_min);
+        overall_max = std::max(overall_max, file_max);
+      }
+    }
+    return {overall_min, overall_max};
+  }
+
+  // Helper to verify timestamp range matches expected values, including after
+  // reopen
+  void VerifyTimestampRangeWithPersistence(const Options& options,
+                                           uint64_t expected_min,
+                                           uint64_t expected_max) {
+    // Verify before reopen
+    auto [min_ts, max_ts] = GetOverallTimestampRange();
+    ASSERT_EQ(expected_min, min_ts);
+    ASSERT_EQ(expected_max, max_ts);
+
+    size_t file_count_before = GetAllFileTimestamps().size();
+
+    // Verify manifest persistence by reopening
+    Reopen(options);
+
+    // Verify after reopen
+    auto [reopened_min_ts, reopened_max_ts] = GetOverallTimestampRange();
+    ASSERT_EQ(expected_min, reopened_min_ts);
+    ASSERT_EQ(expected_max, reopened_max_ts);
+    ASSERT_EQ(file_count_before, GetAllFileTimestamps().size());
+  }
+
+  // Helper to create common options for UDT tests with level compaction
+  Options CreateTimestampOptions(bool disable_auto_compactions = false) {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.compaction_style = kCompactionStyleLevel;
+    options.num_levels = 4;
+    options.persist_user_defined_timestamps = true;
+    options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+    options.disable_auto_compactions = disable_auto_compactions;
+    return options;
+  }
+
+  // Helper to write test data with alternating timestamps in a range
+  // Writes keys [start_key, end_key) with timestamps alternating between
+  // min_ts and max_ts
+  void WriteDataWithTimestampRange(int start_key, int end_key, uint64_t min_ts,
+                                   uint64_t max_ts) {
+    std::string ts_buf;
+    for (int i = start_key; i < end_key; i++) {
+      ts_buf.clear();
+      uint64_t ts = (i % 2 == 0) ? min_ts : max_ts;
+      PutFixed64(&ts_buf, ts);
+      ASSERT_OK(db_->Put(WriteOptions(), Key(i), ts_buf,
+                         "value" + std::to_string(i)));
+    }
+  }
+
+  // Helper to check if any file has the expected timestamp range
+  bool HasFileWithTimestampRange(uint64_t expected_min, uint64_t expected_max) {
+    auto file_timestamps = GetAllFileTimestamps();
+    for (const auto& [level, min_ts, max_ts] : file_timestamps) {
+      if (!min_ts.empty() && !max_ts.empty()) {
+        uint64_t file_min = DecodeFixed64(min_ts.data());
+        uint64_t file_max = DecodeFixed64(max_ts.data());
+        if (file_min == expected_min && file_max == expected_max) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  // Helper to verify data is readable with a given timestamp
+  void VerifyDataReadable(int key, const std::string& expected_value,
+                          uint64_t read_ts) {
+    std::string value;
+    std::string ts_buf;
+    PutFixed64(&ts_buf, read_ts);
+    ReadOptions read_opts;
+    Slice ts_slice(ts_buf);
+    read_opts.timestamp = &ts_slice;
+    ASSERT_OK(db_->Get(read_opts, Key(key), &value));
+    ASSERT_EQ(expected_value, value);
+  }
 };
 
 TEST_F(TimestampCompatibleCompactionTest, UserKeyCrossFileBoundary) {
@@ -344,6 +464,385 @@ TEST_F(TimestampCompatibleCompactionTest, EmptyCompactionOutput) {
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
 }
 
+TEST_F(TimestampCompatibleCompactionTest, SeqnoZeroingWithUDT) {
+  // This test validates that seqno is only zeroed when the timestamp is older
+  // than full_history_ts_low_. Before the fix, seqno was incorrectly zeroed
+  // even when UDT was enabled but timestamp wasn't old enough.
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // Track seqno zeroing events and which keys are zeroed
+  std::set<std::string> zeroed_keys;
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) {
+        auto* ikey = static_cast<ParsedInternalKey*>(arg);
+        ASSERT_EQ(0, ikey->sequence);
+        // Extract user key without timestamp (last 8 bytes)
+        Slice user_key_with_ts = ikey->user_key;
+        std::string user_key =
+            user_key_with_ts.ToString().substr(0, user_key_with_ts.size() - 8);
+        zeroed_keys.insert(user_key);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Case 1: Test that seqno is NOT zeroed when full_history_ts_low is not set
+  // Write a key with timestamp 100
+  std::string ts_str = Timestamp(100);
+  ASSERT_OK(db_->Put(WriteOptions(), "key1", ts_str, "value1"));
+  ASSERT_OK(Flush());
+
+  zeroed_keys.clear();
+  {
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+  // With UDT enabled and no full_history_ts_low, seqno should NOT be zeroed
+  ASSERT_TRUE(zeroed_keys.empty());
+
+  // Case 2: Test that seqno IS zeroed when timestamp < full_history_ts_low
+  // Write a new key with timestamp 200
+  ts_str = Timestamp(200);
+  ASSERT_OK(db_->Put(WriteOptions(), "key2", ts_str, "value2"));
+  ASSERT_OK(Flush());
+
+  zeroed_keys.clear();
+  {
+    // Set full_history_ts_low to 300, so ts < 300 should be zeroed
+    std::string full_history_ts_low = Timestamp(300);
+    Slice ts_slice = full_history_ts_low;
+    CompactRangeOptions cro;
+    cro.full_history_ts_low = &ts_slice;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+  // key1 (ts=100) and key2 (ts=200) both have ts < 300, so both should be
+  // zeroed
+  ASSERT_EQ(2u, zeroed_keys.size());
+  ASSERT_TRUE(zeroed_keys.count("key1") > 0);
+  ASSERT_TRUE(zeroed_keys.count("key2") > 0);
+
+  // Case 3: Write a new key with timestamp >= full_history_ts_low
+  // and verify it is NOT zeroed while old keys are re-zeroed
+  ts_str = Timestamp(500);
+  ASSERT_OK(db_->Put(WriteOptions(), "key3", ts_str, "value3"));
+  ASSERT_OK(Flush());
+
+  zeroed_keys.clear();
+  {
+    // Set full_history_ts_low to 400
+    // key1 (ts=100) and key2 (ts=200) have ts < 400, will be re-processed
+    // key3 (ts=500) has ts >= 400, should NOT be zeroed
+    std::string full_history_ts_low = Timestamp(400);
+    Slice ts_slice = full_history_ts_low;
+    CompactRangeOptions cro;
+    cro.full_history_ts_low = &ts_slice;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+  // key3 should NOT appear in zeroed_keys since ts=500 >= 400
+  ASSERT_TRUE(zeroed_keys.count("key3") == 0);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Verify data is still readable
+  std::string value;
+  ts_str = Timestamp(600);
+  Slice read_ts = ts_str;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  ASSERT_OK(db_->Get(read_opts, "key1", &value));
+  ASSERT_EQ("value1", value);
+  ASSERT_OK(db_->Get(read_opts, "key2", &value));
+  ASSERT_EQ("value2", value);
+  ASSERT_OK(db_->Get(read_opts, "key3", &value));
+  ASSERT_EQ("value3", value);
+}
+
+// Test that files with max_timestamp >= full_history_ts_low are not marked
+// for bottommost compaction, which prevents infinite compaction loops.
+TEST_F(TimestampCompatibleCompactionTest,
+       BottommostCompactionRespectsFullHistoryTsLow) {
+  Options options = CreateTimestampOptions();
+  options.level0_file_num_compaction_trigger = 4;
+
+  DestroyAndReopen(options);
+
+  // Write some data with timestamps 100-199
+  std::string ts_buf;
+  for (int i = 0; i < 100; i++) {
+    ts_buf.clear();
+    PutFixed64(&ts_buf, 100 + i);
+    ASSERT_OK(
+        db_->Put(WriteOptions(), Key(i), ts_buf, "value" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+
+  // Compact to the bottommost level
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Set full_history_ts_low to 150 - files with max_ts >= 150 should NOT be
+  // marked for bottommost compaction since seqno cannot be zeroed
+  ts_buf.clear();
+  PutFixed64(&ts_buf, 150);
+  ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_buf));
+
+  // Release a snapshot to potentially trigger bottommost file marking
+  // but files should NOT be marked because max_ts (199) >= full_history_ts_low
+  // (150)
+  const Snapshot* snap = db_->GetSnapshot();
+  db_->ReleaseSnapshot(snap);
+
+  // Wait for any scheduled compactions - should complete without infinite loop
+  // Use a reasonable timeout to detect infinite loops
+  WaitForCompactOptions wfc_options;
+  wfc_options.timeout = std::chrono::microseconds(5000000);  // 5 seconds
+  Status s = dbfull()->WaitForCompact(wfc_options);
+  // Should succeed without timeout (no infinite compaction loop)
+  ASSERT_TRUE(s.ok() || s.IsTimedOut());
+  if (s.IsTimedOut()) {
+    // If timeout, the fix is not working - this should not happen
+    FAIL() << "WaitForCompact timed out - possible infinite compaction loop";
+  }
+
+  // Now set full_history_ts_low beyond max timestamp in the file (200+)
+  // This should allow the file to be properly marked and compacted
+  ts_buf.clear();
+  PutFixed64(&ts_buf, 300);
+  ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_buf));
+
+  // Trigger another snapshot release to potentially mark files
+  snap = db_->GetSnapshot();
+  db_->ReleaseSnapshot(snap);
+
+  // Now compaction should clean up the file.
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+// Test that files are NOT marked for bottommost compaction when UDT is enabled
+// and full_history_ts_low has never been set (empty).
+TEST_F(TimestampCompatibleCompactionTest,
+       BottommostCompactionSkipsWhenFullHistoryTsLowNotSet) {
+  Options options = CreateTimestampOptions();
+
+  DestroyAndReopen(options);
+
+  // Write some data with timestamps 100-199
+  std::string ts_buf;
+  for (int i = 0; i < 100; i++) {
+    ts_buf.clear();
+    PutFixed64(&ts_buf, 100 + i);
+    ASSERT_OK(
+        db_->Put(WriteOptions(), Key(i), ts_buf, "value" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+
+  // Compact to the bottommost level without setting full_history_ts_low
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Verify files have valid max_timestamp
+  auto file_timestamps = GetAllFileTimestamps();
+  ASSERT_GE(file_timestamps.size(), 1U);
+  for (const auto& [level, min_ts, max_ts] : file_timestamps) {
+    ASSERT_FALSE(max_ts.empty()) << "max_timestamp should not be empty";
+  }
+
+  // full_history_ts_low is NOT set (empty), so files should NOT be marked
+  // for bottommost compaction even after releasing a snapshot.
+  // This tests the branch: if (full_history_ts_low.empty()) { continue; }
+  const Snapshot* snap = db_->GetSnapshot();
+  db_->ReleaseSnapshot(snap);
+
+  // Wait for any scheduled compactions
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Now set full_history_ts_low to a value > max_timestamp (199) in the file
+  // This should allow the file to be properly marked and compacted
+  ts_buf.clear();
+  PutFixed64(&ts_buf, 300);
+  ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_buf));
+
+  // Trigger another snapshot release to potentially mark files
+  snap = db_->GetSnapshot();
+  db_->ReleaseSnapshot(snap);
+
+  // Now compaction should be able to proceed since full_history_ts_low is set
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify data is still readable
+  VerifyDataReadable(0, "value0", 250);
+}
+
+// Test that ingested SST files created with UDT have their min/max timestamps
+// properly extracted from table properties and populated in FileMetaData.
+// This verifies the fix in external_sst_file_ingestion_job.cc that calls
+// ExtractTimestampFromTableProperties after creating FileMetaData.
+TEST_F(TimestampCompatibleCompactionTest,
+       IngestedFileTimestampsExtractedFromTableProperties) {
+  Options options = CreateTimestampOptions();
+
+  DestroyAndReopen(options);
+
+  // Create an SST file WITH timestamps using SstFileWriter
+  std::string sst_file = dbname_ + "/ingested_udt_file.sst";
+  const uint64_t kMinTs = 100;
+  const uint64_t kMaxTs = 200;
+
+  {
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    ASSERT_OK(sst_file_writer.Open(sst_file));
+
+    std::string ts_buf;
+    for (int i = 0; i < 10; i++) {
+      // Alternate between min and max timestamps
+      uint64_t ts = (i % 2 == 0) ? kMinTs : kMaxTs;
+      ts_buf.clear();
+      PutFixed64(&ts_buf, ts);
+      // SstFileWriter with UDT comparator requires key with timestamp
+      ASSERT_OK(
+          sst_file_writer.Put(Key(i), ts_buf, "value" + std::to_string(i)));
+    }
+    ASSERT_OK(sst_file_writer.Finish());
+  }
+
+  // Verify the SST file has timestamp properties before ingestion
+  {
+    std::unique_ptr<SstFileReader> reader(new SstFileReader(options));
+    ASSERT_OK(reader->Open(sst_file));
+    auto props = reader->GetTableProperties();
+    auto& user_collected = props->user_collected_properties;
+    ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") !=
+                user_collected.end())
+        << "SST file should have rocksdb.timestamp_min property";
+    ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") !=
+                user_collected.end())
+        << "SST file should have rocksdb.timestamp_max property";
+  }
+
+  // Ingest the SST file
+  IngestExternalFileOptions ifo;
+  ifo.move_files = false;
+  ASSERT_OK(db_->IngestExternalFile({sst_file}, ifo));
+
+  // Verify the ingested file has proper timestamps in FileMetaData
+  ASSERT_TRUE(HasFileWithTimestampRange(kMinTs, kMaxTs))
+      << "Ingested file should have min_timestamp=" << kMinTs
+      << " and max_timestamp=" << kMaxTs << " in FileMetaData";
+
+  // Verify timestamps persist after reopen
+  Reopen(options);
+
+  ASSERT_TRUE(HasFileWithTimestampRange(kMinTs, kMaxTs))
+      << "Ingested file timestamps should persist after reopen";
+
+  // Verify data is readable
+  VerifyDataReadable(0, "value0", kMaxTs);
+
+  // Clean up
+  ASSERT_OK(env_->DeleteFile(sst_file));
+}
+
+// Test that min/max timestamps are correctly tracked in FileMetaData and
+// persisted in the manifest during flush.
+TEST_F(TimestampCompatibleCompactionTest, TimestampRangePersistenceFlush) {
+  Options options = CreateTimestampOptions();
+
+  DestroyAndReopen(options);
+
+  // Expected timestamp range
+  const uint64_t kMinTs = 100;
+  const uint64_t kMaxTs = 200;
+
+  // Write data with specific timestamp range
+  WriteDataWithTimestampRange(0, 50, kMinTs, kMaxTs);
+  ASSERT_OK(Flush());
+
+  // First verify table properties have the timestamps
+  // (this confirms TimestampTablePropertiesCollector is working)
+  TablePropertiesCollection props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+  ASSERT_EQ(1U, props.size());
+  for (const auto& item : props) {
+    auto& user_collected = item.second->user_collected_properties;
+    ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") !=
+                user_collected.end());
+    ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") !=
+                user_collected.end());
+    // Verify the collected timestamps match expected values
+    std::string collected_min_ts = user_collected.at("rocksdb.timestamp_min");
+    std::string collected_max_ts = user_collected.at("rocksdb.timestamp_max");
+    ASSERT_EQ(kMinTs, DecodeFixed64(collected_min_ts.data()));
+    ASSERT_EQ(kMaxTs, DecodeFixed64(collected_max_ts.data()));
+  }
+
+  // Verify FileMetaData timestamps and persistence through reopen
+  VerifyTimestampRangeWithPersistence(options, kMinTs, kMaxTs);
+
+  // Verify we can still read the data
+  VerifyDataReadable(0, "value0", kMaxTs);
+}
+
+// Test that min/max timestamps are correctly merged during compaction
+// and persisted in the manifest.
+TEST_F(TimestampCompatibleCompactionTest, TimestampRangePersistenceCompaction) {
+  Options options = CreateTimestampOptions(true /* disable_auto_compactions */);
+
+  DestroyAndReopen(options);
+
+  // Create multiple L0 files with different timestamp ranges
+  // File 1: timestamps 100-150
+  const uint64_t kFile1MinTs = 100;
+  const uint64_t kFile1MaxTs = 150;
+  WriteDataWithTimestampRange(0, 10, kFile1MinTs, kFile1MaxTs);
+  ASSERT_OK(Flush());
+
+  // File 2: timestamps 50-80 (earlier range)
+  const uint64_t kFile2MinTs = 50;
+  const uint64_t kFile2MaxTs = 80;
+  WriteDataWithTimestampRange(10, 20, kFile2MinTs, kFile2MaxTs);
+  ASSERT_OK(Flush());
+
+  // File 3: timestamps 200-300 (later range)
+  const uint64_t kFile3MinTs = 200;
+  const uint64_t kFile3MaxTs = 300;
+  WriteDataWithTimestampRange(20, 30, kFile3MinTs, kFile3MaxTs);
+  ASSERT_OK(Flush());
+
+  // Expected combined range: min=50, max=300
+  const uint64_t kExpectedMinTs = 50;
+  const uint64_t kExpectedMaxTs = 300;
+
+  // Verify we have 3 L0 files before compaction with valid timestamps
+  auto files_before = GetAllFileTimestamps();
+  ASSERT_EQ(3U, files_before.size());
+  for (const auto& [level, min_ts, max_ts] : files_before) {
+    ASSERT_EQ(0, level);  // All files should be in L0
+    ASSERT_FALSE(min_ts.empty());
+    ASSERT_FALSE(max_ts.empty());
+  }
+
+  // Trigger compaction
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify timestamp range and persistence through reopen
+  VerifyTimestampRangeWithPersistence(options, kExpectedMinTs, kExpectedMaxTs);
+
+  // Verify data is still readable
+  VerifyDataReadable(0, "value0", kExpectedMaxTs);
+  VerifyDataReadable(15, "value15", kExpectedMaxTs);
+  VerifyDataReadable(25, "value25", kExpectedMaxTs);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_write_buffer_manager_test.cc b/db/db_write_buffer_manager_test.cc
index db4bf2b8a289..2eff1d397f7e 100644
--- a/db/db_write_buffer_manager_test.cc
+++ b/db/db_write_buffer_manager_test.cc
@@ -183,11 +183,11 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) {
 // is waiting to be finished but DBs tries to write meanwhile.
 TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
   std::vector<std::string> dbnames;
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   int num_dbs = 3;
 
   for (int i = 0; i < num_dbs; i++) {
-    dbs.push_back(nullptr);
+    dbs.emplace_back();
     dbnames.push_back(
         test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
   }
@@ -266,7 +266,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
   //  Last writer will write and when its blocked it will signal Flush to
   //  continue to clear the stall.
 
-  threads.emplace_back(write_db, db_);
+  threads.emplace_back(write_db, db_.get());
   // Wait untill first DB is blocked and then create the multiple writers for
   // different DBs which will be blocked from getting added to the queue because
   // stall is in effect.
@@ -277,7 +277,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
     }
   }
   for (int i = 0; i < num_dbs; i++) {
-    threads.emplace_back(write_db, dbs[i]);
+    threads.emplace_back(write_db, dbs[i].get());
   }
   for (auto& t : threads) {
     t.join();
@@ -289,7 +289,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
   for (int i = 0; i < num_dbs; i++) {
     ASSERT_OK(dbs[i]->Close());
     ASSERT_OK(DestroyDB(dbnames[i], options));
-    delete dbs[i];
+    dbs[i].reset();
   }
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
@@ -300,11 +300,11 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
 // blocked when stall by WriteBufferManager is in effect.
 TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
   std::vector<std::string> dbnames;
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   int num_dbs = 3;
 
   for (int i = 0; i < num_dbs; i++) {
-    dbs.push_back(nullptr);
+    dbs.emplace_back();
     dbnames.push_back(
         test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
   }
@@ -407,7 +407,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
   //  |
   //  Last writer thread will write and when its blocked it will signal Flush to
   //  continue to clear the stall.
-  threads.emplace_back(write_db, db_);
+  threads.emplace_back(write_db, db_.get());
   // Wait untill first thread is blocked and then create the multiple writer
   // threads.
   {
@@ -421,7 +421,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
     // Write to multiple columns of db_.
     writer_threads.emplace_back(write_cf, i % 3);
     // Write to different dbs.
-    threads.emplace_back(write_db, dbs[i]);
+    threads.emplace_back(write_db, dbs[i].get());
   }
   for (auto& t : threads) {
     t.join();
@@ -441,7 +441,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
   for (int i = 0; i < num_dbs; i++) {
     ASSERT_OK(dbs[i]->Close());
     ASSERT_OK(DestroyDB(dbnames[i], options));
-    delete dbs[i];
+    dbs[i].reset();
   }
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
@@ -604,11 +604,11 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) {
 // dbs by passing different values to WriteOption.no_slown_down.
 TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
   std::vector<std::string> dbnames;
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   int num_dbs = 4;
 
   for (int i = 0; i < num_dbs; i++) {
-    dbs.push_back(nullptr);
+    dbs.emplace_back();
     dbnames.push_back(
         test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
   }
@@ -732,7 +732,7 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
   //  |
   //  Last writer thread will write and when its blocked/return it will signal
   //  Flush to continue to clear the stall.
-  threads.emplace_back(write_slow_down, db_);
+  threads.emplace_back(write_slow_down, db_.get());
   // Wait untill first thread writing to DB is blocked and then
   // create the multiple writers.
   {
@@ -744,11 +744,11 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
 
   for (int i = 0; i < num_dbs; i += 2) {
     // Write to multiple columns of db_.
-    writer_threads.emplace_back(write_slow_down, db_);
-    writer_threads.emplace_back(write_no_slow_down, db_);
+    writer_threads.emplace_back(write_slow_down, db_.get());
+    writer_threads.emplace_back(write_no_slow_down, db_.get());
     // Write to different DBs.
-    threads.emplace_back(write_slow_down, dbs[i]);
-    threads.emplace_back(write_no_slow_down, dbs[i + 1]);
+    threads.emplace_back(write_slow_down, dbs[i].get());
+    threads.emplace_back(write_no_slow_down, dbs[i + 1].get());
   }
 
   for (auto& t : threads) {
@@ -773,7 +773,7 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
   for (int i = 0; i < num_dbs; i++) {
     ASSERT_OK(dbs[i]->Close());
     ASSERT_OK(DestroyDB(dbnames[i], options));
-    delete dbs[i];
+    dbs[i].reset();
   }
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
@@ -809,7 +809,7 @@ TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) {
 
   Reopen(options);
   std::string dbname = test::PerThreadDBPath("db_shared_wbm_db");
-  DB* shared_wbm_db = nullptr;
+  std::unique_ptr<DB> shared_wbm_db;
 
   ASSERT_OK(DestroyDB(dbname, options));
   ASSERT_OK(DB::Open(options, dbname, &shared_wbm_db));
@@ -842,7 +842,7 @@ TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) {
   sleeping_task_high.WaitUntilDone();
   ASSERT_OK(shared_wbm_db->Close());
   ASSERT_OK(DestroyDB(dbname, options));
-  delete shared_wbm_db;
+  shared_wbm_db.reset();
 }
 
 TEST_F(DBWriteBufferManagerTest, RuntimeChangeableAllowStall) {
diff --git a/db/db_write_test.cc b/db/db_write_test.cc
index 2dfcd864f5a5..97fb86c14c2c 100644
--- a/db/db_write_test.cc
+++ b/db/db_write_test.cc
@@ -741,7 +741,7 @@ TEST_P(DBWriteTest, LockWALConcurrentRecursive) {
     ExternalSstFileInfo external_info;
     ASSERT_OK(sst_file_writer.Finish(&external_info));
   }
-  AcqRelAtomic<bool> parallel_ingest_completed{false};
+  Atomic<bool> parallel_ingest_completed{false};
   port::Thread parallel_ingest{[&]() {
     IngestExternalFileOptions ingest_opts;
     ingest_opts.move_files = true;  // faster than copy
@@ -750,7 +750,7 @@ TEST_P(DBWriteTest, LockWALConcurrentRecursive) {
     parallel_ingest_completed.Store(true);
   }};
 
-  AcqRelAtomic<bool> flush_completed{false};
+  Atomic<bool> flush_completed{false};
   port::Thread parallel_flush{[&]() {
     FlushOptions flush_opts;
     // NB: Flush with wait=false case is tested above in LockWALInEffect
@@ -762,7 +762,7 @@ TEST_P(DBWriteTest, LockWALConcurrentRecursive) {
     flush_completed.Store(true);
   }};
 
-  AcqRelAtomic<bool> parallel_put_completed{false};
+  Atomic<bool> parallel_put_completed{false};
   port::Thread parallel_put{[&]() {
     // This can make certain failure scenarios more likely:
     //   sleep(1);
@@ -987,7 +987,7 @@ TEST_P(DBWriteTest, RecycleLogToggleTest) {
 
   options.recycle_log_file_num = 1;
   Reopen(options);
-  // 1.log is added to alive_log_files_
+  // 1.log is added to alive_wal_files_
   ASSERT_OK(Put(Key(2), "val1"));
   ASSERT_OK(Flush());
   // 1.log should be deleted and not recycled, since it
@@ -1000,6 +1000,80 @@ TEST_P(DBWriteTest, RecycleLogToggleTest) {
   ASSERT_EQ(Get(Key(1)), "val2");
 }
 
+TEST_P(DBWriteTest, IngestWriteBatchWithIndex) {
+  if (GetParam() == kPipelinedWrite) {
+    return;
+  }
+
+  Options options = GetOptions();
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  Options cf_options = GetOptions();
+  cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  CreateColumnFamilies({"cf1", "cf2"}, cf_options);
+  ReopenWithColumnFamilies({"default", "cf1", "cf2"},
+                           {options, cf_options, cf_options});
+
+  // default cf
+  auto wbwi1 = std::make_shared<WriteBatchWithIndex>(options.comparator, 0,
+                                                     /*overwrite_key=*/true);
+  ASSERT_OK(wbwi1->Put("key1", "value1"));
+  ASSERT_OK(wbwi1->Put("key2", "value2"));
+  if (GetParam() == kPipelinedWrite) {
+    ASSERT_TRUE(db_->IngestWriteBatchWithIndex({}, wbwi1).IsNotSupported());
+    return;
+  }
+  // Test disableWAL=false
+  ASSERT_TRUE(db_->IngestWriteBatchWithIndex({}, wbwi1).IsNotSupported());
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+  ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi1));
+  ASSERT_EQ("value1", Get("key1"));
+  ASSERT_EQ("value2", Get("key2"));
+
+  // Test with overwrites
+  auto wbwi = std::make_shared<WriteBatchWithIndex>(options.comparator, 0,
+                                                    /*overwrite_key=*/true);
+  ASSERT_OK(wbwi->Put("key2", "value3"));
+  ASSERT_OK(wbwi->Delete("key1"));  // Delete an existing key
+  ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi));
+  ASSERT_EQ("NOT_FOUND", Get("key1"));
+  ASSERT_EQ("value3", Get("key2"));
+
+  auto wbwi2 = std::make_shared<WriteBatchWithIndex>(options.comparator, 0,
+                                                     /*overwrite_key=*/true);
+  ASSERT_OK(wbwi2->Put(handles_[1], "cf1_key1", "cf1_value1"));
+  ASSERT_OK(wbwi2->Delete(handles_[1], "cf1_key2"));
+  // Test ingestion with column family
+  ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi2));
+  ASSERT_EQ("cf1_value1", Get(1, "cf1_key1"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "cf1_key2"));
+
+  auto wbwi3 = std::make_shared<WriteBatchWithIndex>(options.comparator, 0,
+                                                     /*overwrite_key=*/true);
+  ASSERT_OK(wbwi3->Merge(handles_[2], "cf2_key1", "cf2_value1"));
+  ASSERT_OK(wbwi3->Merge(handles_[2], "cf2_key1", "cf2_value2"));
+  // Test ingestion with merge operations
+  ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi3));
+  ASSERT_EQ("cf2_value1,cf2_value2", Get(2, "cf2_key1"));
+
+  // Test with overwrite_key = false
+  auto wbwi_no_overwrite = std::make_shared<WriteBatchWithIndex>(
+      options.comparator, 0, /*overwrite_key=*/false);
+  ASSERT_OK(wbwi_no_overwrite->Put("key1", "value1"));
+  Status s = db_->IngestWriteBatchWithIndex(wo, wbwi_no_overwrite);
+  ASSERT_TRUE(s.IsNotSupported());
+
+  auto empty_wbwi = std::make_shared<WriteBatchWithIndex>(
+      options.comparator, 0, /*overwrite_key=*/true);
+  ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, empty_wbwi));
+
+  DestroyAndReopen(options);
+  // Should fail when trying to ingest to non-existent column family
+  ASSERT_NOK(db_->IngestWriteBatchWithIndex(wo, wbwi2));
+}
+
 INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
                         testing::Values(DBTestBase::kDefault,
                                         DBTestBase::kConcurrentWALWrites,
diff --git a/db/dbformat.h b/db/dbformat.h
index 3dfb077397ed..e1b9342ff430 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -83,6 +83,8 @@ extern const ValueType kValueTypeForSeekForPrev;
 
 // A range of user keys used internally by RocksDB. Also see `Range` used by
 // public APIs.
+// TODO: merge with Range in pubic API, but this is generally inclusive limit
+// and it is maybe exclusive limit
 struct UserKeyRange {
   // In case of user_defined timestamp, if enabled, `start` and `limit` should
   // include user_defined timestamps.
@@ -93,18 +95,17 @@ struct UserKeyRange {
   UserKeyRange(const Slice& s, const Slice& l) : start(s), limit(l) {}
 };
 
-// A range of user keys used internally by RocksDB. Also see `RangePtr` used by
+// A range of user keys used internally by RocksDB. Also see `RangeOpt` used by
 // public APIs.
-struct UserKeyRangePtr {
+struct UserKeyRangeOpt {
   // In case of user_defined timestamp, if enabled, `start` and `limit` should
   // point to key with timestamp part.
   // An optional range start, if missing, indicating a start before all keys.
-  std::optional<Slice> start;
+  OptSlice start;
   // An optional range end, if missing, indicating an end after all keys.
-  std::optional<Slice> limit;
+  OptSlice limit;
 
-  UserKeyRangePtr(const std::optional<Slice>& s, const std::optional<Slice>& l)
-      : start(s), limit(l) {}
+  UserKeyRangeOpt(const OptSlice& s, const OptSlice& l) : start(s), limit(l) {}
 };
 
 // Checks whether a type is an inline value type
@@ -469,6 +470,7 @@ class InternalKey {
 
   Slice user_key() const { return ExtractUserKey(rep_); }
   size_t size() const { return rep_.size(); }
+  bool unset() const { return rep_.empty(); }
 
   void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
     SetFrom(ParsedInternalKey(_user_key, s, t));
@@ -978,11 +980,6 @@ class InternalKeySliceTransform : public SliceTransform {
     return transform_->InDomain(user_key);
   }
 
-  bool InRange(const Slice& dst) const override {
-    auto user_key = ExtractUserKey(dst);
-    return transform_->InRange(user_key);
-  }
-
   const SliceTransform* user_prefix_extractor() const { return transform_; }
 
  private:
diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
index ab31e5a6f087..674e01307f19 100644
--- a/db/dbformat_test.cc
+++ b/db/dbformat_test.cc
@@ -333,6 +333,50 @@ TEST_F(FormatTest, ReplaceInternalKeyWithMinTimestamp) {
   ASSERT_EQ(kTypeValue, new_key.type);
 }
 
+TEST(RocksdbVersionTest, Version) {
+  // Test preprocessor macros for versioning
+  ASSERT_GT(ROCKSDB_MAJOR, 0);
+  ASSERT_GE(ROCKSDB_MINOR, 0);
+  ASSERT_GE(ROCKSDB_PATCH, 0);
+  ASSERT_LT(ROCKSDB_MAJOR, 1000);
+  ASSERT_LT(ROCKSDB_MINOR, 1000);
+  ASSERT_LT(ROCKSDB_PATCH, 1000);
+  ASSERT_EQ(ROCKSDB_MAKE_VERSION_INT(123, 456, 789), 123456789);
+  ASSERT_GT(ROCKSDB_VERSION_INT, 9999999);
+  ASSERT_LT(ROCKSDB_VERSION_INT, 99999999);
+  static_assert(ROCKSDB_VERSION_GE(9, 8, 7));
+  static_assert(
+      ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH));
+  static_assert(
+      ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH - 1));
+  static_assert(
+      ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH - 100));
+  static_assert(
+      ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR - 1, ROCKSDB_PATCH + 1));
+  static_assert(ROCKSDB_VERSION_GE(ROCKSDB_MAJOR - 1, ROCKSDB_MINOR + 1,
+                                   ROCKSDB_PATCH + 1));
+  static_assert(
+      !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH + 1));
+  static_assert(
+      !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH + 100));
+  static_assert(
+      !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR + 1, ROCKSDB_PATCH - 1));
+  static_assert(!ROCKSDB_VERSION_GE(ROCKSDB_MAJOR + 1, ROCKSDB_MINOR - 1,
+                                    ROCKSDB_PATCH - 1));
+  // More typical usage (but with literal numbers based on relevant API
+  // features)
+#if ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH)
+  static_assert(true);
+#else
+  static_assert(false);
+#endif
+#if !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH + 1)
+  static_assert(true);
+#else
+  static_assert(false);
+#endif
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/error_handler.cc b/db/error_handler.cc
index 24c555764f30..1e777fd42600 100644
--- a/db/error_handler.cc
+++ b/db/error_handler.cc
@@ -275,9 +275,6 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err,
     return;
   }
 
-  ROCKS_LOG_INFO(db_options_.info_log,
-                 "ErrorHandler: Set regular background error\n");
-
   bool paranoid = db_options_.paranoid_checks;
   Status::Severity sev = Status::Severity::kFatalError;
   Status new_bg_err;
@@ -335,12 +332,21 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err,
     if (!s.ok() && (s.severity() > bg_error_.severity())) {
       bg_error_ = s;
     } else {
+      ROCKS_LOG_INFO(db_options_.info_log,
+                     "ErrorHandler: Hit less severe background error\n");
+
       // This error is less severe than previously encountered error. Don't
       // take any further action
       return;
     }
   }
 
+  bool stop = bg_error_.severity() >= Status::Severity::kHardError;
+  ROCKS_LOG_INFO(
+      db_options_.info_log,
+      "ErrorHandler: Set regular background error, auto_recovery=%d, stop=%d\n",
+      int{auto_recovery}, int{stop});
+
   recover_context_ = context;
   if (auto_recovery) {
     recovery_in_prog_ = true;
@@ -351,7 +357,7 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err,
       RecoverFromNoSpace();
     }
   }
-  if (bg_error_.severity() >= Status::Severity::kHardError) {
+  if (stop) {
     is_db_stopped_.store(true, std::memory_order_release);
   }
 }
diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc
index 57c3c0dcdd88..26263011ffde 100644
--- a/db/error_handler_fs_test.cc
+++ b/db/error_handler_fs_test.cc
@@ -1550,7 +1550,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
   std::vector<FaultInjectionTestFS*> fault_fs;
   std::vector<Options> options;
   std::vector<std::shared_ptr<ErrorHandlerFSListener>> listener;
-  std::vector<DB*> db;
+  std::vector<std::unique_ptr<DB>> db;
   std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
   int kNumDbInstances = 3;
   Random rnd(301);
@@ -1567,7 +1567,6 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
     options[i].writable_file_max_buffer_size = 32768;
     options[i].listeners.emplace_back(listener[i]);
     options[i].sst_file_manager = sfm;
-    DB* dbptr;
     char buf[16];
 
     listener[i]->EnableAutoRecovery();
@@ -1576,8 +1575,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
                                          IOStatus::NoSpace("Out of space"));
     snprintf(buf, sizeof(buf), "_%d", i);
     ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
-    ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr));
-    db.emplace_back(dbptr);
+    ASSERT_OK(
+        DB::Open(options[i], dbname_ + std::string(buf), &db.emplace_back()));
   }
 
   for (auto i = 0; i < kNumDbInstances; ++i) {
@@ -1609,7 +1608,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
   }
 
   for (auto i = 0; i < kNumDbInstances; ++i) {
-    Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact();
+    Status s = static_cast<DBImpl*>(db[i].get())->TEST_WaitForCompact();
     ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
     fault_fs[i]->SetFilesystemActive(true);
   }
@@ -1618,7 +1617,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
   for (auto i = 0; i < kNumDbInstances; ++i) {
     std::string prop;
     ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
-    ASSERT_OK(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact());
+    ASSERT_OK(static_cast<DBImpl*>(db[i].get())->TEST_WaitForCompact());
     EXPECT_TRUE(db[i]->GetProperty(
         "rocksdb.num-files-at-level" + std::to_string(0), &prop));
     EXPECT_EQ(atoi(prop.c_str()), 0);
@@ -1634,7 +1633,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
   for (auto i = 0; i < kNumDbInstances; ++i) {
     char buf[16];
     snprintf(buf, sizeof(buf), "_%d", i);
-    delete db[i];
+    db[i].reset();
     fault_fs[i]->SetFilesystemActive(true);
     if (getenv("KEEP_DB")) {
       printf("DB is still at %s%s\n", dbname_.c_str(), buf);
@@ -1657,7 +1656,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
   std::vector<FaultInjectionTestFS*> fault_fs;
   std::vector<Options> options;
   std::vector<std::shared_ptr<ErrorHandlerFSListener>> listener;
-  std::vector<DB*> db;
+  std::vector<std::unique_ptr<DB>> db;
   std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
   int kNumDbInstances = 3;
   Random rnd(301);
@@ -1674,7 +1673,6 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
     options[i].writable_file_max_buffer_size = 32768;
     options[i].listeners.emplace_back(listener[i]);
     options[i].sst_file_manager = sfm;
-    DB* dbptr;
     char buf[16];
 
     listener[i]->EnableAutoRecovery();
@@ -1695,8 +1693,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
     }
     snprintf(buf, sizeof(buf), "_%d", i);
     ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
-    ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr));
-    db.emplace_back(dbptr);
+    ASSERT_OK(
+        DB::Open(options[i], dbname_ + std::string(buf), &db.emplace_back()));
   }
 
   for (auto i = 0; i < kNumDbInstances; ++i) {
@@ -1732,7 +1730,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
   }
 
   for (auto i = 0; i < kNumDbInstances; ++i) {
-    Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact();
+    Status s = static_cast<DBImpl*>(db[i].get())->TEST_WaitForCompact();
     switch (i) {
       case 0:
         ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
@@ -1754,7 +1752,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
       ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
     }
     if (i == 1) {
-      ASSERT_OK(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact());
+      ASSERT_OK(static_cast<DBImpl*>(db[i].get())->TEST_WaitForCompact());
     }
     EXPECT_TRUE(db[i]->GetProperty(
         "rocksdb.num-files-at-level" + std::to_string(0), &prop));
@@ -1772,7 +1770,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
     char buf[16];
     snprintf(buf, sizeof(buf), "_%d", i);
     fault_fs[i]->SetFilesystemActive(true);
-    delete db[i];
+    db[i].reset();
     if (getenv("KEEP_DB")) {
       printf("DB is still at %s%s\n", dbname_.c_str(), buf);
     } else {
diff --git a/db/event_helpers.cc b/db/event_helpers.cc
index 2b901f6adc06..5c69f3fb81c6 100644
--- a/db/event_helpers.cc
+++ b/db/event_helpers.cc
@@ -77,7 +77,12 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
     TableFileCreationReason reason, const Status& s,
     const std::string& file_checksum,
     const std::string& file_checksum_func_name) {
-  if (s.ok() && event_logger) {
+  if (!event_logger && listeners.empty()) {
+    s.PermitUncheckedError();
+    return;
+  }
+
+  if (event_logger) {
     JSONWriter jwriter;
     AppendCurrentTime(&jwriter);
     jwriter << "cf_name" << cf_name << "job" << job_id << "event"
@@ -124,6 +129,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
               << "user_defined_timestamps_persisted"
               << table_properties.user_defined_timestamps_persisted
               << "key_largest_seqno" << table_properties.key_largest_seqno
+              << "key_smallest_seqno" << table_properties.key_smallest_seqno
               << "merge_operator" << table_properties.merge_operator_name
               << "prefix_extractor_name"
               << table_properties.prefix_extractor_name << "property_collectors"
@@ -165,6 +171,8 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
       jwriter << "oldest_blob_file_number" << oldest_blob_file_number;
     }
 
+    jwriter << "status" << s.ToString();
+
     jwriter.EndObject();
 
     event_logger->Log(jwriter);
@@ -195,18 +203,22 @@ void EventHelpers::LogAndNotifyTableFileDeletion(
     const std::string& file_path, const Status& status,
     const std::string& dbname,
     const std::vector<std::shared_ptr<EventListener>>& listeners) {
-  JSONWriter jwriter;
-  AppendCurrentTime(&jwriter);
-
-  jwriter << "job" << job_id << "event" << "table_file_deletion"
-          << "file_number" << file_number;
-  if (!status.ok()) {
-    jwriter << "status" << status.ToString();
+  if (!event_logger && listeners.empty()) {
+    status.PermitUncheckedError();
+    return;
   }
 
-  jwriter.EndObject();
+  if (event_logger) {
+    JSONWriter jwriter;
+    AppendCurrentTime(&jwriter);
+
+    jwriter << "job" << job_id << "event" << "table_file_deletion"
+            << "file_number" << file_number << "status" << status.ToString();
 
-  event_logger->Log(jwriter);
+    jwriter.EndObject();
+
+    event_logger->Log(jwriter);
+  }
 
   if (listeners.empty()) {
     return;
@@ -274,7 +286,12 @@ void EventHelpers::LogAndNotifyBlobFileCreationFinished(
     const std::string& file_checksum,
     const std::string& file_checksum_func_name, uint64_t total_blob_count,
     uint64_t total_blob_bytes) {
-  if (s.ok() && event_logger) {
+  if (!event_logger && listeners.empty()) {
+    s.PermitUncheckedError();
+    return;
+  }
+
+  if (event_logger) {
     JSONWriter jwriter;
     AppendCurrentTime(&jwriter);
     jwriter << "cf_name" << cf_name << "job" << job_id << "event"
@@ -305,15 +322,17 @@ void EventHelpers::LogAndNotifyBlobFileDeletion(
     const std::vector<std::shared_ptr<EventListener>>& listeners, int job_id,
     uint64_t file_number, const std::string& file_path, const Status& status,
     const std::string& dbname) {
+  if (!event_logger && listeners.empty()) {
+    status.PermitUncheckedError();
+    return;
+  }
+
   if (event_logger) {
     JSONWriter jwriter;
     AppendCurrentTime(&jwriter);
 
     jwriter << "job" << job_id << "event" << "blob_file_deletion"
-            << "file_number" << file_number;
-    if (!status.ok()) {
-      jwriter << "status" << status.ToString();
-    }
+            << "file_number" << file_number << "status" << status.ToString();
 
     jwriter.EndObject();
     event_logger->Log(jwriter);
diff --git a/db/experimental.cc b/db/experimental.cc
index 3691cfe8f741..b6efc1a47534 100644
--- a/db/experimental.cc
+++ b/db/experimental.cc
@@ -57,7 +57,8 @@ Status GetFileChecksumsFromCurrentManifest(FileSystem* fs,
   }
   assert(checksum_list);
 
-  const ReadOptions read_options(Env::IOActivity::kReadManifest);
+  const ReadOptions read_options(
+      Env::IOActivity::kGetFileChecksumsFromCurrentManifest);
   checksum_list->reset();
 
   std::unique_ptr<SequentialFileReader> file_reader;
@@ -87,11 +88,12 @@ Status GetFileChecksumsFromCurrentManifest(FileSystem* fs,
 
   // Read all records from the manifest file...
   uint64_t manifest_file_size = std::numeric_limits<uint64_t>::max();
-  FileChecksumRetriever retriever(read_options, manifest_file_size,
-                                  *checksum_list);
+  FileChecksumRetriever retriever(read_options, manifest_file_size);
   retriever.Iterate(reader, &s);
-
-  return retriever.status();
+  if (!retriever.status().ok()) {
+    return retriever.status();
+  }
+  return retriever.FetchFileChecksumList(*checksum_list);
 }
 
 Status UpdateManifestForFilesState(
@@ -156,15 +158,17 @@ Status UpdateManifestForFilesState(
               // Current state inconsistent with manifest
               ++files_updated;
               edit.DeleteFile(level, number);
-              edit.AddFile(
-                  level, number, lf->fd.GetPathId(), lf->fd.GetFileSize(),
-                  lf->smallest, lf->largest, lf->fd.smallest_seqno,
-                  lf->fd.largest_seqno, lf->marked_for_compaction, temp,
-                  lf->oldest_blob_file_number, lf->oldest_ancester_time,
-                  lf->file_creation_time, lf->epoch_number, lf->file_checksum,
-                  lf->file_checksum_func_name, lf->unique_id,
-                  lf->compensated_range_deletion_size, lf->tail_size,
-                  lf->user_defined_timestamps_persisted);
+              edit.AddFile(level, lf->fd.GetNumber(), lf->fd.GetPathId(),
+                           lf->fd.GetFileSize(), lf->smallest, lf->largest,
+                           lf->fd.smallest_seqno, lf->fd.largest_seqno,
+                           lf->marked_for_compaction, temp,
+                           lf->oldest_blob_file_number,
+                           lf->oldest_ancester_time, lf->file_creation_time,
+                           lf->epoch_number, lf->file_checksum,
+                           lf->file_checksum_func_name, lf->unique_id,
+                           lf->compensated_range_deletion_size, lf->tail_size,
+                           lf->user_defined_timestamps_persisted,
+                           lf->min_timestamp, lf->max_timestamp);
             }
           }
         } else {
@@ -1184,7 +1188,8 @@ class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager {
             break;
           default:
             // TODO? Report problem
-            {}
+            {
+            }
             // Unknown filter type
         }
         if (!may_match) {
diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index 69b2668aea80..326b3d567a09 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -16,6 +16,7 @@
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
 #include "util/defer.h"
+#include "util/file_checksum_helper.h"
 #include "util/random.h"
 #include "utilities/fault_injection_env.h"
 
@@ -260,55 +261,6 @@ TEST_F(ExternalSSTFileBasicTest, Basic) {
   s = sst_file_writer.DeleteRange(Key(100), Key(200));
   ASSERT_NOK(s) << s.ToString();
 
-  DestroyAndReopen(options);
-
-  SyncPoint::GetInstance()->LoadDependency({
-      {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
-       "ExternalSSTFileBasicTest.LiveWriteStart"},
-      {"WriteThread::JoinBatchGroup:Wait",
-       "DBImpl::IngestExternalFile:AfterIncIngestFileCounter:2"},
-  });
-  SyncPoint::GetInstance()->EnableProcessing();
-  PerfContext* write_thread_perf_context;
-  std::thread write_thread([&] {
-    TEST_SYNC_POINT("ExternalSSTFileBasicTest.LiveWriteStart");
-    SetPerfLevel(kEnableWait);
-    write_thread_perf_context = get_perf_context();
-    write_thread_perf_context->Reset();
-    ASSERT_OK(db_->Put(WriteOptions(), "bar", "v2"));
-    ASSERT_GT(write_thread_perf_context->write_thread_wait_nanos, 0);
-    // Test sync points were used to make sure this live write enter write
-    // thread after the file ingestion entered write thread. So by the time this
-    // live write finishes, the latest seqno is 1 means file ingestion used
-    // seqno 0.
-    ASSERT_EQ(db_->GetLatestSequenceNumber(), 1U);
-  });
-
-  // Add file using file path
-  SetPerfLevel(kEnableTimeExceptForMutex);
-  PerfContext* perf_ctx = get_perf_context();
-  perf_ctx->Reset();
-  s = DeprecatedAddFile({file1});
-  ASSERT_GT(perf_context.file_ingestion_nanos, 0);
-  ASSERT_GT(perf_context.file_ingestion_blocking_live_writes_nanos, 0);
-  ASSERT_OK(s) << s.ToString();
-  for (int k = 0; k < 100; k++) {
-    ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
-  }
-
-  write_thread.join();
-  SyncPoint::GetInstance()->DisableProcessing();
-
-  // Re-ingest the file just to check the perf context not enabled at and below
-  // kEnableWait.
-  SetPerfLevel(kEnableWait);
-  perf_ctx->Reset();
-  IngestExternalFileOptions opts;
-  opts.allow_global_seqno = true;
-  opts.allow_blocking_flush = true;
-  ASSERT_OK(db_->IngestExternalFile({file1}, opts));
-  ASSERT_EQ(perf_context.file_ingestion_nanos, 0);
-  ASSERT_EQ(perf_context.file_ingestion_blocking_live_writes_nanos, 0);
   DestroyAndRecreateExternalSSTFilesDir();
 }
 
@@ -395,7 +347,8 @@ class ChecksumVerifyHelper {
 
   Status GetSingleFileChecksumAndFuncName(
       const std::string& file_path, std::string* file_checksum,
-      std::string* file_checksum_func_name) {
+      std::string* file_checksum_func_name,
+      const std::string& requested_func_name = {}) {
     Status s;
     EnvOptions soptions;
     std::unique_ptr<SequentialFile> file_reader;
@@ -413,6 +366,8 @@ class ChecksumVerifyHelper {
       return Status::OK();
     } else {
       FileChecksumGenContext gen_context;
+      gen_context.file_name = file_path;
+      gen_context.requested_checksum_func_name = requested_func_name;
       std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
           file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context);
       *file_checksum_func_name = file_checksum_gen->Name();
@@ -488,10 +443,50 @@ TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) {
   DestroyAndRecreateExternalSSTFilesDir();
 }
 
+namespace {
+class VariousFileChecksumGenerator : public FileChecksumGenCrc32c {
+ public:
+  explicit VariousFileChecksumGenerator(const std::string& name)
+      : FileChecksumGenCrc32c({}), name_(name) {}
+
+  const char* Name() const override { return name_.c_str(); }
+
+  std::string GetChecksum() const override {
+    return FileChecksumGenCrc32c::GetChecksum() + "_" + name_;
+  }
+
+ private:
+  const std::string name_;
+};
+
+class VariousFileChecksumGenFactory : public FileChecksumGenFactory {
+ public:
+  std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+      const FileChecksumGenContext& context) override {
+    static RelaxedAtomic<int> counter{0};
+    if (Slice(context.requested_checksum_func_name).starts_with("Various")) {
+      return std::make_unique<VariousFileChecksumGenerator>(
+          context.requested_checksum_func_name);
+    } else if (context.requested_checksum_func_name.empty()) {
+      // Lacking a specific request, use a different function name for each
+      // result.
+      return std::make_unique<VariousFileChecksumGenerator>(
+          "Various" + std::to_string(counter.FetchAddRelaxed(1)));
+    } else {
+      return nullptr;
+    }
+  }
+
+  static const char* kClassName() { return "VariousFileChecksumGenFactory"; }
+  const char* Name() const override { return kClassName(); }
+};
+}  // namespace
+
 TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   Options old_options = CurrentOptions();
   Options options = CurrentOptions();
-  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  options.file_checksum_gen_factory =
+      std::make_shared<VariousFileChecksumGenFactory>();
   const ImmutableCFOptions ioptions(options);
   ChecksumVerifyHelper checksum_helper(options);
 
@@ -512,7 +507,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file1_info.largest_key, Key(1099));
   std::string file_checksum1, file_checksum_func_name1;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file1, &file_checksum1, &file_checksum_func_name1));
+      file1, &file_checksum1, &file_checksum_func_name1,
+      file1_info.file_checksum_func_name));
   ASSERT_EQ(file1_info.file_checksum, file_checksum1);
   ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name1);
 
@@ -531,7 +527,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file2_info.largest_key, Key(1299));
   std::string file_checksum2, file_checksum_func_name2;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file2, &file_checksum2, &file_checksum_func_name2));
+      file2, &file_checksum2, &file_checksum_func_name2,
+      file2_info.file_checksum_func_name));
   ASSERT_EQ(file2_info.file_checksum, file_checksum2);
   ASSERT_EQ(file2_info.file_checksum_func_name, file_checksum_func_name2);
 
@@ -550,7 +547,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file3_info.largest_key, Key(1499));
   std::string file_checksum3, file_checksum_func_name3;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file3, &file_checksum3, &file_checksum_func_name3));
+      file3, &file_checksum3, &file_checksum_func_name3,
+      file3_info.file_checksum_func_name));
   ASSERT_EQ(file3_info.file_checksum, file_checksum3);
   ASSERT_EQ(file3_info.file_checksum_func_name, file_checksum_func_name3);
 
@@ -569,7 +567,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file4_info.largest_key, Key(1799));
   std::string file_checksum4, file_checksum_func_name4;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file4, &file_checksum4, &file_checksum_func_name4));
+      file4, &file_checksum4, &file_checksum_func_name4,
+      file4_info.file_checksum_func_name));
   ASSERT_EQ(file4_info.file_checksum, file_checksum4);
   ASSERT_EQ(file4_info.file_checksum_func_name, file_checksum_func_name4);
 
@@ -588,7 +587,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file5_info.largest_key, Key(1999));
   std::string file_checksum5, file_checksum_func_name5;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file5, &file_checksum5, &file_checksum_func_name5));
+      file5, &file_checksum5, &file_checksum_func_name5,
+      file5_info.file_checksum_func_name));
   ASSERT_EQ(file5_info.file_checksum, file_checksum5);
   ASSERT_EQ(file5_info.file_checksum_func_name, file_checksum_func_name5);
 
@@ -607,7 +607,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file6_info.largest_key, Key(2199));
   std::string file_checksum6, file_checksum_func_name6;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file6, &file_checksum6, &file_checksum_func_name6));
+      file6, &file_checksum6, &file_checksum_func_name6,
+      file6_info.file_checksum_func_name));
   ASSERT_EQ(file6_info.file_checksum, file_checksum6);
   ASSERT_EQ(file6_info.file_checksum_func_name, file_checksum_func_name6);
 
@@ -677,18 +678,23 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   }
   ASSERT_OK(env_->FileExists(file2));
 
-  // Enable verify_file_checksum option
-  // No checksum information is provided, generate it when ingesting
-  std::vector<std::string> checksum, checksum_func;
-  s = AddFileWithFileChecksum({file3}, checksum, checksum_func, true, false,
-                              false, false);
+  // Enable verify_file_checksum option. No checksum information is provided,
+  // so it is generated when ingesting. The configured checksum factory will
+  // use a different function than before.
+  s = AddFileWithFileChecksum({file3}, {}, {}, true, false, false, false);
   ASSERT_OK(s) << s.ToString();
   std::vector<LiveFileMetaData> live_files2;
   dbfull()->GetLiveFilesMetaData(&live_files2);
   for (const auto& f : live_files2) {
     if (set1.find(f.name) == set1.end()) {
-      ASSERT_EQ(f.file_checksum, file_checksum3);
-      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name3);
+      // Recomputed checksum, different function
+      EXPECT_NE(f.file_checksum_func_name, file_checksum_func_name3);
+      std::string cur_checksum3, cur_checksum_func_name3;
+      ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+          dbname_ + f.name, &cur_checksum3, &cur_checksum_func_name3,
+          f.file_checksum_func_name));
+      EXPECT_EQ(f.file_checksum, cur_checksum3);
+      EXPECT_EQ(f.file_checksum_func_name, cur_checksum_func_name3);
       set1.insert(f.name);
     }
   }
@@ -702,8 +708,9 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_NOK(s) << s.ToString();
 
   // Does not enable verify_file_checksum options
-  // Checksum function name matches, store the checksum being ingested.
-  s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4},
+  // Checksum function name is recognized, so store the checksum being ingested.
+  std::string file_checksum_func_name4alt = "VariousABCD";
+  s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4alt},
                               false, false, false, false);
   ASSERT_OK(s) << s.ToString();
   std::vector<LiveFileMetaData> live_files3;
@@ -712,7 +719,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
     if (set1.find(f.name) == set1.end()) {
       ASSERT_FALSE(f.file_checksum == file_checksum4);
       ASSERT_EQ(f.file_checksum, "asd");
-      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4);
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4alt);
       set1.insert(f.name);
     }
   }
@@ -721,7 +728,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
 
   // enable verify_file_checksum options, DB enable checksum, and enable
   // write_global_seq. So the checksum stored is different from the one
-  // ingested due to the sequence number changes.
+  // ingested due to the sequence number changes. The checksum function name
+  // may also change since the checksum is recomputed.
   s = AddFileWithFileChecksum({file5}, {file_checksum5},
                               {file_checksum_func_name5}, true, false, false,
                               true);
@@ -730,11 +738,14 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   dbfull()->GetLiveFilesMetaData(&live_files4);
   for (const auto& f : live_files4) {
     if (set1.find(f.name) == set1.end()) {
+      // Recomputed checksum, different function
+      EXPECT_NE(f.file_checksum_func_name, file_checksum_func_name5);
       std::string cur_checksum5, cur_checksum_func_name5;
       ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-          dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5));
-      ASSERT_EQ(f.file_checksum, cur_checksum5);
-      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name5);
+          dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5,
+          f.file_checksum_func_name));
+      EXPECT_EQ(f.file_checksum, cur_checksum5);
+      EXPECT_EQ(f.file_checksum_func_name, cur_checksum_func_name5);
       set1.insert(f.name);
     }
   }
@@ -742,18 +753,22 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_OK(env_->FileExists(file5));
 
   // Does not enable verify_file_checksum options and also the ingested file
-  // checksum information is empty. DB will generate and store the checksum
-  // in Manifest.
-  std::vector<std::string> files_c6, files_name6;
-  s = AddFileWithFileChecksum({file6}, files_c6, files_name6, false, false,
-                              false, false);
+  // checksum information is empty. DB will generate and store file checksum
+  // in Manifest, which could be different from the previous invocation.
+  s = AddFileWithFileChecksum({file6}, {}, {}, false, false, false, false);
   ASSERT_OK(s) << s.ToString();
   std::vector<LiveFileMetaData> live_files6;
   dbfull()->GetLiveFilesMetaData(&live_files6);
   for (const auto& f : live_files6) {
     if (set1.find(f.name) == set1.end()) {
-      ASSERT_EQ(f.file_checksum, file_checksum6);
-      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name6);
+      // Recomputed checksum, different function
+      EXPECT_NE(f.file_checksum_func_name, file_checksum_func_name6);
+      std::string cur_checksum6, cur_checksum_func_name6;
+      ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+          dbname_ + f.name, &cur_checksum6, &cur_checksum_func_name6,
+          f.file_checksum_func_name));
+      EXPECT_EQ(f.file_checksum, cur_checksum6);
+      EXPECT_EQ(f.file_checksum_func_name, cur_checksum_func_name6);
       set1.insert(f.name);
     }
   }
@@ -1954,21 +1969,44 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
     SstFileWriter sst_file_writer(EnvOptions(), options);
     std::string file3 = sst_files_dir_ + "file3.sst";
     ASSERT_OK(sst_file_writer.Open(file3));
-    ASSERT_OK(sst_file_writer.Put("j", "j1"));
+    ASSERT_OK(sst_file_writer.Put("k", "k1"));
     ASSERT_OK(sst_file_writer.Put("m", "m1"));
     ExternalSstFileInfo file3_info;
     ASSERT_OK(sst_file_writer.Finish(&file3_info));
     files.push_back(std::move(file3));
   }
 
+  // This could be ingested to the same level as file3 and file4, but the
+  // greedy/simple overlap check relegates it to a later level
+  {
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    std::string file4 = sst_files_dir_ + "file4.sst";
+    ASSERT_OK(sst_file_writer.Open(file4));
+    ASSERT_OK(sst_file_writer.Put("j", "j1"));
+    ExternalSstFileInfo file4_info;
+    ASSERT_OK(sst_file_writer.Finish(&file4_info));
+    files.push_back(std::move(file4));
+  }
+
+  {
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    std::string file5 = sst_files_dir_ + "file5.sst";
+    ASSERT_OK(sst_file_writer.Open(file5));
+    ASSERT_OK(sst_file_writer.Put("i", "i3"));
+    ExternalSstFileInfo file5_info;
+    ASSERT_OK(sst_file_writer.Finish(&file5_info));
+    files.push_back(std::move(file5));
+  }
+
   IngestExternalFileOptions ifo;
   ifo.allow_global_seqno = false;
   ASSERT_NOK(db_->IngestExternalFile(files, ifo));
   ifo.allow_global_seqno = true;
   ASSERT_OK(db_->IngestExternalFile(files, ifo));
   ASSERT_EQ(Get("a"), "a1");
-  ASSERT_EQ(Get("i"), "i2");
+  ASSERT_EQ(Get("i"), "i3");
   ASSERT_EQ(Get("j"), "j1");
+  ASSERT_EQ(Get("k"), "k1");
   ASSERT_EQ(Get("m"), "m1");
 
   int total_keys = 0;
@@ -1979,10 +2017,11 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
   }
   ASSERT_OK(iter->status());
   delete iter;
-  ASSERT_EQ(total_keys, 4);
+  ASSERT_EQ(total_keys, 5);
 
   ASSERT_EQ(1, NumTableFilesAtLevel(6));
   ASSERT_EQ(2, NumTableFilesAtLevel(5));
+  ASSERT_EQ(2, NumTableFilesAtLevel(4));
 }
 
 class CompactionJobStatsCheckerForFilteredFiles : public EventListener {
@@ -2528,7 +2567,14 @@ TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) {
     options.default_write_temperature = Temperature::kHot;
     SstFileWriter sst_file_writer(EnvOptions(), options);
     options.level0_file_num_compaction_trigger = 2;
-    options.allow_ingest_behind = (mode == "ingest_behind");
+    bool cf_option = Random::GetTLSInstance()->OneIn(2);
+    SCOPED_TRACE(std::string("Use ") + (cf_option ? "CF" : "DB") +
+                 " option for ingest behind");
+    if (cf_option) {
+      options.cf_allow_ingest_behind = (mode == "ingest_behind");
+    } else {
+      options.allow_ingest_behind = (mode == "ingest_behind");
+    }
     Reopen(options);
     Defer destroyer([&]() { Destroy(options); });
 
@@ -2669,51 +2715,358 @@ TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) {
   }
 }
 
-TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevel) {
+// This tests an internal user's exact usage and expectation of the
+// IngestExternalFiles APIs to bulk load and replace files.
+TEST_F(ExternalSSTFileBasicTest,
+       AtomicReplaceColumnFamilyWithIngestedVersionKey) {
   Options options = GetDefaultOptions();
-
-  std::string file_path = sst_files_dir_ + std::to_string(1);
-  SstFileWriter sfw(EnvOptions(), options);
-
-  ASSERT_OK(sfw.Open(file_path));
-  ASSERT_OK(sfw.Put("b", "dontcare"));
-  ASSERT_OK(sfw.Finish());
-
-  // Test universal compaction + ingest with snapshot consistency
   options.create_if_missing = true;
   options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+  options.num_levels = 7;
+  options.disallow_memtable_writes = false;
+
   DestroyAndReopen(options);
-  {
-    const Snapshot* snapshot = db_->GetSnapshot();
-    ManagedSnapshot snapshot_guard(db_, snapshot);
-    IngestExternalFileOptions ifo;
-    ifo.fail_if_not_bottommost_level = true;
-    ifo.snapshot_consistency = true;
-    const Status s = db_->IngestExternalFile({file_path}, ifo);
-    ASSERT_TRUE(s.ok());
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+  std::string data_file_original = sst_files_dir_ + "data_original";
+  ASSERT_OK(sst_file_writer.Open(data_file_original));
+  ASSERT_OK(sst_file_writer.Put("ukey1", "uval1_orig"));
+  ASSERT_OK(sst_file_writer.Put("ukey2", "uval2_orig"));
+  ASSERT_OK(sst_file_writer.Finish());
+  ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(),
+                                    {data_file_original},
+                                    IngestExternalFileOptions()));
+
+  ASSERT_OK(Put("data_version", "v_original"));
+  ASSERT_OK(Flush());
+  std::string value;
+  ASSERT_OK(db_->Get(ReadOptions(), "data_version", &value));
+  ASSERT_EQ(value, "v_original");
+  ASSERT_OK(db_->Get(ReadOptions(), "ukey1", &value));
+  ASSERT_EQ(value, "uval1_orig");
+  ASSERT_OK(db_->Get(ReadOptions(), "ukey2", &value));
+  ASSERT_EQ(value, "uval2_orig");
+  // Set up a 1) data version key file on L0, and 2) a user data file on L6
+  // to test the initial transitioning to use `atomic_replace_range`.
+  ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+
+  // Test multiple cycles of replacing by atomically ingest a data file and a
+  // version key file while replace the whole range in the column family.
+  for (int i = 0; i < 10; i++) {
+    std::string version_file_path =
+        sst_files_dir_ + "version" + std::to_string(i);
+    ASSERT_OK(sst_file_writer.Open(version_file_path));
+    ASSERT_OK(sst_file_writer.Put("data_version", "v" + std::to_string(i)));
+    ASSERT_OK(sst_file_writer.Finish());
+
+    std::string file_path = sst_files_dir_ + std::to_string(i);
+    ASSERT_OK(sst_file_writer.Open(file_path));
+    ASSERT_OK(sst_file_writer.Put("ukey1", "uval1" + std::to_string(i)));
+    ASSERT_OK(sst_file_writer.Put("ukey2", "uval2" + std::to_string(i)));
+    ASSERT_OK(sst_file_writer.Finish());
+
+    IngestExternalFileArg arg;
+    arg.column_family = db_->DefaultColumnFamily();
+    arg.external_files = {version_file_path, file_path};
+    arg.atomic_replace_range = {{nullptr, nullptr}};
+    // Test both fail_if_not_bottomost_level: true and false
+    arg.options.fail_if_not_bottommost_level = i % 2 == 0;
+    arg.options.snapshot_consistency = false;
+    // Ingest 1) a new data version file and 2) a new user data file while erase
+    // the whole column family
+    Status s = db_->IngestExternalFiles({arg});
+    ASSERT_OK(s);
+
+    // Check ingestion result and the expected LSM shape:
+    // Two files on L6, 1) a data version file 2) a user data file.
+    ASSERT_OK(db_->Get(ReadOptions(), "ukey1", &value));
+    ASSERT_EQ(value, "uval1" + std::to_string(i));
+    ASSERT_OK(db_->Get(ReadOptions(), "ukey2", &value));
+    ASSERT_EQ(value, "uval2" + std::to_string(i));
+    ASSERT_OK(db_->Get(ReadOptions(), "data_version", &value));
+    ASSERT_EQ(value, "v" + std::to_string(i));
+    ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
   }
 
-  // Test level compaction
-  options.compaction_style = CompactionStyle::kCompactionStyleLevel;
-  options.num_levels = 2;
-  DestroyAndReopen(options);
-  ASSERT_OK(db_->Put(WriteOptions(), "a", "dontcare"));
-  ASSERT_OK(db_->Put(WriteOptions(), "c", "dontcare"));
-  ASSERT_OK(db_->Flush(FlushOptions()));
+  Close();
+}
 
-  ASSERT_OK(db_->Put(WriteOptions(), "b", "dontcare"));
-  ASSERT_OK(db_->Put(WriteOptions(), "d", "dontcare"));
-  ASSERT_OK(db_->Flush(FlushOptions()));
+TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
+  for (bool disallow_memtable : {false, true}) {
+    Options options = GetDefaultOptions();
 
-  {
-    CompactRangeOptions cro;
-    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
-    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    // First test with universal compaction
+    options.create_if_missing = true;
+    options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+    DestroyAndReopen(options);
 
-    IngestExternalFileOptions ifo;
-    ifo.fail_if_not_bottommost_level = true;
-    const Status s = db_->IngestExternalFile({file_path}, ifo);
-    ASSERT_TRUE(s.IsTryAgain());
+    // And a CF potentially disallowing memtable write
+    options.disallow_memtable_writes = disallow_memtable;
+    CreateColumnFamilies({"cf0"}, options);
+    ASSERT_EQ(db_->GetOptions(handles_[0]).disallow_memtable_writes,
+              disallow_memtable);
+
+    // Ingest with snapshot consistency
+    std::string file_path = sst_files_dir_ + std::to_string(1);
+    std::string file_path2 = sst_files_dir_ + std::to_string(2);
+    SstFileWriter sfw(EnvOptions(), options);
+
+    ASSERT_OK(sfw.Open(file_path));
+    ASSERT_OK(sfw.Put("b", "0"));
+    ASSERT_OK(sfw.Finish());
+
+    {
+      const Snapshot* snapshot = db_->GetSnapshot();
+      ManagedSnapshot snapshot_guard(db_.get(), snapshot);
+      IngestExternalFileOptions ifo;
+      ifo.fail_if_not_bottommost_level = true;
+      ifo.snapshot_consistency = true;
+      ASSERT_OK(db_->IngestExternalFile(handles_[0], {file_path}, ifo));
+    }
+    ASSERT_EQ(Get(0, "b"), "0");
+
+    // Test level compaction
+    options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+    options.num_levels = 2;
+    CreateColumnFamilies({"cf1"}, options);
+    ASSERT_EQ(db_->GetOptions(handles_[1]).disallow_memtable_writes,
+              disallow_memtable);
+
+    if (!disallow_memtable) {
+      ASSERT_OK(Put(1, "a", "1"));
+      ASSERT_OK(Put(1, "c", "3"));
+      ASSERT_OK(Flush(1));
+
+      ASSERT_OK(Put(1, "b", "2"));
+      ASSERT_OK(Put(1, "d", "4"));
+      ASSERT_OK(Flush(1));
+    } else {
+      // Memtable write disallowed
+      EXPECT_EQ(Put(1, "a", "1").code(), Status::Code::kInvalidArgument);
+
+      // Use ingestion to get to the same state as above
+      ASSERT_OK(sfw.Open(file_path2));
+      ASSERT_OK(sfw.Put("a", "1"));
+      ASSERT_OK(sfw.Put("c", "3"));
+      ASSERT_OK(sfw.Finish());
+      ASSERT_OK(db_->IngestExternalFile(handles_[1], {file_path2}, {}));
+
+      ASSERT_OK(sfw.Open(file_path2));
+      ASSERT_OK(sfw.Put("b", "2"));
+      ASSERT_OK(sfw.Put("d", "4"));
+      ASSERT_OK(sfw.Finish());
+      ASSERT_OK(db_->IngestExternalFile(handles_[1], {file_path2}, {}));
+    }
+    ASSERT_EQ(Get(1, "a"), "1");
+    ASSERT_EQ(Get(1, "b"), "2");
+    ASSERT_EQ(Get(1, "c"), "3");
+    ASSERT_EQ(Get(1, "d"), "4");
+
+    {
+      // Test fail_if_not_bottommost_level, which fails if there's any overlap
+      // anywhere, even with snapshot_consistency=false
+      IngestExternalFileOptions ifo;
+      ASSERT_FALSE(ifo.fail_if_not_bottommost_level);
+      ifo.fail_if_not_bottommost_level = true;
+      ifo.snapshot_consistency = false;
+      // Fails with overlap on earlier level
+      Status s = db_->IngestExternalFile(handles_[1], {file_path}, ifo);
+      ASSERT_EQ(s.code(), Status::Code::kTryAgain);
+
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr));
+
+      // Fails with overlap on last level
+      s = db_->IngestExternalFile(handles_[1], {file_path}, ifo);
+      ASSERT_EQ(s.code(), Status::Code::kTryAgain);
+
+      // No change to data
+      ASSERT_EQ(Get(1, "a"), "1");
+      ASSERT_EQ(Get(1, "b"), "2");
+      ASSERT_EQ(Get(1, "c"), "3");
+      ASSERT_EQ(Get(1, "d"), "4");
+    }
+
+    if (!disallow_memtable) {
+      // Test allow_blocking_flush=false (fail because of memtable overlap)
+      IngestExternalFileOptions ifo;
+      ASSERT_TRUE(ifo.allow_blocking_flush);
+      ifo.allow_blocking_flush = false;
+      ASSERT_OK(Put(1, "b", "42"));
+      Status s = db_->IngestExternalFile(handles_[1], {file_path}, ifo);
+      ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+      ASSERT_EQ(Get(1, "a"), "1");
+      ASSERT_EQ(Get(1, "b"), "42");
+      ASSERT_EQ(Get(1, "c"), "3");
+      ASSERT_EQ(Get(1, "d"), "4");
+
+      // Revert state
+      ASSERT_OK(Put(1, "b", "2"));
+      ASSERT_OK(Flush(1));
+    }
+
+    {
+      // Test atomic_replace_range
+      IngestExternalFileArg arg;
+      arg.column_family = handles_[1];
+      arg.external_files = {file_path};
+      arg.atomic_replace_range = {{"a", "zzz"}};
+
+      // start with some failure cases
+      // TODO: support snapshot consistency with tombstone file
+      ASSERT_TRUE(arg.options.snapshot_consistency);
+      Status s = db_->IngestExternalFiles({arg});
+      ASSERT_EQ(s.code(), Status::Code::kNotSupported);
+
+      ASSERT_EQ(Get(1, "a"), "1");
+      ASSERT_EQ(Get(1, "b"), "2");
+      ASSERT_EQ(Get(1, "c"), "3");
+      ASSERT_EQ(Get(1, "d"), "4");
+
+      arg.options.snapshot_consistency = false;
+      // Can usually be used with atomic_replace_range and
+      // snapshot_consistency=false, except it requires no input overlap
+      arg.options.fail_if_not_bottommost_level = true;
+
+      // one-sided ranges not yet supported
+      arg.atomic_replace_range = {{{}, "zzz"}};
+      s = db_->IngestExternalFiles({arg});
+      ASSERT_EQ(s.code(), Status::Code::kNotSupported);
+
+      arg.atomic_replace_range = {{"a", {}}};
+      s = db_->IngestExternalFiles({arg});
+      ASSERT_EQ(s.code(), Status::Code::kNotSupported);
+
+      // rejected because doesn't cover ingested file
+      arg.atomic_replace_range = {{"x", "z"}};
+      s = db_->IngestExternalFiles({arg});
+      ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+      // rejected because of partial file overlap
+      arg.atomic_replace_range = {{"a", "c"}};
+      s = db_->IngestExternalFiles({arg});
+      ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+      if (!disallow_memtable) {
+        // memtable overlap with replace range
+        ASSERT_OK(Put(1, "e", "5"));
+        arg.options.allow_blocking_flush = false;
+
+        // rejected because of memtable overlap
+        arg.atomic_replace_range = {{"a", "z"}};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // rejected because of memtable overlap
+        arg.atomic_replace_range = {{nullptr, nullptr}};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // FIXME: upper bound should be exclusive (DeleteRange semantics).
+        // currently rejected because of documented bug
+        arg.atomic_replace_range = {{"a", "e"}};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // work-around ensuring no memtable overlap
+        arg.atomic_replace_range = {{"a", "d2"}};
+        ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+        ASSERT_EQ(Get(1, "e"), "5");
+      } else {
+        // rejected because of partial file overlap
+        arg.atomic_replace_range = {{"b", "z"}};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // no memtable complications
+        arg.atomic_replace_range = {{"a", "z"}};
+        ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+        ASSERT_EQ(Get(1, "e"), "NOT_FOUND");
+      }
+      ASSERT_EQ(Get(1, "a"), "NOT_FOUND");
+      ASSERT_EQ(Get(1, "b"), "0");
+      ASSERT_EQ(Get(1, "c"), "NOT_FOUND");
+      ASSERT_EQ(Get(1, "d"), "NOT_FOUND");
+
+      // The single ingested file replaced everything (except perhaps memtable)
+      std::vector<LiveFileMetaData> live_files;
+      db_->GetLiveFilesMetaData(&live_files);
+      // One file in each CF
+      ASSERT_EQ(live_files.size(), 2);
+
+      ASSERT_OK(sfw.Open(file_path));
+      ASSERT_OK(sfw.Put("f", "6"));
+      ASSERT_OK(sfw.Finish());
+
+      // Another file
+      ASSERT_OK(sfw.Open(file_path2));
+      ASSERT_OK(sfw.Put("f", "7"));
+      ASSERT_OK(sfw.Put("g", "8"));
+      ASSERT_OK(sfw.Finish());
+
+      if (!disallow_memtable) {
+        // rejected because of memtable overlap with range
+        arg.atomic_replace_range = {{"e", "z"}};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // allow blocking flush of "e" (which is then replaced), and the file
+        // with just "b" is not replaced
+        arg.options.allow_blocking_flush = true;
+        ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+        ASSERT_EQ(Get(1, "b"), "0");
+        ASSERT_EQ(Get(1, "e"), "NOT_FOUND");
+        ASSERT_EQ(Get(1, "f"), "6");
+        ASSERT_EQ(Get(1, "g"), "NOT_FOUND");
+
+        // memtable overlap with replace range
+        ASSERT_OK(Put(1, "e", "5"));
+        arg.options.allow_blocking_flush = false;
+        arg.external_files = {file_path2};
+
+        // rejected because of memtable overlap
+        arg.atomic_replace_range = {{nullptr, nullptr}};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // Replace everything, including with memtable flush
+        arg.options.allow_blocking_flush = true;
+        ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+        ASSERT_EQ(Get(1, "b"), "NOT_FOUND");
+        ASSERT_EQ(Get(1, "e"), "NOT_FOUND");
+        ASSERT_EQ(Get(1, "f"), "7");
+        ASSERT_EQ(Get(1, "g"), "8");
+      } else {
+        arg.external_files = {file_path2, file_path};
+
+        // rejected because of overlap in files to ingest with fail_if_ = true
+        arg.atomic_replace_range = {{"e", "z"}};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kTryAgain);
+
+        arg.options.fail_if_not_bottommost_level = false;
+
+        // rejected because range doesn't cover ingested files
+        // FIXME: upper bound should be exclusive "g" instead
+        arg.atomic_replace_range = {{"e", "f2"}};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // Loaded into different levels, and the file with just "b" is not
+        // replaced
+        arg.atomic_replace_range = {{"e", "z"}};
+        ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+        ASSERT_EQ(Get(1, "b"), "0");
+        ASSERT_EQ(Get(1, "f"), "6");  // earlier file listed later to ingest
+        ASSERT_EQ(Get(1, "g"), "8");
+      }
+    }
   }
 }
 
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index a439189afa7e..7a379b9df790 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -11,6 +11,7 @@
 #include <unordered_set>
 #include <vector>
 
+#include "db/builder.h"
 #include "db/db_impl/db_impl.h"
 #include "db/version_edit.h"
 #include "file/file_util.h"
@@ -29,6 +30,7 @@ Status ExternalSstFileIngestionJob::Prepare(
     const std::vector<std::string>& external_files_paths,
     const std::vector<std::string>& files_checksums,
     const std::vector<std::string>& files_checksum_func_names,
+    const std::optional<RangeOpt>& atomic_replace_range,
     const Temperature& file_temperature, uint64_t next_file_number,
     SuperVersion* sv) {
   Status status;
@@ -41,6 +43,9 @@ Status ExternalSstFileIngestionJob::Prepare(
     status =
         GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv);
     if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to get ingested file info: %s: %s",
+                     file_path.c_str(), status.ToString().c_str());
       return status;
     }
 
@@ -80,33 +85,69 @@ Status ExternalSstFileIngestionJob::Prepare(
     std::sort(sorted_files.begin(), sorted_files.end(), file_range_checker_);
 
     for (size_t i = 0; i + 1 < num_files; i++) {
-      if (file_range_checker_.OverlapsWithPrev(sorted_files[i],
-                                               sorted_files[i + 1],
-                                               /* ranges_sorted= */ true)) {
+      if (file_range_checker_.Overlaps(*sorted_files[i], *sorted_files[i + 1],
+                                       /* known_sorted= */ true)) {
         files_overlap_ = true;
         break;
       }
     }
   }
 
-  if (ingestion_options_.ingest_behind && files_overlap_) {
-    return Status::NotSupported(
-        "Files with overlapping ranges cannot be ingested with ingestion "
-        "behind mode.");
+  if (atomic_replace_range.has_value()) {
+    atomic_replace_range_.emplace();
+
+    if (atomic_replace_range->start && atomic_replace_range->limit) {
+      // User keys to internal keys (with timestamps)
+      const size_t ts_sz = ucmp_->timestamp_size();
+      std::string start_with_ts, limit_with_ts;
+      auto [start, limit] = MaybeAddTimestampsToRange(
+          atomic_replace_range->start, atomic_replace_range->limit, ts_sz,
+          &start_with_ts, &limit_with_ts);
+      assert(start.has_value());
+      assert(limit.has_value());
+      atomic_replace_range_->smallest_internal_key.Set(
+          *start, kMaxSequenceNumber, kValueTypeForSeek);
+      atomic_replace_range_->largest_internal_key.Set(
+          *limit, kMaxSequenceNumber, kValueTypeForSeek);
+      // Check files to ingest against replace range
+      for (size_t i = 0; i < num_files; i++) {
+        if (!file_range_checker_.Contains(*atomic_replace_range_,
+                                          files_to_ingest_[i])) {
+          return Status::InvalidArgument(
+              "Atomic replace range does not contain all files");
+        }
+      }
+    } else {
+      // Currently if either bound is not present, both must be
+      assert(atomic_replace_range->start.has_value() == false);
+      assert(atomic_replace_range->limit.has_value() == false);
+      assert(atomic_replace_range_->smallest_internal_key.unset());
+      assert(atomic_replace_range_->largest_internal_key.unset());
+    }
   }
 
-  // Overlapping files need at least two different sequence numbers. If settings
-  // disables global seqno, ingestion will fail anyway, so fail fast in prepare.
-  if (!ingestion_options_.allow_global_seqno && files_overlap_) {
-    return Status::InvalidArgument(
-        "Global seqno is required, but disabled (because external files key "
-        "range overlaps).");
-  }
+  if (files_overlap_) {
+    if (ingestion_options_.ingest_behind) {
+      return Status::NotSupported(
+          "Files with overlapping ranges cannot be ingested with ingestion "
+          "behind mode.");
+    }
 
-  if (ucmp_->timestamp_size() > 0 && files_overlap_) {
-    return Status::NotSupported(
-        "Files with overlapping ranges cannot be ingested to column "
-        "family with user-defined timestamp enabled.");
+    // Overlapping files need at least two different sequence numbers. If
+    // settings disables global seqno, ingestion will fail anyway, so fail
+    // fast in prepare.
+    if (!ingestion_options_.allow_global_seqno &&
+        !ingestion_options_.allow_db_generated_files) {
+      return Status::InvalidArgument(
+          "Global seqno is required, but disabled (because external files key "
+          "range overlaps).");
+    }
+
+    if (ucmp_->timestamp_size() > 0) {
+      return Status::NotSupported(
+          "Files with overlapping ranges cannot be ingested to column "
+          "family with user-defined timestamp enabled.");
+    }
   }
 
   // Copy/Move external files into DB
@@ -123,6 +164,14 @@ Status ExternalSstFileIngestionJob::Prepare(
         // It is unsafe to assume application had sync the file and file
         // directory before ingest the file. For integrity of RocksDB we need
         // to sync the file.
+
+        // TODO(xingbo), We should in general be moving away from production
+        // uses of ReuseWritableFile (except explicitly for WAL recycling),
+        // ReopenWritableFile, and NewRandomRWFile. We should create a
+        // FileSystem::SyncFile/FsyncFile API that by default does the
+        // re-open+sync+close combo but can (a) be reused easily, and (b) be
+        // overridden to do that more cleanly, e.g. in EncryptedEnv.
+        // https://github.com/facebook/rocksdb/issues/13741
         std::unique_ptr<FSWritableFile> file_to_sync;
         Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
                                            &file_to_sync, nullptr);
@@ -153,6 +202,10 @@ Status ExternalSstFileIngestionJob::Prepare(
         ROCKS_LOG_INFO(db_options_.info_log,
                        "Tried to link file %s but it's not supported : %s",
                        path_outside_db.c_str(), status.ToString().c_str());
+      } else {
+        ROCKS_LOG_WARN(db_options_.info_log, "Failed to link file %s to %s: %s",
+                       path_outside_db.c_str(), path_inside_db.c_str(),
+                       status.ToString().c_str());
       }
     } else {
       f.copy_file = true;
@@ -177,6 +230,12 @@ Status ExternalSstFileIngestionJob::Prepare(
                         io_tracer_);
       // The destination of the copy will be ingested
       f.file_temperature = dst_temp;
+
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log, "Failed to copy file %s to %s: %s",
+                       path_outside_db.c_str(), path_inside_db.c_str(),
+                       status.ToString().c_str());
+      }
     } else {
       // Note: we currently assume that linking files does not cross
       // temperatures, so no need to change f.file_temperature
@@ -227,10 +286,6 @@ Status ExternalSstFileIngestionJob::Prepare(
     } else {
       need_generate_file_checksum_ = true;
     }
-    FileChecksumGenContext gen_context;
-    std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
-        db_options_.file_checksum_gen_factory->CreateFileChecksumGenerator(
-            gen_context);
     std::vector<std::string> generated_checksums;
     std::vector<std::string> generated_checksum_func_names;
     // Step 1: generate the checksum for ingested sst file.
@@ -238,11 +293,25 @@ Status ExternalSstFileIngestionJob::Prepare(
       for (size_t i = 0; i < files_to_ingest_.size(); i++) {
         std::string generated_checksum;
         std::string generated_checksum_func_name;
-        std::string requested_checksum_func_name;
+        std::string requested_checksum_func_name =
+            i < files_checksum_func_names.size() ? files_checksum_func_names[i]
+                                                 : "";
         // TODO: rate limit file reads for checksum calculation during file
         // ingestion.
         // TODO: plumb Env::IOActivity
         ReadOptions ro;
+        // Pass user-provided checksums through FileOptions when available.
+        // The caller may not have provided checksums at all (empty vectors),
+        // so we guard with a bounds check.
+        FileOptions fopts;
+        if (i < files_checksums.size()) {
+          fopts.file_checksum = files_checksums[i];
+        }
+        if (i < files_checksum_func_names.size()) {
+          fopts.file_checksum_func_name = files_checksum_func_names[i];
+        } else {
+          fopts.file_checksum_func_name = kNoFileChecksumFuncName;
+        }
         IOStatus io_s = GenerateOneFileChecksum(
             fs_.get(), files_to_ingest_[i].internal_file_path,
             db_options_.file_checksum_gen_factory.get(),
@@ -251,7 +320,7 @@ Status ExternalSstFileIngestionJob::Prepare(
             ingestion_options_.verify_checksums_readahead_size,
             db_options_.allow_mmap_reads, io_tracer_,
             db_options_.rate_limiter.get(), ro, db_options_.stats,
-            db_options_.clock);
+            db_options_.clock, fopts);
         if (!io_s.ok()) {
           status = io_s;
           ROCKS_LOG_WARN(db_options_.info_log,
@@ -281,40 +350,50 @@ Status ExternalSstFileIngestionJob::Prepare(
             if (files_checksum_func_names[i] !=
                 generated_checksum_func_names[i]) {
               status = Status::InvalidArgument(
-                  "Checksum function name does not match with the checksum "
-                  "function name of this DB");
-              ROCKS_LOG_WARN(
-                  db_options_.info_log,
-                  "Sst file checksum verification of file: %s failed: %s",
-                  external_files_paths[i].c_str(), status.ToString().c_str());
+                  "DB file checksum gen factory " +
+                  std::string(db_options_.file_checksum_gen_factory->Name()) +
+                  " generated checksum function name " +
+                  generated_checksum_func_names[i] + " for file " +
+                  external_files_paths[i] +
+                  " which does not match requested/provided " +
+                  files_checksum_func_names[i]);
               break;
             }
             if (files_checksums[i] != generated_checksums[i]) {
               status = Status::Corruption(
-                  "Ingested checksum does not match with the generated "
-                  "checksum");
-              ROCKS_LOG_WARN(
-                  db_options_.info_log,
-                  "Sst file checksum verification of file: %s failed: %s",
-                  files_to_ingest_[i].internal_file_path.c_str(),
-                  status.ToString().c_str());
+                  "Checksum verification mismatch for ingestion file " +
+                  external_files_paths[i] + " using function " +
+                  generated_checksum_func_names[i] + ". Expected: " +
+                  Slice(files_checksums[i]).ToString(/*hex=*/true) +
+                  " Computed: " +
+                  Slice(generated_checksums[i]).ToString(/*hex=*/true));
               break;
             }
           }
         } else {
-          // If verify_file_checksum is not enabled, we only verify the
-          // checksum function name. If it does not match, fail the ingestion.
-          // If matches, we trust the ingested checksum information and store
-          // in the Manifest.
+          // If verify_file_checksum is not enabled, we only verify the factory
+          // recognizes the checksum function name. If it does not match, fail
+          // the ingestion. If matches, we trust the ingested checksum
+          // information and store in the Manifest.
           for (size_t i = 0; i < files_to_ingest_.size(); i++) {
-            if (files_checksum_func_names[i] != file_checksum_gen->Name()) {
+            FileChecksumGenContext gen_context;
+            gen_context.file_name = files_to_ingest_[i].internal_file_path;
+            gen_context.requested_checksum_func_name =
+                files_checksum_func_names[i];
+            auto file_checksum_gen =
+                db_options_.file_checksum_gen_factory
+                    ->CreateFileChecksumGenerator(gen_context);
+
+            if (file_checksum_gen == nullptr ||
+                files_checksum_func_names[i] != file_checksum_gen->Name()) {
               status = Status::InvalidArgument(
-                  "Checksum function name does not match with the checksum "
-                  "function name of this DB");
-              ROCKS_LOG_WARN(
-                  db_options_.info_log,
-                  "Sst file checksum verification of file: %s failed: %s",
-                  external_files_paths[i].c_str(), status.ToString().c_str());
+                  "Checksum function name " + files_checksum_func_names[i] +
+                  " for file " + external_files_paths[i] +
+                  " not recognized by DB checksum gen factory" +
+                  db_options_.file_checksum_gen_factory->Name() +
+                  (file_checksum_gen ? (" Returned function " +
+                                        std::string(file_checksum_gen->Name()))
+                                     : ""));
               break;
             }
             files_to_ingest_[i].file_checksum = files_checksums[i];
@@ -329,12 +408,11 @@ Status ExternalSstFileIngestionJob::Prepare(
         status = Status::InvalidArgument(
             "The checksum information of ingested sst files are nonempty and "
             "the size of checksums or the size of the checksum function "
-            "names "
-            "does not match with the number of ingested sst files");
-        ROCKS_LOG_WARN(
-            db_options_.info_log,
-            "The ingested sst files checksum information is incomplete: %s",
-            status.ToString().c_str());
+            "names does not match with the number of ingested sst files");
+      }
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log, "Ingestion failed: %s",
+                       status.ToString().c_str());
       }
     }
   }
@@ -359,9 +437,9 @@ void ExternalSstFileIngestionJob::DivideInputFilesIntoBatches() {
 
   file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true);
   for (auto& file : files_to_ingest_) {
-    if (file_range_checker_.OverlapsWithPrev(&file_batches_to_ingest_.back(),
-                                             &file,
-                                             /* ranges_sorted= */ false)) {
+    if (!file_batches_to_ingest_.back().unset() &&
+        file_range_checker_.Overlaps(file_batches_to_ingest_.back(), file,
+                                     /* known_sorted= */ false)) {
       file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true);
     }
     file_batches_to_ingest_.back().AddFile(&file, file_range_checker_);
@@ -370,14 +448,37 @@ void ExternalSstFileIngestionJob::DivideInputFilesIntoBatches() {
 
 Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
                                                SuperVersion* super_version) {
-  size_t n = files_to_ingest_.size();
-  autovector<UserKeyRange> ranges;
-  ranges.reserve(n);
-  for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
-    ranges.emplace_back(file_to_ingest.start_ukey, file_to_ingest.limit_ukey);
-  }
-  Status status = cfd_->RangesOverlapWithMemtables(
-      ranges, super_version, db_options_.allow_data_in_errors, flush_needed);
+  Status status;
+  if (atomic_replace_range_.has_value() && atomic_replace_range_->unset()) {
+    // For replacing whole CF, we can simply check whether memtable is empty
+    *flush_needed = !super_version->mem->IsEmpty();
+  } else {
+    autovector<UserKeyRange> ranges;
+    if (atomic_replace_range_.has_value()) {
+      assert(!atomic_replace_range_->smallest_internal_key.unset());
+      assert(!atomic_replace_range_->largest_internal_key.unset());
+      // NOTE: we already checked in Prepare() that the atomic_replace_range
+      // covers all the files_to_ingest
+      // FIXME: need to make upper bound key exclusive (not easy here because
+      // the existing internal APIs deal in inclusive upper bound user keys)
+      ranges.emplace_back(
+          atomic_replace_range_->smallest_internal_key.user_key(),
+          atomic_replace_range_->largest_internal_key.user_key());
+    } else {
+      ranges.reserve(files_to_ingest_.size());
+      for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+        ranges.emplace_back(file_to_ingest.start_ukey,
+                            file_to_ingest.limit_ukey);
+      }
+    }
+    status = cfd_->RangesOverlapWithMemtables(
+        ranges, super_version, db_options_.allow_data_in_errors, flush_needed);
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to check ranges overlap with memtables: %s",
+                     status.ToString().c_str());
+    }
+  }
   if (status.ok() && *flush_needed) {
     if (!ingestion_options_.allow_blocking_flush) {
       status = Status::InvalidArgument("External file requires flush");
@@ -411,6 +512,9 @@ Status ExternalSstFileIngestionJob::Run() {
   bool need_flush = false;
   status = NeedsFlush(&need_flush, super_version);
   if (!status.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to check if flush is needed: %s",
+                   status.ToString().c_str());
     return status;
   }
   if (need_flush) {
@@ -430,15 +534,61 @@ Status ExternalSstFileIngestionJob::Run() {
   // the only active writer, and hence they are equal
   SequenceNumber last_seqno = versions_->LastSequence();
   edit_.SetColumnFamily(cfd_->GetID());
-  // The levels that the files will be ingested into
 
+  if (atomic_replace_range_.has_value()) {
+    auto* vstorage = super_version->current->storage_info();
+    if (atomic_replace_range_->unset()) {
+      if (cfd_->compaction_picker()->IsCompactionInProgress()) {
+        return Status::InvalidArgument(
+            "Atomic replace range (full) overlaps with pending compaction");
+      }
+      for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
+        for (auto file : vstorage->LevelFiles(lvl)) {
+          // Set up to delete file to be replaced
+          edit_.DeleteFile(lvl, file->fd.GetNumber());
+        }
+      }
+    } else {
+      assert(!atomic_replace_range_->smallest_internal_key.unset());
+      assert(!atomic_replace_range_->largest_internal_key.unset());
+      for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
+        if (cfd_->RangeOverlapWithCompaction(
+                atomic_replace_range_->smallest_internal_key.user_key(),
+                atomic_replace_range_->largest_internal_key.user_key(), lvl)) {
+          return Status::InvalidArgument(
+              "Atomic replace range overlaps with pending compaction");
+        }
+        for (auto file : vstorage->LevelFiles(lvl)) {
+          if (file_range_checker_.Overlaps(*atomic_replace_range_,
+                                           file->smallest, file->largest)) {
+            if (file_range_checker_.Contains(*atomic_replace_range_,
+                                             file->smallest, file->largest)) {
+              // Set up to delete file to be replaced
+              edit_.DeleteFile(lvl, file->fd.GetNumber());
+            } else {
+              // TODO: generate and ingest a tombstone file also
+              return Status::InvalidArgument(
+                  "Atomic replace range partially overlaps with existing file");
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Find levels to ingest into
   std::optional<int> prev_batch_uppermost_level;
+  // batches at the front of file_batches_to_ingest_ contains older updates and
+  // are placed in smaller levels.
   for (auto& batch : file_batches_to_ingest_) {
     int batch_uppermost_level = 0;
     status = AssignLevelsForOneBatch(batch, super_version, force_global_seqno,
                                      &last_seqno, &batch_uppermost_level,
                                      prev_batch_uppermost_level);
     if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to assign levels for one batch: %s",
+                     status.ToString().c_str());
       return status;
     }
 
@@ -481,8 +631,19 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
                                 &largest_parsed, false /* log_err_key */);
     }
     if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log, "Failed to parse internal key: %s",
+                     status.ToString().c_str());
       return status;
     }
+
+    // If any ingested file overlaps with the DB, it will fail here.
+    if (ingestion_options_.allow_db_generated_files && assigned_seqno != 0) {
+      return Status::InvalidArgument(
+          "An ingested file overlaps with existing data in the DB and has been "
+          "assigned a non-zero sequence number, which is not allowed when "
+          "'allow_db_generated_files' is enabled.");
+    }
+
     if (smallest_parsed.sequence == 0 && assigned_seqno != 0) {
       UpdateInternalKey(file->smallest_internal_key.rep(), assigned_seqno,
                         smallest_parsed.type);
@@ -494,6 +655,10 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
 
     status = AssignGlobalSeqnoForIngestedFile(file, assigned_seqno);
     if (!status.ok()) {
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "Failed to assign global sequence number for ingested file: %s",
+          status.ToString().c_str());
       return status;
     }
     TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
@@ -501,11 +666,14 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
     assert(assigned_seqno == 0 || assigned_seqno == *last_seqno + 1);
     if (assigned_seqno > *last_seqno) {
       *last_seqno = assigned_seqno;
-      ++consumed_seqno_count_;
     }
+    max_assigned_seqno_ = std::max(max_assigned_seqno_, assigned_seqno);
 
     status = GenerateChecksumForIngestedFile(file);
     if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to generate checksum for ingested file: %s",
+                     status.ToString().c_str());
       return status;
     }
 
@@ -518,34 +686,39 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
       current_time = oldest_ancester_time =
           static_cast<uint64_t>(temp_current_time);
     }
-    uint64_t tail_size = 0;
-    bool contain_no_data_blocks = file->table_properties.num_entries > 0 &&
-                                  (file->table_properties.num_entries ==
-                                   file->table_properties.num_range_deletions);
-    if (file->table_properties.tail_start_offset > 0 ||
-        contain_no_data_blocks) {
-      uint64_t file_size = file->fd.GetFileSize();
-      assert(file->table_properties.tail_start_offset <= file_size);
-      tail_size = file_size - file->table_properties.tail_start_offset;
-    }
+    uint64_t tail_size = FileMetaData::CalculateTailSize(
+        file->fd.GetFileSize(), file->table_properties);
 
     bool marked_for_compaction =
         file->table_properties.num_range_deletions == 1 &&
         (file->table_properties.num_entries ==
          file->table_properties.num_range_deletions);
+    SequenceNumber smallest_seqno = file->assigned_seqno;
+    SequenceNumber largest_seqno = file->assigned_seqno;
+    if (ingestion_options_.allow_db_generated_files) {
+      assert(file->assigned_seqno == 0);
+      assert(file->smallest_seqno != kMaxSequenceNumber);
+      assert(file->largest_seqno != kMaxSequenceNumber);
+      smallest_seqno = file->smallest_seqno;
+      largest_seqno = file->largest_seqno;
+      max_assigned_seqno_ = std::max(max_assigned_seqno_, file->largest_seqno);
+    }
     FileMetaData f_metadata(
         file->fd.GetNumber(), file->fd.GetPathId(), file->fd.GetFileSize(),
-        file->smallest_internal_key, file->largest_internal_key,
-        file->assigned_seqno, file->assigned_seqno, false,
-        file->file_temperature, kInvalidBlobFileNumber, oldest_ancester_time,
-        current_time,
+        file->smallest_internal_key, file->largest_internal_key, smallest_seqno,
+        largest_seqno, false, file->file_temperature, kInvalidBlobFileNumber,
+        oldest_ancester_time, current_time,
         ingestion_options_.ingest_behind
             ? kReservedEpochNumberForFileIngestedBehind
-            : cfd_->NewEpochNumber(),
+            : cfd_->NewEpochNumber(),  // orders files ingested to L0
         file->file_checksum, file->file_checksum_func_name, file->unique_id, 0,
-        tail_size, file->user_defined_timestamps_persisted);
+        tail_size, file->user_defined_timestamps_persisted, "", "");
     f_metadata.temperature = file->file_temperature;
     f_metadata.marked_for_compaction = marked_for_compaction;
+    // Extract min/max timestamps from table properties for UDT support.
+    // This ensures ingested files have proper timestamp ranges in FileMetaData,
+    // similar to files created by flush and compaction.
+    ExtractTimestampFromTableProperties(file->table_properties, &f_metadata);
     edit_.AddFile(file->picked_level, f_metadata);
 
     *batch_uppermost_level =
@@ -593,15 +766,13 @@ void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() {
                             cfd_->ioptions().compaction_style),
         LLONG_MAX /* max compaction bytes, not applicable */,
         0 /* output path ID, not applicable */, mutable_cf_options.compression,
-        mutable_cf_options.compression_opts,
-        mutable_cf_options.default_write_temperature,
+        mutable_cf_options.compression_opts, Temperature::kUnknown,
         0 /* max_subcompaction, not applicable */,
         {} /* grandparents, not applicable */,
         std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
-        false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */,
-        false /* is deletion compaction, not applicable */,
-        files_overlap_ /* l0_files_might_overlap, not applicable */,
-        CompactionReason::kExternalSstIngestion));
+        CompactionReason::kExternalSstIngestion, "" /* trim_ts */,
+        -1 /* score, not applicable */,
+        files_overlap_ /* l0_files_might_overlap, not applicable */));
   }
 }
 
@@ -689,7 +860,6 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
     // We failed to add the files to the database
     // remove all the files we copied
     DeleteInternalFiles();
-    consumed_seqno_count_ = 0;
     files_overlap_ = false;
   } else if (status.ok() && ingestion_options_.move_files) {
     // The files were moved and added successfully, remove original file links
@@ -732,6 +902,10 @@ Status ExternalSstFileIngestionJob::ResetTableReader(
   Status status =
       fs_->NewRandomAccessFile(external_file, fo, &sst_file, nullptr);
   if (!status.ok()) {
+    ROCKS_LOG_WARN(
+        db_options_.info_log,
+        "Failed to create random access file for external file %s: %s",
+        external_file.c_str(), status.ToString().c_str());
     return status;
   }
   Temperature updated_temp = sst_file->GetTemperature();
@@ -750,7 +924,8 @@ Status ExternalSstFileIngestionJob::ResetTableReader(
       ro,
       TableReaderOptions(
           cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
-          env_options_, cfd_->internal_comparator(),
+          sv->mutable_cf_options.compression_manager.get(), env_options_,
+          cfd_->internal_comparator(),
           sv->mutable_cf_options.block_protection_bytes_per_key,
           /*skip_filters*/ false, /*immortal*/ false,
           /*force_direct_prefetch*/ false, /*level*/ -1,
@@ -853,6 +1028,10 @@ Status ExternalSstFileIngestionJob::SanityCheckTableProperties(
     // user_defined_timestamps_persisted flag for the file.
     file_to_ingest->user_defined_timestamps_persisted = false;
   } else if (!s.ok()) {
+    ROCKS_LOG_WARN(
+        db_options_.info_log,
+        "ValidateUserDefinedTimestampsOptions failed for external file %s: %s",
+        external_file.c_str(), s.ToString().c_str());
     return s;
   }
 
@@ -877,6 +1056,9 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
   Status status = fs_->GetFileSize(external_file, IOOptions(),
                                    &file_to_ingest->file_size, nullptr);
   if (!status.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to get file size for external file %s: %s",
+                   external_file.c_str(), status.ToString().c_str());
     return status;
   }
 
@@ -893,15 +1075,52 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
                             /*user_defined_timestamps_persisted=*/true, sv,
                             file_to_ingest, &table_reader);
   if (!status.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to reset table reader for external file %s: %s",
+                   external_file.c_str(), status.ToString().c_str());
     return status;
   }
 
   status = SanityCheckTableProperties(external_file, new_file_number, sv,
                                       file_to_ingest, &table_reader);
   if (!status.ok()) {
+    ROCKS_LOG_WARN(
+        db_options_.info_log,
+        "Failed to sanity check table properties for external file %s: %s",
+        external_file.c_str(), status.ToString().c_str());
     return status;
   }
 
+  const bool allow_data_in_errors = db_options_.allow_data_in_errors;
+  ParsedInternalKey key;
+  if (ingestion_options_.allow_db_generated_files) {
+    // We are ingesting a DB generated SST file for which we don't reassign
+    // sequence numbers. We need its smallest sequence number and largest
+    // sequence number for FileMetaData.
+    Status seqno_status = GetSeqnoBoundaryForFile(
+        table_reader.get(), sv, file_to_ingest, allow_data_in_errors);
+
+    if (!seqno_status.ok()) {
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "Failed to get sequence number boundary for external file %s: %s",
+          external_file.c_str(), seqno_status.ToString().c_str());
+      return seqno_status;
+    }
+    assert(file_to_ingest->smallest_seqno <= file_to_ingest->largest_seqno);
+    assert(file_to_ingest->largest_seqno < kMaxSequenceNumber);
+  } else {
+    SequenceNumber largest_seqno =
+        table_reader.get()->GetTableProperties()->key_largest_seqno;
+    // UINT64_MAX means unknown and the file is generated before table property
+    // `key_largest_seqno` is introduced.
+    if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
+      return Status::Corruption(
+          "External file has non zero largest sequence number " +
+          std::to_string(largest_seqno));
+    }
+  }
+
   if (ingestion_options_.verify_checksums_before_ingest) {
     // If customized readahead size is needed, we can pass a user option
     // all the way to here. Right now we just rely on the default readahead
@@ -913,11 +1132,13 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
     status = table_reader->VerifyChecksum(
         ro, TableReaderCaller::kExternalSSTIngestion);
     if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to verify checksum for table reader: %s",
+                     status.ToString().c_str());
       return status;
     }
   }
 
-  ParsedInternalKey key;
   // TODO: plumb Env::IOActivity, Env::IOPriority
   ReadOptions ro;
   ro.fill_cache = ingestion_options_.fill_cache;
@@ -926,7 +1147,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
       /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
 
   // Get first (smallest) and last (largest) key from file.
-  bool allow_data_in_errors = db_options_.allow_data_in_errors;
   iter->SeekToFirst();
   if (iter->Valid()) {
     Status pik_status =
@@ -935,7 +1155,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
       return Status::Corruption("Corrupted key in external file. ",
                                 pik_status.getState());
     }
-    if (key.sequence != 0) {
+    if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
       return Status::Corruption("External file has non zero sequence number");
     }
     file_to_ingest->smallest_internal_key.SetFrom(key);
@@ -972,41 +1192,13 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
       return Status::Corruption("Corrupted key in external file. ",
                                 pik_status.getState());
     }
-    if (key.sequence != 0) {
+    if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
       return Status::Corruption("External file has non zero sequence number");
     }
     file_to_ingest->largest_internal_key.SetFrom(key);
   } else if (!iter->status().ok()) {
     return iter->status();
   }
-  SequenceNumber largest_seqno =
-      table_reader.get()->GetTableProperties()->key_largest_seqno;
-  // UINT64_MAX means unknown and the file is generated before table property
-  // `key_largest_seqno` is introduced.
-  if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
-    return Status::Corruption(
-        "External file has non zero largest sequence number " +
-        std::to_string(largest_seqno));
-  }
-  if (ingestion_options_.allow_db_generated_files &&
-      largest_seqno == UINT64_MAX) {
-    // Need to verify that all keys have seqno zero.
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      Status pik_status =
-          ParseInternalKey(iter->key(), &key, allow_data_in_errors);
-      if (!pik_status.ok()) {
-        return Status::Corruption("Corrupted key in external file. ",
-                                  pik_status.getState());
-      }
-      if (key.sequence != 0) {
-        return Status::NotSupported(
-            "External file has a key with non zero sequence number.");
-      }
-    }
-    if (!iter->status().ok()) {
-      return iter->status();
-    }
-  }
 
   std::unique_ptr<InternalIterator> range_del_iter(
       table_reader->NewRangeTombstoneIterator(ro));
@@ -1021,7 +1213,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
         return Status::Corruption("Corrupted key in external file. ",
                                   pik_status.getState());
       }
-      if (key.sequence != 0) {
+      if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
         return Status::Corruption(
             "External file has a range deletion with non zero sequence "
             "number.");
@@ -1069,12 +1261,14 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
   const size_t ts_sz = ucmp_->timestamp_size();
   assert(!prev_batch_uppermost_level.has_value() ||
          prev_batch_uppermost_level.value() < cfd_->NumberLevels());
-  bool must_assign_to_l0 = prev_batch_uppermost_level.has_value() &&
-                           prev_batch_uppermost_level.value() == 0;
-  if (force_global_seqno || files_overlap_ ||
-      compaction_style == kCompactionStyleFIFO || must_assign_to_l0) {
+  bool must_assign_to_l0 = (prev_batch_uppermost_level.has_value() &&
+                            prev_batch_uppermost_level.value() == 0) ||
+                           compaction_style == kCompactionStyleFIFO;
+
+  if (force_global_seqno || (!ingestion_options_.allow_db_generated_files &&
+                             (files_overlap_ || must_assign_to_l0))) {
     *assigned_seqno = last_seqno + 1;
-    if (compaction_style == kCompactionStyleFIFO || must_assign_to_l0) {
+    if (must_assign_to_l0) {
       assert(ts_sz == 0);
       file_to_ingest->picked_level = 0;
       if (ingestion_options_.fail_if_not_bottommost_level &&
@@ -1095,15 +1289,29 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
   ro.total_order_seek = true;
   int target_level = 0;
   auto* vstorage = cfd_->current()->storage_info();
-  assert(!must_assign_to_l0);
-  int exclusive_end_level = prev_batch_uppermost_level.has_value()
-                                ? prev_batch_uppermost_level.value()
-                                : cfd_->NumberLevels();
+  assert(!must_assign_to_l0 || ingestion_options_.allow_db_generated_files);
+  int assigned_level_exclusive_end = cfd_->NumberLevels();
+  if (must_assign_to_l0) {
+    assigned_level_exclusive_end = 0;
+  } else if (prev_batch_uppermost_level.has_value()) {
+    assigned_level_exclusive_end = prev_batch_uppermost_level.value();
+  }
 
-  for (int lvl = 0; lvl < exclusive_end_level; lvl++) {
+  // When ingesting db generated files, we require that ingested files do not
+  // overlap with any file in the DB. So we need to check all levels.
+  int overlap_checking_exclusive_end =
+      ingestion_options_.allow_db_generated_files
+          ? cfd_->NumberLevels()
+          : assigned_level_exclusive_end;
+  for (int lvl = 0; lvl < overlap_checking_exclusive_end; lvl++) {
     if (lvl > 0 && lvl < vstorage->base_level()) {
       continue;
     }
+    if (lvl < assigned_level_exclusive_end &&
+        atomic_replace_range_.has_value()) {
+      target_level = lvl;
+      continue;
+    }
     if (cfd_->RangeOverlapWithCompaction(file_to_ingest->start_ukey,
                                          file_to_ingest->limit_ukey, lvl)) {
       // We must use L0 or any level higher than `lvl` to be able to overwrite
@@ -1118,6 +1326,9 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
           ro, env_options_, file_to_ingest->start_ukey,
           file_to_ingest->limit_ukey, lvl, &overlap_with_level);
       if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Failed to check overlap with level iterator: %s",
+                       status.ToString().c_str());
         return status;
       }
       if (overlap_with_level) {
@@ -1131,7 +1342,8 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
 
     // We don't overlap with any keys in this level, but we still need to check
     // if our file can fit in it
-    if (IngestedFileFitInLevel(file_to_ingest, lvl)) {
+    if (lvl < assigned_level_exclusive_end &&
+        IngestedFileFitInLevel(file_to_ingest, lvl)) {
       target_level = lvl;
     }
   }
@@ -1140,8 +1352,9 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
       target_level < cfd_->NumberLevels() - 1) {
     status = Status::TryAgain(
         "Files cannot be ingested to Lmax. Please make sure key range of Lmax "
-        "and ongoing compaction's output to Lmax"
-        "does not overlap with files to ingest.");
+        "and ongoing compaction's output to Lmax does not overlap with files "
+        "to ingest. Input files overlapping with each other can cause some "
+        "file to be assigned to non Lmax level.");
     return status;
   }
 
@@ -1162,16 +1375,13 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
     }
   }
 
-  if (ingestion_options_.allow_db_generated_files && *assigned_seqno != 0) {
-    return Status::InvalidArgument(
-        "An ingested file is assigned to a non-zero sequence number, which is "
-        "incompatible with ingestion option allow_db_generated_files.");
-  }
   return status;
 }
 
 Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
     IngestedFileInfo* file_to_ingest) {
+  assert(!atomic_replace_range_.has_value());
+
   auto* vstorage = cfd_->current()->storage_info();
   // First, check if new files fit in the last level
   int last_lvl = cfd_->NumberLevels() - 1;
@@ -1181,13 +1391,13 @@ Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
         "at the last level!");
   }
 
-  // Second, check if despite allow_ingest_behind=true we still have 0 seqnums
-  // at some upper level
+  // Second, check if despite cf_allow_ingest_behind=true we still have 0
+  // seqnums at some upper level
   for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
     for (auto file : vstorage->LevelFiles(lvl)) {
       if (file->fd.smallest_seqno == 0) {
         return Status::InvalidArgument(
-            "Can't ingest_behind file as despite allow_ingest_behind=true "
+            "Can't ingest_behind file as despite cf_allow_ingest_behind=true "
             "there are files with 0 seqno in database at upper levels!");
       }
     }
@@ -1199,8 +1409,12 @@ Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
 
 Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
     IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
+  if (ingestion_options_.allow_db_generated_files) {
+    assert(seqno == 0);
+    assert(file_to_ingest->original_seqno == 0);
+  }
   if (file_to_ingest->original_seqno == seqno) {
-    // This file already have the correct global seqno
+    // This file already has the correct global seqno.
     return Status::OK();
   } else if (!ingestion_options_.allow_global_seqno) {
     return Status::InvalidArgument("Global seqno is required, but disabled");
@@ -1227,6 +1441,14 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
       PutFixed64(&seqno_val, seqno);
       status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val,
                             IOOptions(), nullptr);
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Failed to write global seqno to %s: %s",
+                       file_to_ingest->internal_file_path.c_str(),
+                       status.ToString().c_str());
+        return status;
+      }
+
       if (status.ok()) {
         TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
         status = SyncIngestedFile(fsptr.get());
@@ -1243,6 +1465,11 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
         return status;
       }
     } else if (!status.IsNotSupported()) {
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "Failed to open ingested file %s for random read/write: %s",
+          file_to_ingest->internal_file_path.c_str(),
+          status.ToString().c_str());
       return status;
     }
   }
@@ -1267,14 +1494,19 @@ IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile(
   // TODO: rate limit file reads for checksum calculation during file ingestion.
   // TODO: plumb Env::IOActivity
   ReadOptions ro;
+  FileOptions gen_fopts;
+  gen_fopts.file_checksum_func_name = kNoFileChecksumFuncName;
   IOStatus io_s = GenerateOneFileChecksum(
       fs_.get(), file_to_ingest->internal_file_path,
       db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name,
       &file_checksum, &file_checksum_func_name,
       ingestion_options_.verify_checksums_readahead_size,
       db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(),
-      ro, db_options_.stats, db_options_.clock);
+      ro, db_options_.stats, db_options_.clock, gen_fopts);
   if (!io_s.ok()) {
+    ROCKS_LOG_WARN(
+        db_options_.info_log, "Failed to generate checksum for %s: %s",
+        file_to_ingest->internal_file_path.c_str(), io_s.ToString().c_str());
     return io_s;
   }
   file_to_ingest->file_checksum = std::move(file_checksum);
@@ -1314,4 +1546,91 @@ Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
   }
 }
 
+Status ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile(
+    TableReader* table_reader, SuperVersion* sv,
+    IngestedFileInfo* file_to_ingest, bool allow_data_in_errors) {
+  const auto tp = table_reader->GetTableProperties();
+  const bool has_largest_seqno = tp->HasKeyLargestSeqno();
+  SequenceNumber largest_seqno = tp->key_largest_seqno;
+  if (has_largest_seqno) {
+    file_to_ingest->largest_seqno = largest_seqno;
+    if (largest_seqno == 0) {
+      file_to_ingest->smallest_seqno = 0;
+      return Status::OK();
+    }
+    if (tp->HasKeySmallestSeqno()) {
+      file_to_ingest->smallest_seqno = tp->key_smallest_seqno;
+      return Status::OK();
+    }
+  }
+
+  // For older SST files they may not be recorded in table properties, so
+  // we scan the file to find out.
+  TEST_SYNC_POINT(
+      "ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile:FileScan");
+  SequenceNumber smallest_seqno = kMaxSequenceNumber;
+  SequenceNumber largest_seqno_from_iter = 0;
+  ReadOptions ro;
+  ro.fill_cache = ingestion_options_.fill_cache;
+  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+  ParsedInternalKey key;
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    Status pik_status =
+        ParseInternalKey(iter->key(), &key, allow_data_in_errors);
+    if (!pik_status.ok()) {
+      return Status::Corruption("Corrupted key in external file. ",
+                                pik_status.getState());
+    }
+    smallest_seqno = std::min(smallest_seqno, key.sequence);
+    largest_seqno_from_iter = std::max(largest_seqno_from_iter, key.sequence);
+    iter->Next();
+  }
+  if (!iter->status().ok()) {
+    return iter->status();
+  }
+
+  if (table_reader->GetTableProperties()->num_range_deletions > 0) {
+    std::unique_ptr<InternalIterator> range_del_iter(
+        table_reader->NewRangeTombstoneIterator(ro));
+    if (range_del_iter != nullptr) {
+      for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
+           range_del_iter->Next()) {
+        Status pik_status =
+            ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors);
+        if (!pik_status.ok()) {
+          return Status::Corruption("Corrupted key in external file. ",
+                                    pik_status.getState());
+        }
+        smallest_seqno = std::min(smallest_seqno, key.sequence);
+        largest_seqno_from_iter =
+            std::max(largest_seqno_from_iter, key.sequence);
+      }
+      if (!range_del_iter->status().ok()) {
+        return range_del_iter->status();
+      }
+    }
+  }
+
+  file_to_ingest->smallest_seqno = smallest_seqno;
+  if (!has_largest_seqno) {
+    file_to_ingest->largest_seqno = largest_seqno_from_iter;
+  } else {
+    assert(largest_seqno == largest_seqno_from_iter);
+    file_to_ingest->largest_seqno = largest_seqno;
+  }
+
+  if (file_to_ingest->largest_seqno == kMaxSequenceNumber) {
+    return Status::InvalidArgument(
+        "Unknown smallest seqno for db generated file.");
+  }
+  if (file_to_ingest->smallest_seqno == kMaxSequenceNumber) {
+    return Status::InvalidArgument(
+        "Unknown largest seqno for db generated file.");
+  }
+  return Status::OK();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h
index 4a853afed971..d9ecf43da1b4 100644
--- a/db/external_sst_file_ingestion_job.h
+++ b/db/external_sst_file_ingestion_job.h
@@ -27,50 +27,77 @@ class SystemClock;
 
 struct KeyRangeInfo {
   // Smallest internal key in an external file or for a batch of external files.
+  // unset() could be either invalid or "before all keys"
   InternalKey smallest_internal_key;
   // Largest internal key in an external file or for a batch of external files.
+  // unset() could be either invalid or "after all keys"
   InternalKey largest_internal_key;
 
-  bool empty() const {
-    return smallest_internal_key.size() == 0 &&
-           largest_internal_key.size() == 0;
+  bool unset() const {
+    // Legal internal keys are at least 8 bytes.
+    return smallest_internal_key.unset() || largest_internal_key.unset();
   }
 };
 
 // Helper class to apply SST file key range checks to the external files.
+// XXX: using sstableKeyCompare with user comparator on internal keys is
+// very broken
 class ExternalFileRangeChecker {
  public:
   explicit ExternalFileRangeChecker(const Comparator* ucmp) : ucmp_(ucmp) {}
 
   // Operator used for sorting ranges.
-  bool operator()(const KeyRangeInfo* prev_range,
-                  const KeyRangeInfo* range) const {
-    assert(prev_range);
-    assert(range);
-    return sstableKeyCompare(ucmp_, prev_range->smallest_internal_key,
-                             range->smallest_internal_key) < 0;
+  bool operator()(const KeyRangeInfo* range1,
+                  const KeyRangeInfo* range2) const {
+    assert(range1);
+    assert(range2);
+    assert(!range1->unset());
+    assert(!range2->unset());
+    return sstableKeyCompare(ucmp_, range1->smallest_internal_key,
+                             range2->smallest_internal_key) < 0;
   }
 
-  // Check whether `range` overlaps with `prev_range`. `ranges_sorted` can be
-  // set to true when the inputs are already sorted based on the sorting logic
-  // provided by this checker's operator(), which can help simplify the check.
-  bool OverlapsWithPrev(const KeyRangeInfo* prev_range,
-                        const KeyRangeInfo* range,
-                        bool ranges_sorted = false) const {
-    assert(prev_range);
-    assert(range);
-    if (prev_range->empty() || range->empty()) {
+  bool Overlaps(const KeyRangeInfo& range1, const KeyRangeInfo& range2,
+                bool known_sorted = false) const {
+    return Overlaps(range1, range2.smallest_internal_key,
+                    range2.largest_internal_key, known_sorted);
+  }
+  bool Overlaps(const KeyRangeInfo& range1, const InternalKey& range2_smallest,
+                const InternalKey& range2_largest,
+                bool known_sorted = false) const {
+    bool any_unset =
+        range1.unset() || range2_smallest.unset() || range2_largest.unset();
+    if (any_unset) {
+      assert(!any_unset);
       return false;
     }
-    if (ranges_sorted) {
-      return sstableKeyCompare(ucmp_, prev_range->largest_internal_key,
-                               range->smallest_internal_key) >= 0;
+    if (known_sorted) {
+      return sstableKeyCompare(ucmp_, range1.largest_internal_key,
+                               range2_smallest) >= 0;
     }
 
-    return sstableKeyCompare(ucmp_, prev_range->largest_internal_key,
-                             range->smallest_internal_key) >= 0 &&
-           sstableKeyCompare(ucmp_, prev_range->smallest_internal_key,
-                             range->largest_internal_key) <= 0;
+    return sstableKeyCompare(ucmp_, range1.largest_internal_key,
+                             range2_smallest) >= 0 &&
+           sstableKeyCompare(ucmp_, range1.smallest_internal_key,
+                             range2_largest) <= 0;
+  }
+
+  bool Contains(const KeyRangeInfo& range1, const KeyRangeInfo& range2) {
+    return Contains(range1, range2.smallest_internal_key,
+                    range2.largest_internal_key);
+  }
+  bool Contains(const KeyRangeInfo& range1, const InternalKey& range2_smallest,
+                const InternalKey& range2_largest) {
+    bool any_unset =
+        range1.unset() || range2_smallest.unset() || range2_largest.unset();
+    if (any_unset) {
+      assert(!any_unset);
+      return false;
+    }
+    return sstableKeyCompare(ucmp_, range1.smallest_internal_key,
+                             range2_smallest) <= 0 &&
+           sstableKeyCompare(ucmp_, range1.largest_internal_key,
+                             range2_largest) >= 0;
   }
 
   void MaybeUpdateRange(const InternalKey& start_key,
@@ -153,6 +180,9 @@ struct IngestedFileInfo : public KeyRangeInfo {
   // the user key's format in the external file matches the column family's
   // setting.
   bool user_defined_timestamps_persisted = true;
+
+  SequenceNumber largest_seqno = kMaxSequenceNumber;
+  SequenceNumber smallest_seqno = kMaxSequenceNumber;
 };
 
 // A batch of files.
@@ -203,7 +233,7 @@ class ExternalSstFileIngestionJob {
         directories_(directories),
         event_logger_(event_logger),
         job_start_time_(clock_->NowMicros()),
-        consumed_seqno_count_(0),
+        max_assigned_seqno_(0),
         io_tracer_(io_tracer) {
     assert(directories != nullptr);
     assert(cfd_);
@@ -218,6 +248,7 @@ class ExternalSstFileIngestionJob {
   Status Prepare(const std::vector<std::string>& external_files_paths,
                  const std::vector<std::string>& files_checksums,
                  const std::vector<std::string>& files_checksum_func_names,
+                 const std::optional<RangeOpt>& atomic_replace_range,
                  const Temperature& file_temperature, uint64_t next_file_number,
                  SuperVersion* sv);
 
@@ -259,8 +290,16 @@ class ExternalSstFileIngestionJob {
     return files_to_ingest_;
   }
 
-  // How many sequence numbers did we consume as part of the ingestion job?
-  int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; }
+  // Return the maximum assigned sequence number for all files in this job.
+  // When allow_db_generated_files = false, we may assign global sequence
+  // numbers to ingested files. The global sequence numbers are sequence numbers
+  // following versions_->LastSequence().
+  // When allow_db_generated_files = true, we ingest files that already have
+  // sequence numbers assigned. max_assigned_seqno_ will be the max sequence
+  // number among ingested files.
+  SequenceNumber MaxAssignedSequenceNumber() const {
+    return max_assigned_seqno_;
+  }
 
  private:
   Status ResetTableReader(const std::string& external_file,
@@ -321,7 +360,7 @@ class ExternalSstFileIngestionJob {
       std::optional<int> prev_batch_uppermost_level);
 
   // File that we want to ingest behind always goes to the lowest level;
-  // we just check that it fits in the level, that DB allows ingest_behind,
+  // we just check that it fits in the level, that the CF allows ingest_behind,
   // and that we don't have 0 seqnums at the upper levels.
   // REQUIRES: Mutex held
   Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest);
@@ -341,6 +380,13 @@ class ExternalSstFileIngestionJob {
   template <typename TWritableFile>
   Status SyncIngestedFile(TWritableFile* file);
 
+  // Helper function to obtain the smallest and largest sequence number from a
+  // file. When OK is returned, file_to_ingest->smallest_seqno and
+  // file_to_ingest->largest_seqno will be updated.
+  Status GetSeqnoBoundaryForFile(TableReader* table_reader, SuperVersion* sv,
+                                 IngestedFileInfo* file_to_ingest,
+                                 bool allow_data_in_errors);
+
   // Create equivalent `Compaction` objects to this file ingestion job
   // , which will be used to check range conflict with other ongoing
   // compactions.
@@ -362,11 +408,12 @@ class ExternalSstFileIngestionJob {
   autovector<IngestedFileInfo> files_to_ingest_;
   std::vector<FileBatchInfo> file_batches_to_ingest_;
   const IngestExternalFileOptions& ingestion_options_;
+  std::optional<KeyRangeInfo> atomic_replace_range_;
   Directories* directories_;
   EventLogger* event_logger_;
   VersionEdit edit_;
   uint64_t job_start_time_;
-  int consumed_seqno_count_;
+  SequenceNumber max_assigned_seqno_;
   // Set in ExternalSstFileIngestionJob::Prepare(), if true all files are
   // ingested in L0
   bool files_overlap_{false};
diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc
index de261af7a01b..c4cc09797af2 100644
--- a/db/external_sst_file_test.cc
+++ b/db/external_sst_file_test.cc
@@ -7,6 +7,7 @@
 
 #include <functional>
 #include <memory>
+#include <sstream>
 
 #include "db/db_test_util.h"
 #include "db/dbformat.h"
@@ -79,8 +80,7 @@ class ExternSSTFileLinkFailFallbackTest
   }
 
   void TearDown() override {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options_));
   }
 
@@ -2417,102 +2417,130 @@ TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) {
 }
 
 TEST_P(ExternalSSTFileTest, IngestBehind) {
-  Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = 3;
-  options.disable_auto_compactions = false;
-  DestroyAndReopen(options);
-  std::vector<std::pair<std::string, std::string>> file_data;
-  std::map<std::string, std::string> true_data;
+  for (bool cf_option : {false, true}) {
+    SCOPED_TRACE("cf_option = " + std::to_string(cf_option));
+    Options options = CurrentOptions();
+    options.compaction_style = kCompactionStyleUniversal;
+    options.num_levels = 3;
+    options.disable_auto_compactions = false;
+    DestroyAndReopen(options);
+    std::vector<std::pair<std::string, std::string>> file_data;
+    std::map<std::string, std::string> true_data;
 
-  // Insert 100 -> 200 into the memtable
-  for (int i = 100; i <= 200; i++) {
-    ASSERT_OK(Put(Key(i), "memtable"));
-  }
+    // Insert 100 -> 200 into the memtable
+    for (int i = 100; i <= 200; i++) {
+      ASSERT_OK(Put(Key(i), "memtable"));
+    }
 
-  // Insert 100 -> 200 using IngestExternalFile
-  file_data.clear();
-  for (int i = 0; i <= 20; i++) {
-    file_data.emplace_back(Key(i), "ingest_behind");
-    true_data[Key(i)] = "ingest_behind";
-  }
+    // Insert 100 -> 200 using IngestExternalFile
+    file_data.clear();
+    for (int i = 0; i <= 20; i++) {
+      file_data.emplace_back(Key(i), "ingest_behind");
+      true_data[Key(i)] = "ingest_behind";
+    }
 
-  bool allow_global_seqno = true;
-  bool ingest_behind = true;
-  bool write_global_seqno = std::get<0>(GetParam());
-  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+    bool allow_global_seqno = true;
+    bool ingest_behind = true;
+    bool write_global_seqno = std::get<0>(GetParam());
+    bool verify_checksums_before_ingest = std::get<1>(GetParam());
 
-  // Can't ingest behind since allow_ingest_behind isn't set to true
-  ASSERT_NOK(GenerateAndAddExternalFile(
-      options, file_data, -1, allow_global_seqno, write_global_seqno,
-      verify_checksums_before_ingest, ingest_behind, false /*sort_data*/,
-      &true_data));
+    // Can't ingest behind since allow_ingest_behind isn't set to true
+    ASSERT_NOK(GenerateAndAddExternalFile(
+        options, file_data, -1, allow_global_seqno, write_global_seqno,
+        verify_checksums_before_ingest, ingest_behind, false /*sort_data*/,
+        &true_data));
 
-  options.allow_ingest_behind = true;
-  // check that we still can open the DB, as num_levels should be
-  // sanitized to 3
-  options.num_levels = 2;
-  DestroyAndReopen(options);
+    if (cf_option) {
+      options.cf_allow_ingest_behind = true;
+    } else {
+      options.allow_ingest_behind = true;
+    }
+    // check that we still can open the DB, as num_levels should be
+    // sanitized to 3
+    options.num_levels = 2;
+    DestroyAndReopen(options);
 
-  options.num_levels = 3;
-  DestroyAndReopen(options);
-  true_data.clear();
-  // Insert 100 -> 200 into the memtable
-  for (int i = 100; i <= 200; i++) {
-    ASSERT_OK(Put(Key(i), "memtable"));
-    true_data[Key(i)] = "memtable";
-  }
-  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  // Universal picker should go at second from the bottom level
-  ASSERT_EQ("0,1", FilesPerLevel());
-  ASSERT_OK(GenerateAndAddExternalFile(
-      options, file_data, -1, allow_global_seqno, write_global_seqno,
-      verify_checksums_before_ingest, true /*ingest_behind*/,
-      false /*sort_data*/, &true_data));
-  ASSERT_EQ("0,1,1", FilesPerLevel());
-  // this time ingest should fail as the file doesn't fit to the bottom level
-  ASSERT_NOK(GenerateAndAddExternalFile(
-      options, file_data, -1, allow_global_seqno, write_global_seqno,
-      verify_checksums_before_ingest, true /*ingest_behind*/,
-      false /*sort_data*/, &true_data));
-  ASSERT_EQ("0,1,1", FilesPerLevel());
-  std::vector<std::vector<FileMetaData>> level_to_files;
-  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files);
-  uint64_t ingested_file_number = level_to_files[2][0].fd.GetNumber();
-  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  // Last level should not be compacted
-  ASSERT_EQ("0,1,1", FilesPerLevel());
-  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files);
-  ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber());
-  size_t kcnt = 0;
-  VerifyDBFromMap(true_data, &kcnt, false);
+    options.num_levels = 3;
+    DestroyAndReopen(options);
+    true_data.clear();
+    // Insert 100 -> 200 into the memtable
+    for (int i = 100; i <= 200; i++) {
+      ASSERT_OK(Put(Key(i), "memtable"));
+      true_data[Key(i)] = "memtable";
+    }
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    // Universal picker should go at second from the bottom level
+    ASSERT_EQ("0,1", FilesPerLevel());
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, file_data, -1, allow_global_seqno, write_global_seqno,
+        verify_checksums_before_ingest, true /*ingest_behind*/,
+        false /*sort_data*/, &true_data));
+    ASSERT_EQ("0,1,1", FilesPerLevel());
+    // this time ingest should fail as the file doesn't fit to the bottom level
+    ASSERT_NOK(GenerateAndAddExternalFile(
+        options, file_data, -1, allow_global_seqno, write_global_seqno,
+        verify_checksums_before_ingest, true /*ingest_behind*/,
+        false /*sort_data*/, &true_data));
+    ASSERT_EQ("0,1,1", FilesPerLevel());
+    std::vector<std::vector<FileMetaData>> level_to_files;
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(),
+                                    &level_to_files);
+    uint64_t ingested_file_number = level_to_files[2][0].fd.GetNumber();
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    // Last level should not be compacted
+    ASSERT_EQ("0,1,1", FilesPerLevel());
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(),
+                                    &level_to_files);
+    ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber());
+    size_t kcnt = 0;
+    VerifyDBFromMap(true_data, &kcnt, false);
 
-  // Auto-compaction should not include the last level.
-  // Trigger compaction if size amplification exceeds 110%.
-  options.compaction_options_universal.max_size_amplification_percent = 110;
-  options.level0_file_num_compaction_trigger = 4;
-  ASSERT_OK(TryReopen(options));
-  Random rnd(301);
-  for (int i = 0; i < 4; ++i) {
-    for (int j = 0; j < 10; j++) {
-      true_data[Key(j)] = rnd.RandomString(1000);
-      ASSERT_OK(Put(Key(j), true_data[Key(j)]));
+    // Auto-compaction should not include the last level.
+    // Trigger compaction if size amplification exceeds 110%.
+    options.compaction_options_universal.max_size_amplification_percent = 110;
+    options.level0_file_num_compaction_trigger = 4;
+    ASSERT_OK(TryReopen(options));
+    Random rnd(301);
+    for (int i = 0; i < 4; ++i) {
+      for (int j = 0; j < 10; j++) {
+        true_data[Key(j)] = rnd.RandomString(1000);
+        ASSERT_OK(Put(Key(j), true_data[Key(j)]));
+      }
+      ASSERT_OK(Flush());
     }
-    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(),
+                                    &level_to_files);
+    ASSERT_EQ(1, level_to_files[2].size());
+    ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber());
+
+    // Turning off the option allows DB to compact ingested files.
+    if (cf_option) {
+      // Test that another CF does not allow ingest behind
+      ColumnFamilyHandle* new_cfh;
+      Options new_cf_option;
+      ASSERT_OK(db_->CreateColumnFamily(new_cf_option, "new_cf", &new_cfh));
+      ASSERT_TRUE(GenerateAndAddExternalFile(
+                      new_cf_option, file_data, -1, allow_global_seqno,
+                      write_global_seqno, verify_checksums_before_ingest,
+                      true /*ingest_behind*/, false /*sort_data*/, nullptr,
+                      /*cfh=*/new_cfh)
+                      .IsInvalidArgument());
+      ASSERT_OK(db_->DropColumnFamily(new_cfh));
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(new_cfh));
+
+      options.cf_allow_ingest_behind = false;
+    } else {
+      options.allow_ingest_behind = false;
+    }
+    ASSERT_OK(TryReopen(options));
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(),
+                                    &level_to_files);
+    ASSERT_EQ(1, level_to_files[2].size());
+    ASSERT_NE(ingested_file_number, level_to_files[2][0].fd.GetNumber());
+    VerifyDBFromMap(true_data, &kcnt, false);
   }
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files);
-  ASSERT_EQ(1, level_to_files[2].size());
-  ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber());
-
-  // Turning off the option allows DB to compact ingested files.
-  options.allow_ingest_behind = false;
-  ASSERT_OK(TryReopen(options));
-  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files);
-  ASSERT_EQ(1, level_to_files[2].size());
-  ASSERT_NE(ingested_file_number, level_to_files[2][0].fd.GetNumber());
-  VerifyDBFromMap(true_data, &kcnt, false);
 }
 
 TEST_F(ExternalSSTFileTest, SkipBloomFilter) {
@@ -2541,14 +2569,19 @@ TEST_F(ExternalSSTFileTest, SkipBloomFilter) {
         options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 1);
   }
 
-  // Create external SST file but skip bloom filters
+  // Create external SST file but skip bloom filters by using options
+  // with no filter policy
   options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
   DestroyAndReopen(options);
   {
     std::string file_path = sst_files_dir_ + "sst_with_no_bloom.sst";
-    SstFileWriter sst_file_writer(EnvOptions(), options, nullptr, true,
-                                  Env::IOPriority::IO_TOTAL,
-                                  true /* skip_filters */);
+    // Use options with no filter policy to skip bloom filters
+    Options no_filter_options = options;
+    BlockBasedTableOptions no_filter_table_options = table_options;
+    no_filter_table_options.filter_policy.reset();
+    no_filter_options.table_factory.reset(
+        NewBlockBasedTableFactory(no_filter_table_options));
+    SstFileWriter sst_file_writer(EnvOptions(), no_filter_options);
     ASSERT_OK(sst_file_writer.Open(file_path));
     ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
     ASSERT_OK(sst_file_writer.Finish());
@@ -3514,19 +3547,26 @@ TEST_F(ExternalSSTFileWithTimestampTest, SanityCheck) {
   // overlapping key ranges.
   ASSERT_TRUE(IngestExternalUDTFile({file1, file2}).IsNotSupported());
 
-  options.allow_ingest_behind = true;
-  DestroyAndReopen(options);
-  IngestExternalFileOptions opts;
+  for (bool cf_option : {false, true}) {
+    SCOPED_TRACE("cf_option = " + std::to_string(cf_option));
+    if (cf_option) {
+      options.cf_allow_ingest_behind = true;
+    } else {
+      options.allow_ingest_behind = true;
+    }
+    DestroyAndReopen(options);
+    IngestExternalFileOptions opts;
 
-  // TODO(yuzhangyu): support ingestion behind for user-defined timestamps?
-  // Ingesting external files with user-defined timestamps requires searching
-  // through the whole lsm tree to make sure there is no key range overlap with
-  // the db. Ingestion behind currently is doing a simply placing it at the
-  // bottom level step without a search, so we don't allow it either.
-  opts.ingest_behind = true;
-  ASSERT_TRUE(db_->IngestExternalFile({file1}, opts).IsNotSupported());
+    // TODO(yuzhangyu): support ingestion behind for user-defined timestamps?
+    // Ingesting external files with user-defined timestamps requires searching
+    // through the whole lsm tree to make sure there is no key range overlap
+    // with the db. Ingestion behind currently is doing a simply placing it at
+    // the bottom level step without a search, so we don't allow it either.
+    opts.ingest_behind = true;
+    ASSERT_TRUE(db_->IngestExternalFile({file1}, opts).IsNotSupported());
 
-  DestroyAndRecreateExternalSSTFilesDir();
+    DestroyAndRecreateExternalSSTFilesDir();
+  }
 }
 
 TEST_F(ExternalSSTFileWithTimestampTest, UDTSettingsCompatibilityCheck) {
@@ -3818,106 +3858,37 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
       ASSERT_OK(Put(1, Key(k), "cf1_" + Key(k)));
     }
     ASSERT_OK(Flush(/*cf=*/1));
-    {
-      // Verify that largest key of the file has non-zero seqno.
-      std::vector<std::vector<FileMetaData>> metadata;
-      dbfull()->TEST_GetFilesMetaData(handles_[1], &metadata, nullptr);
-      const FileMetaData& file = metadata[0][0];
-      ValueType vtype;
-      SequenceNumber seq;
-      UnPackSequenceAndType(ExtractInternalKeyFooter(file.largest.Encode()),
-                            &seq, &vtype);
-      ASSERT_GE(seq, 0);
-    }
-    std::vector<LiveFileMetaData> live_meta;
-    db_->GetLiveFilesMetaData(&live_meta);
-    ASSERT_EQ(live_meta.size(), 1);
-    std::vector<std::string> to_ingest_files;
-    to_ingest_files.emplace_back(live_meta[0].directory + "/" +
-                                 live_meta[0].relative_filename);
-    // Ingesting a file whose boundary key has non-zero seqno.
-    Status s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
-    // This error msg is from checking seqno of boundary keys.
-    ASSERT_TRUE(
-        s.ToString().find("External file has non zero sequence number") !=
-        std::string::npos);
-    ASSERT_NOK(s);
-
-    {
-      // Only non-boundary key with non-zero seqno.
-      const Snapshot* snapshot = db_->GetSnapshot();
-      ASSERT_OK(Put(1, Key(70), "cf1_" + Key(70)));
-      ASSERT_OK(Flush(1));
-      CompactRangeOptions cro;
-      cro.bottommost_level_compaction =
-          BottommostLevelCompaction::kForceOptimized;
-      ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr));
-
-      // Verify that only the non-boundary key of the file has non-zero seqno.
-      std::vector<std::vector<FileMetaData>> metadata;
-      // File may be at different level for different options.
-      dbfull()->TEST_GetFilesMetaData(handles_[1], &metadata, nullptr);
-      bool found_file = false;
-      for (const auto& level : metadata) {
-        if (level.empty()) {
-          continue;
-        }
-        ASSERT_FALSE(found_file);
-        found_file = true;
-        ASSERT_EQ(1, level.size());
-        const FileMetaData& file = level[0];
-        ValueType vtype;
-        SequenceNumber seq;
-        UnPackSequenceAndType(ExtractInternalKeyFooter(file.largest.Encode()),
-                              &seq, &vtype);
-        ASSERT_EQ(seq, 0);
-        UnPackSequenceAndType(ExtractInternalKeyFooter(file.smallest.Encode()),
-                              &seq, &vtype);
-        ASSERT_EQ(seq, 0);
-        ASSERT_GT(file.fd.largest_seqno, 0);
-      }
-      ASSERT_TRUE(found_file);
-      live_meta.clear();
-      db_->GetLiveFilesMetaData(&live_meta);
-      ASSERT_EQ(live_meta.size(), 1);
-      to_ingest_files[0] =
-          live_meta[0].directory + "/" + live_meta[0].relative_filename;
-      s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
-      ASSERT_NOK(s);
-      // This error msg is from checking largest seqno in table property.
-      ASSERT_TRUE(s.ToString().find("non zero largest sequence number") !=
-                  std::string::npos);
-      db_->ReleaseSnapshot(snapshot);
-    }
 
+    Status s;
     CompactRangeOptions cro;
     cro.bottommost_level_compaction =
         BottommostLevelCompaction::kForceOptimized;
     ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr));
-    live_meta.clear();
+
+    std::vector<LiveFileMetaData> live_meta;
+    std::vector<std::string> to_ingest_files;
     db_->GetLiveFilesMetaData(&live_meta);
     ASSERT_EQ(live_meta.size(), 1);
+    ASSERT_EQ(live_meta[0].column_family_name, "toto");
     ASSERT_EQ(0, live_meta[0].largest_seqno);
-    to_ingest_files[0] =
-        live_meta[0].directory + "/" + live_meta[0].relative_filename;
+    to_ingest_files.emplace_back(live_meta[0].directory + "/" +
+                                 live_meta[0].relative_filename);
 
+    // Ingesting a DB generated file with allow_db_generated_files = false
     ingest_opts.allow_db_generated_files = false;
-    // Ingesting a DB genrate file with allow_db_generated_files = false;
     s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
     ASSERT_TRUE(s.ToString().find("External file version not found") !=
                 std::string::npos);
     ASSERT_NOK(s);
 
     const std::string err =
-        "An ingested file is assigned to a non-zero sequence number, which is "
-        "incompatible with ingestion option allow_db_generated_files";
+        "An ingested file overlaps with existing data in the DB and has been "
+        "assigned a non-zero sequence number";
     ingest_opts.allow_db_generated_files = true;
     s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
     ASSERT_TRUE(s.ToString().find(err) != std::string::npos);
     ASSERT_NOK(s);
-    if (options.compaction_style != kCompactionStyleUniversal) {
-      // FIXME: after fixing ingestion with universal compaction, currently
-      //  will always ingest into L0.
+    if (options.num_levels > 1) {
       ingest_opts.fail_if_not_bottommost_level = true;
       s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
       ASSERT_NOK(s);
@@ -4073,7 +4044,7 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
       std::string db2_path = test::PerThreadDBPath("DB2");
       Options db2_options;
       db2_options.create_if_missing = true;
-      DB* db2 = nullptr;
+      std::unique_ptr<DB> db2;
       ASSERT_OK(DB::Open(db2_options, db2_path, &db2));
       // Write some base data.
       expected_value.emplace_back(rnd.RandomString(100));
@@ -4102,10 +4073,10 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
       ASSERT_OK(db_->DropColumnFamily(temp_cfh));
       ASSERT_OK(db_->DestroyColumnFamilyHandle(temp_cfh));
       ASSERT_OK(db2->Close());
-      delete db2;
+      db2.reset();
       ASSERT_OK(DB::Open(db2_options, db2_path, &db2));
       ASSERT_OK(db2->Close());
-      delete db2;
+      db2.reset();
       ASSERT_OK(DestroyDB(db2_path, db2_options));
     } else {
       ASSERT_OK(db_->DropColumnFamily(temp_cfh));
@@ -4113,6 +4084,472 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
     }
   } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
 }
+
+TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) {
+  // Test ingestion of DB-generated SST files that contain non-zero sequence
+  // numbers.
+  IngestExternalFileOptions ingest_opts;
+  ingest_opts.allow_db_generated_files = true;
+  // This only works since we are ingesting without snapshot
+  // Failure case will be tested below.
+  ingest_opts.snapshot_consistency = std::get<0>(GetParam());
+  ingest_opts.allow_global_seqno = std::get<1>(GetParam());
+  ingest_opts.allow_blocking_flush = std::get<2>(GetParam());
+  ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam());
+  ingest_opts.link_files = std::get<4>(GetParam());
+  Random* rnd = Random::GetTLSInstance();
+  rnd->Reset(std::random_device{}());
+  std::ostringstream ingest_opts_trace;
+  ingest_opts_trace << "ingest_opts params: " << "snapshot_consistency="
+                    << ingest_opts.snapshot_consistency << ", "
+                    << "allow_global_seqno=" << ingest_opts.allow_global_seqno
+                    << ", " << "allow_blocking_flush="
+                    << ingest_opts.allow_blocking_flush << ", "
+                    << "fail_if_not_bottommost_level="
+                    << ingest_opts.fail_if_not_bottommost_level << ", "
+                    << "link_files=" << ingest_opts.link_files;
+  SCOPED_TRACE(ingest_opts_trace.str());
+
+  do {
+    SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
+
+    Options options = CurrentOptions();
+    options.statistics = CreateDBStatistics();
+    options.allow_concurrent_memtable_write =
+        false;  // Required for VectorRepFactory
+    CreateAndReopenWithCF({"non_overlap", "overlap"}, options);
+
+    ColumnFamilyHandle* non_overlap_cf = handles_[1];
+    ColumnFamilyHandle* overlap_cf = handles_[2];
+
+    std::vector<std::string> expected_values;
+    expected_values.resize(100);
+    WriteOptions wo;
+    // Setup target CF with non-overlapping base data Key1 and Key99
+    // Will ingest keys [1, 98] below.
+    expected_values[0] = rnd->RandomString(100);
+    ASSERT_OK(db_->Put(wo, non_overlap_cf, Key(0), expected_values[0]));
+    ASSERT_OK(db_->Flush({}, non_overlap_cf));
+    expected_values[99] = rnd->RandomString(100);
+    ASSERT_OK(db_->Put(wo, non_overlap_cf, Key(99), expected_values[99]));
+
+    // Set up overlapping cf
+    ASSERT_OK(db_->Put(wo, overlap_cf, Key(50), rnd->RandomString(100)));
+
+    // Create temp CF/DB
+    Options temp_cf_opts;
+    ColumnFamilyHandle* temp_cfh = nullptr;
+    std::unique_ptr<DB> temp_db_holder;
+    DB* from_db = nullptr;
+    std::string temp_db_name;
+    // Using a separate DB also validates that latest sequence number
+    // of target db is updated after ingestion (to the max sequence number
+    // in ingested files).
+    const bool use_temp_db = rnd->OneIn(2);
+    SCOPED_TRACE("use_temp_db: " + std::to_string(use_temp_db));
+
+    std::vector<std::string> sst_file_paths;
+    // optional L5: files in key range [70, 98]
+    // L6: files in key range [1, 79]
+    temp_cf_opts.target_file_size_base =
+        20 << 10;  // Small files to create multiple SSTs
+    temp_cf_opts.num_levels = 7;
+    temp_cf_opts.disable_auto_compactions = true;  // Manually set up LSM
+    temp_cf_opts.env = options.env;
+
+    if (use_temp_db) {
+      temp_cf_opts.create_if_missing = true;
+      temp_db_name = dbname_ + "/temp_db_" + std::to_string(rnd->Next());
+      ASSERT_OK(DB::Open(temp_cf_opts, temp_db_name, &temp_db_holder));
+      from_db = temp_db_holder.get();
+      temp_cfh = from_db->DefaultColumnFamily();
+    } else {
+      from_db = db_.get();
+      ASSERT_OK(
+          from_db->CreateColumnFamily(temp_cf_opts, "temp_cf", &temp_cfh));
+    }
+
+    // Use snapshot to ensure non-zero sequence numbers after compaction
+    const Snapshot* snapshot = from_db->GetSnapshot();
+
+    for (int k = 1; k < 99; ++k) {
+      expected_values[k] = rnd->RandomString(2000);
+      ASSERT_OK(from_db->Put(wo, temp_cfh, Key(k), expected_values[k]));
+    }
+    ASSERT_OK(from_db->Flush({}, temp_cfh));
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction =
+        BottommostLevelCompaction::kForceOptimized;
+    ASSERT_OK(from_db->CompactRange(cro, temp_cfh, nullptr, nullptr));
+
+    ASSERT_GT(NumTableFilesAtLevel(6, temp_cfh, from_db), 1);
+
+    const bool multi_level_ingestion = rnd->OneIn(2);
+    SCOPED_TRACE("Multi-level ingestion: " +
+                 std::to_string(multi_level_ingestion));
+    if (multi_level_ingestion) {
+      for (int k = 80; k < 99; ++k) {
+        expected_values[k] = rnd->RandomString(500);
+        ASSERT_OK(from_db->Put(wo, temp_cfh, Key(k), expected_values[k]));
+      }
+      ASSERT_OK(from_db->Flush({}, temp_cfh));
+
+      // Do some overwrites, and overlap with previous L0 to avoid trivial move
+      for (int k = 70; k < 82; ++k) {
+        expected_values[k] = rnd->RandomString(500);
+        ASSERT_OK(from_db->Put(wo, temp_cfh, Key(k), expected_values[k]));
+      }
+      ASSERT_OK(from_db->Flush({}, temp_cfh));
+
+      if (rnd->OneIn(2)) {
+        MoveFilesToLevel(5, temp_cfh, from_db);
+        ASSERT_GT(NumTableFilesAtLevel(5, temp_cfh, from_db), 0);
+      }
+      ASSERT_GT(NumTableFilesAtLevel(6, temp_cfh, from_db), 0);
+    }
+    SCOPED_TRACE("LSM of from_db " + FilesPerLevel(temp_cfh, from_db));
+
+    ColumnFamilyMetaData cf_meta;
+    from_db->GetColumnFamilyMetaData(temp_cfh, &cf_meta);
+
+    // Iterate in reverse since IngestExternalFiles expect files to be ordered
+    // from old to new
+    for (auto level_meta = cf_meta.levels.rbegin();
+         level_meta != cf_meta.levels.rend(); ++level_meta) {
+      // L0 files need to be added in reverse order.
+      for (auto file_meta = level_meta->files.rbegin();
+           file_meta != level_meta->files.rend(); ++file_meta) {
+        // Validate that files contain non-zero sequence numbers
+        ASSERT_GT(file_meta->smallest_seqno, 0);
+        ASSERT_GE(file_meta->largest_seqno, file_meta->smallest_seqno);
+        sst_file_paths.emplace_back(file_meta->directory + "/" +
+                                    file_meta->relative_filename);
+      }
+    }
+    from_db->ReleaseSnapshot(snapshot);
+
+    Status s;
+    // Perform ingestion and validate results
+    if (multi_level_ingestion && options.num_levels > 1) {
+      // fail_if_bottommost requres ingesting all files into the last level,
+      // so it fails if we are assiging files to multiple levels.
+      ingest_opts.fail_if_not_bottommost_level = true;
+      s = db_->IngestExternalFile(non_overlap_cf, sst_file_paths, ingest_opts);
+      ASSERT_NOK(s);
+      ASSERT_TRUE(s.ToString().find("Files cannot be ingested to Lmax") !=
+                  std::string::npos);
+      ingest_opts.fail_if_not_bottommost_level = false;
+    }
+    if (ingest_opts.snapshot_consistency) {
+      // snapshot_consisteny requires global sequence number assignment to
+      // ingested files if there is any live snapshot.
+      snapshot = db_->GetSnapshot();
+      s = db_->IngestExternalFile(non_overlap_cf, sst_file_paths, ingest_opts);
+      ASSERT_NOK(s);
+      ASSERT_TRUE(s.ToString().find(
+          "An ingested file overlaps with existing data in the DB and has been "
+          "assigned a non-zero sequence number"));
+      db_->ReleaseSnapshot(snapshot);
+    }
+
+    std::atomic<int> file_scan_count{0};
+    SyncPoint::GetInstance()->SetCallBack(
+        "ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile:FileScan",
+        [&](void* /*arg*/) { file_scan_count++; });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    ASSERT_OK(
+        db_->IngestExternalFile(non_overlap_cf, sst_file_paths, ingest_opts));
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    EXPECT_EQ(file_scan_count, 0);
+
+    // Validate ingested data.
+    ReadOptions ro;
+    std::string val;
+    for (int k = 0; k < 100; ++k) {
+      s = db_->Get(ro, handles_[1], Key(k), &val);
+      ASSERT_OK(s) << "Should find ingested key " << Key(k);
+      ASSERT_EQ(val, expected_values[k]) << "key: " << Key(k);
+    }
+
+    // Overlap with data in the CF
+    if (ingest_opts.allow_blocking_flush) {
+      s = db_->IngestExternalFile(overlap_cf, sst_file_paths, ingest_opts);
+
+      ASSERT_NOK(s);
+      if (ingest_opts.fail_if_not_bottommost_level) {
+        ASSERT_TRUE(s.ToString().find("Files cannot be ingested to Lmax") !=
+                    std::string::npos)
+            << s.ToString();
+      } else {
+        ASSERT_TRUE(s.ToString().find("An ingested file overlaps with existing "
+                                      "data in the DB and has been "
+                                      "assigned a non-zero sequence number") !=
+                    std::string::npos)
+            << s.ToString();
+      }
+    }
+
+    // Cleanup
+    // FIXME: Without this, the test triggers some data race between dropping
+    // CF and background compaction.
+    ASSERT_OK(db_->WaitForCompact({}));
+    if (use_temp_db) {
+      ASSERT_OK(from_db->Close());
+      temp_db_holder.reset();
+      ASSERT_OK(DestroyDB(temp_db_name, temp_cf_opts));
+    } else {
+      ASSERT_OK(db_->DropColumnFamily(temp_cfh));
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(temp_cfh));
+    }
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+std::string GenSecondaryKey(const std::string& pk, const std::string& val) {
+  return "index_" + val + "_" + pk;
+};
+
+TEST_P(IngestDBGeneratedFileTest2, ZeroAndNonZeroSeqno) {
+  // Test ingestion of SST files with zero and with non-zero sequence numbers.
+  // Generate data using a temp CF and a temp DB:
+  // 1. Temp CF with cf_allow_ingest_behind enabled to preserve non-zero seqno.
+  // 2. Temp DB with everything compacted to have zero seqno.
+  // Then ingest both types of files together into a target CF.
+  // This mimics a user case where temp DB contains data read from a
+  // snapshot while temp CF contains live writes after a snapshot is taken.
+  IngestExternalFileOptions ingest_opts;
+  ingest_opts.allow_db_generated_files = true;
+  ingest_opts.snapshot_consistency = std::get<0>(GetParam());
+  ingest_opts.allow_global_seqno = std::get<1>(GetParam());
+  ingest_opts.allow_blocking_flush = std::get<2>(GetParam());
+  ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam());
+  ingest_opts.link_files = std::get<4>(GetParam());
+
+  Random* rnd = Random::GetTLSInstance();
+
+  do {
+    SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
+    Options options = CurrentOptions();
+    options.allow_concurrent_memtable_write = false;
+    // Force more flushes/compactions and more files to be generated
+    options.target_file_size_base = 1 << 10;     // 1KB
+    options.max_bytes_for_level_base = 2 << 10;  // 2KB
+    options.max_bytes_for_level_multiplier = 2;
+    options.level0_file_num_compaction_trigger = 2;
+    options.level_compaction_dynamic_level_bytes = true;
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"target_cf"}, options);
+    auto* target_cfh = handles_[1];
+
+    Options live_write_cf_opts = options;
+    live_write_cf_opts.memtable_factory.reset(new VectorRepFactory());
+    live_write_cf_opts.compaction_style = kCompactionStyleUniversal;
+    live_write_cf_opts.cf_allow_ingest_behind = true;
+    live_write_cf_opts.num_levels = 50;
+    ColumnFamilyHandle* live_write_cfh;
+    ASSERT_OK(db_->CreateColumnFamily(live_write_cf_opts, "live_write_cf",
+                                      &live_write_cfh));
+
+    // Expected value and key
+    std::map<std::string, std::string> expected;
+    std::unordered_set<std::string> deleted;
+    std::stringstream debug_info;
+
+    // Setup base data in target CF, will ingest keys with different prefixes
+    // so they don't overlap with the base data.
+    WriteOptions wo;
+    for (int k = 0; k < 100; ++k) {
+      int random_val = rnd->Uniform(20);
+      expected[Key(k)] = std::to_string(random_val);
+      ASSERT_OK(db_->Put(wo, target_cfh, Key(k), expected[Key(k)]));
+
+      // Force flush every 20 keys to create multiple SST files
+      if (rnd->OneIn(20)) {
+        ASSERT_OK(db_->Flush({}, target_cfh));
+        debug_info << "Flush after " << k
+                   << ", LSM state: " << FilesPerLevel(target_cfh) << "\n";
+      }
+    }
+
+    // Temp DB for snapshot data
+    Options temp_db_opts;
+    temp_db_opts.create_if_missing = true;
+    temp_db_opts.target_file_size_base = 1 << 10;
+    temp_db_opts.write_buffer_size = 1 << 10;
+    temp_db_opts.memtable_factory.reset(new VectorRepFactory());
+    temp_db_opts.allow_concurrent_memtable_write = false;
+    temp_db_opts.compaction_style = kCompactionStyleUniversal;
+    temp_db_opts.env = env_;
+    temp_db_opts.num_levels = 7;
+
+    std::string temp_db_name =
+        dbname_ + "/temp_db_" + std::to_string(rnd->Next());
+    std::unique_ptr<DB> temp_db;
+    ASSERT_OK(DB::Open(temp_db_opts, temp_db_name, &temp_db));
+
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ReadOptions ro;
+    ro.snapshot = snapshot;
+    ro.total_order_seek = true;
+    std::unique_ptr<Iterator> iter{db_->NewIterator(ro, target_cfh)};
+    // transform data read from snapshot and write to temp DB
+    // Varying the number of files in temp DB.
+    const int kValSize = rnd->Uniform(200);
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      std::string key = iter->key().ToString();
+      std::string value = iter->value().ToString();
+      std::string sk = GenSecondaryKey(key, value);
+      // Usually value is empty, here we use a larger value to generate
+      // multiple SST files in temp_db.
+      std::string sk_val = rnd->RandomString(kValSize);
+      ASSERT_OK(temp_db->Put(wo, sk, sk_val));
+      expected[sk] = sk_val;
+      debug_info << "Snapshot data: " << sk << " -> \n";
+    }
+    ASSERT_OK(iter->status());
+
+    // Do some live writes into target CF and live write CF.
+    for (int i = 0; i < 10; ++i) {
+      WriteBatch wb;
+      for (int j = 0; j < 5; ++j) {
+        std::string key = Key(rnd->Uniform(100));
+        std::string old_val = expected[key];
+        // Value range is 0-19, allow some PK to have the same value.
+        int random_val = rnd->Uniform(20);
+        std::string new_val = std::to_string(random_val);
+        std::string old_index_key = GenSecondaryKey(key, old_val);
+        std::string new_index_key = GenSecondaryKey(key, new_val);
+        ASSERT_OK(wb.SingleDelete(live_write_cfh, old_index_key));
+        std::string sk_val = rnd->RandomString(kValSize);
+        ASSERT_OK(wb.Put(live_write_cfh, new_index_key, sk_val));
+        ASSERT_OK(wb.Put(target_cfh, key, new_val));
+        expected[key] = new_val;
+        expected.erase(old_index_key);
+        expected[new_index_key] = sk_val;
+        deleted.insert(old_index_key);
+        deleted.erase(new_index_key);
+
+        debug_info << "Live write: SD " << old_index_key << "\n";
+        debug_info << "Live write: " << key << " -> " << new_val << "\n";
+        debug_info << "Live write: " << new_index_key << " -> \n";
+      }
+      ASSERT_OK(db_->Write(wo, &wb));
+      if (rnd->OneIn(3)) {
+        debug_info << "Flush after " << i << " live writes\n";
+        ASSERT_OK(db_->Flush({}, live_write_cfh));
+      }
+    }
+    iter.reset();
+    db_->ReleaseSnapshot(snapshot);
+
+    // Compact temp_db to ensure zero sequence numbers
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(temp_db->CompactRange(cro, nullptr, nullptr));
+    SCOPED_TRACE("Temp DB LSM: " +
+                 FilesPerLevel(temp_db->DefaultColumnFamily(), temp_db.get()));
+
+    // Base data from snapshot
+    std::vector<std::string> sst_file_paths_zero_seqno;
+
+    // Collect SST file paths with zero sequence numbers
+    ASSERT_OK(temp_db->DisableFileDeletions());
+    ColumnFamilyMetaData cf_meta_temp_db;
+    temp_db->GetColumnFamilyMetaData(&cf_meta_temp_db);
+    for (const auto& level_meta : cf_meta_temp_db.levels) {
+      if (level_meta.level == 6) {
+        for (const auto& file_meta : level_meta.files) {
+          // Verify files have zero sequence numbers
+          ASSERT_EQ(0, file_meta.largest_seqno)
+              << "File " << file_meta.relative_filename
+              << " should have zero sequence number\n"
+              << debug_info.str();
+          sst_file_paths_zero_seqno.emplace_back(file_meta.directory + "/" +
+                                                 file_meta.relative_filename);
+        }
+      } else {
+        // All files should be in L6
+        ASSERT_EQ(0, level_meta.files.size()) << debug_info.str();
+      }
+    }
+
+    // Flush remaining catch up writes in memtable
+    ASSERT_OK(db_->Flush({}, live_write_cfh));
+    SCOPED_TRACE("LSM of live write cfh " + FilesPerLevel(live_write_cfh));
+    // Collect SST file paths with non-zero sequence numbers
+    ColumnFamilyMetaData live_write_cf_meta;
+    ASSERT_OK(db_->DisableFileDeletions());
+    db_->GetColumnFamilyMetaData(live_write_cfh, &live_write_cf_meta);
+
+    // Live writes after snapshot
+    std::vector<std::string> sst_file_paths_nonzero_seqno;
+    for (auto level_meta = live_write_cf_meta.levels.rbegin();
+         level_meta != live_write_cf_meta.levels.rend(); ++level_meta) {
+      // Reverse order is important for L0, where recent updates are ordered
+      // first
+      for (auto file_meta = level_meta->files.rbegin();
+           file_meta != level_meta->files.rend(); ++file_meta) {
+        sst_file_paths_nonzero_seqno.emplace_back(file_meta->directory + "/" +
+                                                  file_meta->relative_filename);
+        ASSERT_GT(file_meta->smallest_seqno, 0) << debug_info.str();
+      }
+      if (level_meta->level == 49) {
+        // Ingest behind does not compact to the last level
+        ASSERT_EQ(level_meta->files.size(), 0) << debug_info.str();
+      }
+    }
+
+    ASSERT_GT(sst_file_paths_zero_seqno.size(), 0) << debug_info.str();
+    ASSERT_GT(sst_file_paths_nonzero_seqno.size(), 0) << debug_info.str();
+
+    // Combine all SST file paths.
+    // File ingestion takes files from old to new.
+    std::vector<std::string> all_sst_files;
+    all_sst_files.insert(all_sst_files.end(), sst_file_paths_zero_seqno.begin(),
+                         sst_file_paths_zero_seqno.end());
+    all_sst_files.insert(all_sst_files.end(),
+                         sst_file_paths_nonzero_seqno.begin(),
+                         sst_file_paths_nonzero_seqno.end());
+    if (ingest_opts.fail_if_not_bottommost_level && options.num_levels > 1) {
+      // overlapping files will be ingested into different levels, including non
+      // Lmax
+      Status s =
+          db_->IngestExternalFile(target_cfh, all_sst_files, ingest_opts);
+      ASSERT_NOK(s);
+      ASSERT_TRUE(s.ToString().find("Files cannot be ingested to Lmax") !=
+                  std::string::npos);
+    } else {
+      ASSERT_OK(
+          db_->IngestExternalFile(target_cfh, all_sst_files, ingest_opts));
+
+      debug_info << "Zero seqno files: " << sst_file_paths_zero_seqno.size()
+                 << "\nNon-zero seqno files: "
+                 << sst_file_paths_nonzero_seqno.size() << "\n";
+
+      SCOPED_TRACE("Debug info:\n" + debug_info.str());
+      VerifyDBFromMap(expected, nullptr, false, nullptr, target_cfh, &deleted);
+    }
+
+    // clean up
+    ASSERT_OK(db_->EnableFileDeletions());
+    ASSERT_OK(temp_db->EnableFileDeletions());
+
+    // FIXME: Without this, the test triggers some data race between dropping
+    // CF and background compaction.
+    ASSERT_OK(db_->WaitForCompact({}));
+
+    ASSERT_OK(db_->DropColumnFamily(live_write_cfh));
+    ASSERT_OK(db_->DestroyColumnFamilyHandle(live_write_cfh));
+
+    ASSERT_OK(temp_db->Close());
+    temp_db.reset();
+    ASSERT_OK(DestroyDB(temp_db_name, temp_db_opts));
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc
index 3152c7635bea..9e7ec6ddd2ed 100644
--- a/db/fault_injection_test.cc
+++ b/db/fault_injection_test.cc
@@ -76,7 +76,7 @@ class FaultInjectionTest
   std::string dbname_;
   std::shared_ptr<Cache> tiny_cache_;
   Options options_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 
   FaultInjectionTest()
       : option_config_(std::get<1>(GetParam())),
@@ -260,10 +260,7 @@ class FaultInjectionTest
     return Slice(*storage);
   }
 
-  void CloseDB() {
-    delete db_;
-    db_ = nullptr;
-  }
+  void CloseDB() { db_.reset(); }
 
   Status OpenDB() {
     CloseDB();
@@ -348,7 +345,8 @@ class FaultInjectionTest
   }
 
   void WaitCompactionFinish() {
-    ASSERT_OK(static_cast<DBImpl*>(db_->GetRootDB())->TEST_WaitForCompact());
+    ASSERT_OK(static_cast_with_check<DBImpl>(db_->GetRootDB())
+                  ->TEST_WaitForCompact());
     ASSERT_OK(db_->Put(WriteOptions(), "", ""));
   }
 
diff --git a/db/flush_job.cc b/db/flush_job.cc
index ac2eaeb6c55c..e5221afca878 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -92,12 +92,10 @@ FlushJob::FlushJob(
     const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
     const FileOptions& file_options, VersionSet* versions,
     InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
-    std::vector<SequenceNumber> existing_snapshots,
-    SequenceNumber earliest_write_conflict_snapshot,
-    SnapshotChecker* snapshot_checker, JobContext* job_context,
-    FlushReason flush_reason, LogBuffer* log_buffer, FSDirectory* db_directory,
-    FSDirectory* output_file_directory, CompressionType output_compression,
-    Statistics* stats, EventLogger* event_logger, bool measure_io_stats,
+    JobContext* job_context, FlushReason flush_reason, LogBuffer* log_buffer,
+    FSDirectory* db_directory, FSDirectory* output_file_directory,
+    CompressionType output_compression, Statistics* stats,
+    EventLogger* event_logger, bool measure_io_stats,
     const bool sync_output_directory, const bool write_manifest,
     Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
     std::shared_ptr<const SeqnoToTimeMapping> seqno_to_time_mapping,
@@ -114,12 +112,7 @@ FlushJob::FlushJob(
       versions_(versions),
       db_mutex_(db_mutex),
       shutting_down_(shutting_down),
-      existing_snapshots_(std::move(existing_snapshots)),
-      earliest_snapshot_(existing_snapshots_.empty()
-                             ? kMaxSequenceNumber
-                             : existing_snapshots_.at(0)),
-      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
-      snapshot_checker_(snapshot_checker),
+      earliest_snapshot_(job_context->GetEarliestSnapshotSequence()),
       job_context_(job_context),
       flush_reason_(flush_reason),
       log_buffer_(log_buffer),
@@ -140,6 +133,7 @@ FlushJob::FlushJob(
       full_history_ts_low_(std::move(full_history_ts_low)),
       blob_callback_(blob_callback),
       seqno_to_time_mapping_(std::move(seqno_to_time_mapping)) {
+  assert(job_context->snapshot_context_initialized);
   // Update the thread status to indicate flush.
   ReportStartedFlush();
   TEST_SYNC_POINT("FlushJob::FlushJob()");
@@ -456,7 +450,7 @@ Status FlushJob::MemPurge() {
   const std::string* const full_history_ts_low = &(cfd_->GetFullHistoryTsLow());
   std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
       new CompactionRangeDelAggregator(&(cfd_->internal_comparator()),
-                                       existing_snapshots_,
+                                       job_context_->snapshot_seqs,
                                        full_history_ts_low));
   for (auto& rd_iter : range_del_iters) {
     range_del_agg->AddTombstones(std::move(rd_iter));
@@ -495,21 +489,20 @@ Status FlushJob::MemPurge() {
 
     Env* env = db_options_.env;
     assert(env);
-    MergeHelper merge(
-        env, (cfd_->internal_comparator()).user_comparator(),
-        (ioptions.merge_operator).get(), compaction_filter.get(),
-        ioptions.logger, true /* internal key corruption is not ok */,
-        existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
-        snapshot_checker_);
+    MergeHelper merge(env, (cfd_->internal_comparator()).user_comparator(),
+                      (ioptions.merge_operator).get(), compaction_filter.get(),
+                      ioptions.logger,
+                      true /* internal key corruption is not ok */,
+                      job_context_->GetLatestSnapshotSequence(),
+                      job_context_->snapshot_checker);
     assert(job_context_);
-    SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence();
     const std::atomic<bool> kManualCompactionCanceledFalse{false};
     CompactionIterator c_iter(
         iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge,
-        kMaxSequenceNumber, &existing_snapshots_, earliest_snapshot_,
-        earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_,
-        env, ShouldReportDetailedTime(env, ioptions.stats),
-        true /* internal key corruption is not ok */, range_del_agg.get(),
+        kMaxSequenceNumber, &job_context_->snapshot_seqs, earliest_snapshot_,
+        job_context_->earliest_write_conflict_snapshot,
+        job_context_->GetJobSnapshotSequence(), job_context_->snapshot_checker,
+        env, ShouldReportDetailedTime(env, ioptions.stats), range_del_agg.get(),
         nullptr, ioptions.allow_data_in_errors,
         ioptions.enforce_single_del_contracts,
         /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
@@ -761,7 +754,7 @@ bool FlushJob::MemPurgeDecider(double threshold) {
       // Pick the oldest existing snapshot that is more recent
       // than the sequence number of the sampled entry.
       min_seqno_snapshot = kMaxSequenceNumber;
-      for (SequenceNumber seq_num : existing_snapshots_) {
+      for (SequenceNumber seq_num : job_context_->snapshot_seqs) {
         if (seq_num > res.sequence && seq_num < min_seqno_snapshot) {
           min_seqno_snapshot = seq_num;
         }
@@ -868,9 +861,12 @@ Status FlushJob::WriteLevel0Table() {
       ts_sz > 0 && !cfd_->ioptions().persist_user_defined_timestamps;
 
   std::vector<BlobFileAddition> blob_file_additions;
-
+  // Note that here we treat flush as level 0 compaction in internal stats
+  InternalStats::CompactionStats flush_stats(CompactionReason::kFlush,
+                                             1 /* count**/);
   {
-    auto write_hint = base_->storage_info()->CalculateSSTWriteHint(/*level=*/0);
+    auto write_hint = base_->storage_info()->CalculateSSTWriteHint(
+        /*level=*/0, db_options_.calculate_sst_write_lifetime_hint_set);
     Env::IOPriority io_priority = GetRateLimiterPriority();
     db_mutex_->Unlock();
     if (log_buffer_) {
@@ -886,7 +882,7 @@ Status FlushJob::WriteLevel0Table() {
     ro.total_order_seek = true;
     ro.io_activity = Env::IOActivity::kFlush;
     Arena arena;
-    uint64_t total_num_entries = 0, total_num_deletes = 0;
+    uint64_t total_num_input_entries = 0, total_num_deletes = 0;
     uint64_t total_data_size = 0;
     size_t total_memory_usage = 0;
     uint64_t total_num_range_deletes = 0;
@@ -900,9 +896,9 @@ Status FlushJob::WriteLevel0Table() {
     for (ReadOnlyMemTable* m : mems_) {
       ROCKS_LOG_INFO(db_options_.info_log,
                      "[%s] [JOB %d] Flushing memtable id %" PRIu64
-                     " with next log file: %" PRIu64 "\n",
+                     " with next log file: %" PRIu64 ", marked_for_flush: %d\n",
                      cfd_->GetName().c_str(), job_context_->job_id, m->GetID(),
-                     m->GetNextLogNumber());
+                     m->GetNextLogNumber(), m->IsMarkedForFlush());
       if (logical_strip_timestamp) {
         memtables.push_back(m->NewTimestampStrippingIterator(
             ro, /*seqno_to_time_mapping=*/nullptr, &arena,
@@ -921,7 +917,7 @@ Status FlushJob::WriteLevel0Table() {
       if (range_del_iter != nullptr) {
         range_del_iters.emplace_back(range_del_iter);
       }
-      total_num_entries += m->NumEntries();
+      total_num_input_entries += m->NumEntries();
       total_num_deletes += m->NumDeletion();
       total_data_size += m->GetDataSize();
       total_memory_usage += m->ApproximateMemoryUsage();
@@ -933,11 +929,12 @@ Status FlushJob::WriteLevel0Table() {
     //  "Write Buffer Full", should make update flush_reason_ accordingly.
     event_logger_->Log() << "job" << job_context_->job_id << "event"
                          << "flush_started" << "num_memtables" << mems_.size()
-                         << "num_entries" << total_num_entries << "num_deletes"
-                         << total_num_deletes << "total_data_size"
-                         << total_data_size << "memory_usage"
-                         << total_memory_usage << "num_range_deletes"
-                         << total_num_range_deletes << "flush_reason"
+                         << "total_num_input_entries" << total_num_input_entries
+                         << "num_deletes" << total_num_deletes
+                         << "total_data_size" << total_data_size
+                         << "memory_usage" << total_memory_usage
+                         << "num_range_deletes" << total_num_range_deletes
+                         << "flush_reason"
                          << GetFlushReasonString(flush_reason_);
 
     {
@@ -975,7 +972,6 @@ Status FlushJob::WriteLevel0Table() {
       meta_.oldest_ancester_time = oldest_ancester_time;
       meta_.file_creation_time = current_time;
 
-      uint64_t num_input_entries = 0;
       uint64_t memtable_payload_bytes = 0;
       uint64_t memtable_garbage_bytes = 0;
       IOStatus io_s;
@@ -997,28 +993,49 @@ Status FlushJob::WriteLevel0Table() {
           preclude_last_level_min_seqno_ == kMaxSequenceNumber
               ? preclude_last_level_min_seqno_
               : std::min(earliest_snapshot_, preclude_last_level_min_seqno_));
-      const SequenceNumber job_snapshot_seq =
-          job_context_->GetJobSnapshotSequence();
-
       s = BuildTable(
           dbname_, versions_, db_options_, tboptions, file_options_,
           cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_,
-          &blob_file_additions, existing_snapshots_, earliest_snapshot_,
-          earliest_write_conflict_snapshot_, job_snapshot_seq,
-          snapshot_checker_, mutable_cf_options_.paranoid_file_checks,
-          cfd_->internal_stats(), &io_s, io_tracer_,
-          BlobFileCreationReason::kFlush, seqno_to_time_mapping_.get(),
-          event_logger_, job_context_->job_id, &table_properties_, write_hint,
-          full_history_ts_low, blob_callback_, base_, &num_input_entries,
-          &memtable_payload_bytes, &memtable_garbage_bytes);
+          &blob_file_additions, job_context_->snapshot_seqs, earliest_snapshot_,
+          job_context_->earliest_write_conflict_snapshot,
+          job_context_->GetJobSnapshotSequence(),
+          job_context_->snapshot_checker,
+          mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
+          &io_s, io_tracer_, BlobFileCreationReason::kFlush,
+          seqno_to_time_mapping_.get(), event_logger_, job_context_->job_id,
+          &table_properties_, write_hint, full_history_ts_low, blob_callback_,
+          base_, &memtable_payload_bytes, &memtable_garbage_bytes,
+          &flush_stats);
       TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:s", &s);
       // TODO: Cleanup io_status in BuildTable and table builders
       assert(!s.ok() || io_s.ok());
       io_s.PermitUncheckedError();
-      if (num_input_entries != total_num_entries && s.ok()) {
-        std::string msg = "Expected " + std::to_string(total_num_entries) +
+      if (s.ok() && total_num_input_entries != flush_stats.num_input_records) {
+        std::string msg = "Expected " +
+                          std::to_string(total_num_input_entries) +
                           " entries in memtables, but read " +
-                          std::to_string(num_input_entries);
+                          std::to_string(flush_stats.num_input_records);
+        ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
+                       cfd_->GetName().c_str(), job_context_->job_id,
+                       msg.c_str());
+        if (db_options_.flush_verify_memtable_count) {
+          s = Status::Corruption(msg);
+        }
+      }
+
+      // Only verify on table with format collects table properties
+      if (s.ok() &&
+          (mutable_cf_options_.table_factory->IsInstanceOf(
+               TableFactory::kBlockBasedTableName()) ||
+           mutable_cf_options_.table_factory->IsInstanceOf(
+               TableFactory::kPlainTableName())) &&
+          flush_stats.num_output_records != table_properties_.num_entries) {
+        std::string msg =
+            "Number of keys in flush output SST files does not match "
+            "number of keys added to the table. Expected " +
+            std::to_string(flush_stats.num_output_records) + " but there are " +
+            std::to_string(table_properties_.num_entries) +
+            " in output SST files";
         ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
                        cfd_->GetName().c_str(), job_context_->job_id,
                        msg.c_str());
@@ -1078,42 +1095,42 @@ Status FlushJob::WriteLevel0Table() {
                    meta_.file_creation_time, meta_.epoch_number,
                    meta_.file_checksum, meta_.file_checksum_func_name,
                    meta_.unique_id, meta_.compensated_range_deletion_size,
-                   meta_.tail_size, meta_.user_defined_timestamps_persisted);
+                   meta_.tail_size, meta_.user_defined_timestamps_persisted,
+                   meta_.min_timestamp, meta_.max_timestamp);
     edit_->SetBlobFileAdditions(std::move(blob_file_additions));
   }
   // Piggyback FlushJobInfo on the first first flushed memtable.
   mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
 
-  // Note that here we treat flush as level 0 compaction in internal stats
-  InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
   const uint64_t micros = clock_->NowMicros() - start_micros;
   const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
-  stats.micros = micros;
-  stats.cpu_micros = cpu_micros;
+  flush_stats.micros = micros;
+  flush_stats.cpu_micros += cpu_micros;
 
   ROCKS_LOG_INFO(db_options_.info_log,
                  "[%s] [JOB %d] Flush lasted %" PRIu64
                  " microseconds, and %" PRIu64 " cpu microseconds.\n",
                  cfd_->GetName().c_str(), job_context_->job_id, micros,
-                 cpu_micros);
+                 flush_stats.cpu_micros);
 
   if (has_output) {
-    stats.bytes_written = meta_.fd.GetFileSize();
-    stats.num_output_files = 1;
+    flush_stats.bytes_written = meta_.fd.GetFileSize();
+    flush_stats.num_output_files = 1;
   }
 
   const auto& blobs = edit_->GetBlobFileAdditions();
   for (const auto& blob : blobs) {
-    stats.bytes_written_blob += blob.GetTotalBlobBytes();
+    flush_stats.bytes_written_blob += blob.GetTotalBlobBytes();
   }
 
-  stats.num_output_files_blob = static_cast<int>(blobs.size());
+  flush_stats.num_output_files_blob = static_cast<int>(blobs.size());
 
-  RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
-  cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
+  RecordTimeToHistogram(stats_, FLUSH_TIME, flush_stats.micros);
+  cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_,
+                                             flush_stats);
   cfd_->internal_stats()->AddCFStats(
       InternalStats::BYTES_FLUSHED,
-      stats.bytes_written + stats.bytes_written_blob);
+      flush_stats.bytes_written + flush_stats.bytes_written_blob);
   RecordFlushIOStats();
 
   return s;
@@ -1193,13 +1210,12 @@ void FlushJob::GetEffectiveCutoffUDTForPickedMemTables() {
 }
 
 void FlushJob::GetPrecludeLastLevelMinSeqno() {
-  if (mutable_cf_options_.preclude_last_level_data_seconds == 0 ||
-      // FIXME: create FlushJob and build SuperVersions such that
-      // preclude_last_level_data_seconds > 0 implies
-      // seqno_to_time_mapping_ != nullptr
-      seqno_to_time_mapping_ == nullptr) {
+  if (mutable_cf_options_.preclude_last_level_data_seconds == 0) {
     return;
   }
+  // SuperVersion should guarantee this
+  assert(seqno_to_time_mapping_);
+  assert(!seqno_to_time_mapping_->Empty());
   int64_t current_time = 0;
   Status s = db_options_.clock->GetCurrentTime(&current_time);
   if (!s.ok()) {
diff --git a/db/flush_job.h b/db/flush_job.h
index 1c1f15d1b1dc..aa95c7b41aef 100644
--- a/db/flush_job.h
+++ b/db/flush_job.h
@@ -63,11 +63,9 @@ class FlushJob {
            const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
            const FileOptions& file_options, VersionSet* versions,
            InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
-           std::vector<SequenceNumber> existing_snapshots,
-           SequenceNumber earliest_write_conflict_snapshot,
-           SnapshotChecker* snapshot_checker, JobContext* job_context,
-           FlushReason flush_reason, LogBuffer* log_buffer,
-           FSDirectory* db_directory, FSDirectory* output_file_directory,
+           JobContext* job_context, FlushReason flush_reason,
+           LogBuffer* log_buffer, FSDirectory* db_directory,
+           FSDirectory* output_file_directory,
            CompressionType output_compression, Statistics* stats,
            EventLogger* event_logger, bool measure_io_stats,
            const bool sync_output_directory, const bool write_manifest,
@@ -167,10 +165,7 @@ class FlushJob {
   VersionSet* versions_;
   InstrumentedMutex* db_mutex_;
   std::atomic<bool>* shutting_down_;
-  std::vector<SequenceNumber> existing_snapshots_;
   SequenceNumber earliest_snapshot_;
-  SequenceNumber earliest_write_conflict_snapshot_;
-  SnapshotChecker* snapshot_checker_;
   JobContext* job_context_;
   FlushReason flush_reason_;
   LogBuffer* log_buffer_;
@@ -234,7 +229,7 @@ class FlushJob {
 
   // The current minimum seqno that compaction jobs will preclude the data from
   // the last level. Data with seqnos larger than this or larger than
-  // `earliest_snapshot_` will be output to the penultimate level had it gone
+  // `earliest_snapshot_` will be output to the proximal level had it gone
   // through a compaction to the last level.
   SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
 };
diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc
index f37eaf829be5..3d4cf1d8debd 100644
--- a/db/flush_job_test.cc
+++ b/db/flush_job_test.cc
@@ -142,13 +142,13 @@ class FlushJobTestBase : public testing::Test {
       column_families.emplace_back(cf_name, cf_options_);
     }
 
-    versions_.reset(
-        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
-                       &write_buffer_manager_, &write_controller_,
-                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
-                       test::kUnitTestDbId, /*db_session_id=*/"",
-                       /*daily_offpeak_time_utc=*/"",
-                       /*error_handler=*/nullptr, /*read_only=*/false));
+    versions_.reset(new VersionSet(
+        dbname_, &db_options_, MutableDBOptions{options_}, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
+        /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+        test::kUnitTestDbId, /*db_session_id=*/"",
+        /*daily_offpeak_time_utc=*/"",
+        /*error_handler=*/nullptr, /*read_only=*/false));
     EXPECT_OK(versions_->Recover(column_families, false));
   }
 
@@ -186,16 +186,16 @@ TEST_F(FlushJobTest, Empty) {
   JobContext job_context(0);
   auto cfd = versions_->GetColumnFamilySet()->GetDefault();
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
-  FlushJob flush_job(
-      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
-      cfd->GetLatestMutableCFOptions(),
-      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, nullptr, &event_logger, false,
-      true /* sync_output_directory */, true /* write_manifest */,
-      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, cfd->GetLatestMutableCFOptions(),
+                     std::numeric_limits<uint64_t>::max() /* memtable_id */,
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     &job_context, FlushReason::kTest, nullptr, nullptr,
+                     nullptr, kNoCompression, nullptr, &event_logger, false,
+                     true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER,
+                     nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
   {
     InstrumentedMutexLock l(&mutex_);
     flush_job.PickMemTable();
@@ -272,16 +272,16 @@ TEST_F(FlushJobTest, NonEmpty) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
-  FlushJob flush_job(
-      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
-      cfd->GetLatestMutableCFOptions(),
-      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
-      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, cfd->GetLatestMutableCFOptions(),
+                     std::numeric_limits<uint64_t>::max() /* memtable_id */,
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     &job_context, FlushReason::kTest, nullptr, nullptr,
+                     nullptr, kNoCompression, db_options_.statistics.get(),
+                     &event_logger, true, true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER,
+                     nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
 
   HistogramData hist;
   FileMetaData file_meta;
@@ -332,18 +332,18 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
 
   assert(memtable_ids.size() == num_mems);
   uint64_t smallest_memtable_id = memtable_ids.front();
   uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
   FlushJob flush_job(
       dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
       cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
+      versions_.get(), &mutex_, &shutting_down_, &job_context,
+      FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
       Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
   HistogramData hist;
   FileMetaData file_meta;
@@ -405,18 +405,17 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
   std::vector<std::unique_ptr<FlushJob>> flush_jobs;
   k = 0;
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
   for (auto cfd : all_cfds) {
     std::vector<SequenceNumber> snapshot_seqs;
     flush_jobs.emplace_back(new FlushJob(
         dbname_, cfd, db_options_, cfd->GetLatestMutableCFOptions(),
         memtable_ids[k], env_options_, versions_.get(), &mutex_,
-        &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker,
-        &job_context, FlushReason::kTest, nullptr, nullptr, nullptr,
-        kNoCompression, db_options_.statistics.get(), &event_logger, true,
-        false /* sync_output_directory */, false /* write_manifest */,
+        &shutting_down_, &job_context, FlushReason::kTest, nullptr, nullptr,
+        nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
+        true, false /* sync_output_directory */, false /* write_manifest */,
         Env::Priority::USER, nullptr /*IOTracer*/,
         empty_seqno_to_time_mapping_));
     k++;
@@ -532,16 +531,17 @@ TEST_F(FlushJobTest, Snapshots) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
-  FlushJob flush_job(
-      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
-      cfd->GetLatestMutableCFOptions(),
-      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
-      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber,
+                                  std::move(snapshots));
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, cfd->GetLatestMutableCFOptions(),
+                     std::numeric_limits<uint64_t>::max() /* memtable_id */,
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     &job_context, FlushReason::kTest, nullptr, nullptr,
+                     nullptr, kNoCompression, db_options_.statistics.get(),
+                     &event_logger, true, true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER,
+                     nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
   mutex_.Lock();
   flush_job.PickMemTable();
   ASSERT_OK(flush_job.Run());
@@ -585,18 +585,18 @@ TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
 
   assert(memtable_ids.size() == num_mems);
   uint64_t smallest_memtable_id = memtable_ids.front();
   uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
   FlushJob flush_job(
       dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
       cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
+      versions_.get(), &mutex_, &shutting_down_, &job_context,
+      FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
       Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
 
   // When the state from WriteController is normal.
@@ -658,16 +658,16 @@ TEST_F(FlushJobTest, ReplaceTimedPutWriteTimeWithPreferredSeqno) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
-  FlushJob flush_job(
-      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
-      cfd->GetLatestMutableCFOptions(),
-      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
-      Env::Priority::USER, nullptr /*IOTracer*/, seqno_to_time_mapping);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, cfd->GetLatestMutableCFOptions(),
+                     std::numeric_limits<uint64_t>::max() /* memtable_id */,
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     &job_context, FlushReason::kTest, nullptr, nullptr,
+                     nullptr, kNoCompression, db_options_.statistics.get(),
+                     &event_logger, true, true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER,
+                     nullptr /*IOTracer*/, seqno_to_time_mapping);
 
   FileMetaData file_meta;
   mutex_.Lock();
@@ -761,19 +761,19 @@ TEST_P(FlushJobTimestampTest, AllKeysExpired) {
   }
 
   std::vector<SequenceNumber> snapshots;
-  constexpr SnapshotChecker* const snapshot_checker = nullptr;
   JobContext job_context(0);
   EventLogger event_logger(db_options_.info_log.get());
   std::string full_history_ts_low;
   PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
   cfd->SetFullHistoryTsLow(full_history_ts_low);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
   FlushJob flush_job(
       dbname_, cfd, db_options_, cfd->GetLatestMutableCFOptions(),
       std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
+      versions_.get(), &mutex_, &shutting_down_, &job_context,
+      FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
       Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_,
       /*db_id=*/"",
       /*db_session_id=*/"", full_history_ts_low);
@@ -823,8 +823,8 @@ TEST_P(FlushJobTimestampTest, NoKeyExpired) {
   }
 
   std::vector<SequenceNumber> snapshots;
-  SnapshotChecker* const snapshot_checker = nullptr;
   JobContext job_context(0);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
   EventLogger event_logger(db_options_.info_log.get());
   std::string full_history_ts_low;
   PutFixed64(&full_history_ts_low, 0);
@@ -832,10 +832,10 @@ TEST_P(FlushJobTimestampTest, NoKeyExpired) {
   FlushJob flush_job(
       dbname_, cfd, db_options_, cfd->GetLatestMutableCFOptions(),
       std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
+      versions_.get(), &mutex_, &shutting_down_, &job_context,
+      FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
       Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_,
       /*db_id=*/"",
       /*db_session_id=*/"", full_history_ts_low);
diff --git a/db/forward_iterator.h b/db/forward_iterator.h
index 11dde54777e7..81a7f3132980 100644
--- a/db/forward_iterator.h
+++ b/db/forward_iterator.h
@@ -42,6 +42,7 @@ using MinIterHeap =
     std::priority_queue<InternalIterator*, std::vector<InternalIterator*>,
                         MinIterComparator>;
 
+// TODO: name to TailingIterator
 /**
  * ForwardIterator is a special type of iterator that only supports Seek()
  * and Next(). It is expected to perform better than TailingIterator by
diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc
index b57b119e484a..ecab01168474 100644
--- a/db/forward_iterator_bench.cc
+++ b/db/forward_iterator_bench.cc
@@ -344,19 +344,18 @@ int main(int argc, char** argv) {
 
   status = ROCKSDB_NAMESPACE::DestroyDB(path, options);
   assert(status.ok());
-  ROCKSDB_NAMESPACE::DB* db_raw;
-  status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db_raw);
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
+  status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db);
   assert(status.ok());
-  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(db_raw);
 
   std::vector<ShardState> shard_states(FLAGS_shards + 1);
   std::deque<Reader> readers;
   while (static_cast<int>(readers.size()) < FLAGS_readers) {
-    readers.emplace_back(&shard_states, db_raw);
+    readers.emplace_back(&shard_states, db.get());
   }
   std::deque<Writer> writers;
   while (static_cast<int>(writers.size()) < FLAGS_writers) {
-    writers.emplace_back(&shard_states, db_raw);
+    writers.emplace_back(&shard_states, db.get());
   }
 
   // Each shard gets a random reader and random writer assigned to it
@@ -367,7 +366,7 @@ int main(int argc, char** argv) {
     shard_states[i].writer = &writers[writer_dist(rng)];
   }
 
-  StatsThread stats_thread(db_raw);
+  StatsThread stats_thread(db.get());
   for (Writer& w : writers) {
     w.start();
   }
diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc
index 44a1c5d099a9..3033f1cf41e2 100644
--- a/db/import_column_family_job.cc
+++ b/db/import_column_family_job.cc
@@ -201,15 +201,8 @@ Status ImportColumnFamilyJob::Run() {
       const auto& f = files_to_import_[i][j];
       const auto& file_metadata = *metadatas_[i][j];
 
-      uint64_t tail_size = 0;
-      bool contain_no_data_blocks = f.table_properties.num_entries > 0 &&
-                                    (f.table_properties.num_entries ==
-                                     f.table_properties.num_range_deletions);
-      if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) {
-        uint64_t file_size = f.fd.GetFileSize();
-        assert(f.table_properties.tail_start_offset <= file_size);
-        tail_size = file_size - f.table_properties.tail_start_offset;
-      }
+      uint64_t tail_size = FileMetaData::CalculateTailSize(f.fd.GetFileSize(),
+                                                           f.table_properties);
 
       VersionEdit dummy_version_edit;
       dummy_version_edit.AddFile(
@@ -317,8 +310,10 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo(
   std::unique_ptr<FSRandomAccessFile> sst_file;
   std::unique_ptr<RandomAccessFileReader> sst_file_reader;
 
-  status =
-      fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr);
+  FileOptions fo{env_options_};
+  fo.file_checksum = file_meta.file_checksum;
+  fo.file_checksum_func_name = file_meta.file_checksum_func_name;
+  status = fs_->NewRandomAccessFile(external_file, fo, &sst_file, nullptr);
   if (!status.ok()) {
     return status;
   }
@@ -331,7 +326,8 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo(
   status = sv->mutable_cf_options.table_factory->NewTableReader(
       TableReaderOptions(
           cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
-          env_options_, cfd_->internal_comparator(),
+          sv->mutable_cf_options.compression_manager.get(), env_options_,
+          cfd_->internal_comparator(),
           sv->mutable_cf_options.block_protection_bytes_per_key,
           /*skip_filters*/ false, /*immortal*/ false,
           /*force_direct_prefetch*/ false, /*level*/ -1,
diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc
index 5a0139017754..0a6f9d6a3905 100644
--- a/db/import_column_family_test.cc
+++ b/db/import_column_family_test.cc
@@ -371,7 +371,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) {
   ASSERT_OK(Flush(1));
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
@@ -481,14 +481,14 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
   ASSERT_OK(Flush(1));
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
   delete checkpoint;
 
   // Create a new db and import the files.
-  DB* db_copy;
+  std::unique_ptr<DB> db_copy;
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
   ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
   ColumnFamilyHandle* cfh = nullptr;
@@ -504,7 +504,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
   }
   ASSERT_OK(db_copy->DropColumnFamily(cfh));
   ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
-  delete db_copy;
+  db_copy.reset();
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
 }
 
@@ -529,7 +529,7 @@ TEST_F(ImportColumnFamilyTest,
   ASSERT_OK(db_->DeleteRange(WriteOptions(), handles_[1], Key(0), Key(2)));
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
@@ -605,14 +605,14 @@ TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) {
   ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
   delete checkpoint;
 
   // Create a new db and import the files.
-  DB* db_copy;
+  std::unique_ptr<DB> db_copy;
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
   ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
   ColumnFamilyHandle* cfh = nullptr;
@@ -627,7 +627,7 @@ TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) {
   }
   ASSERT_OK(db_copy->DropColumnFamily(cfh));
   ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
-  delete db_copy;
+  db_copy.reset();
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
   for (const Snapshot* snapshot : snapshots) {
     db_->ReleaseSnapshot(snapshot);
@@ -771,12 +771,12 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyTest) {
 
   Checkpoint* checkpoint1;
   Checkpoint* checkpoint2;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint1));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint1));
   ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[1], export_files_dir_,
                                             &metadata_ptr_));
 
   // Create a new db and import the files.
-  DB* db_copy;
+  std::unique_ptr<DB> db_copy;
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
   ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
   ColumnFamilyHandle* copy_cfh = nullptr;
@@ -796,7 +796,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyTest) {
   ASSERT_OK(db_copy->Flush(FlushOptions()));
 
   // Flush again to create another L0 file. It should have higher sequencer.
-  ASSERT_OK(Checkpoint::Create(db_copy, &checkpoint2));
+  ASSERT_OK(Checkpoint::Create(db_copy.get(), &checkpoint2));
   ASSERT_OK(checkpoint2->ExportColumnFamily(copy_cfh, export_files_dir2_,
                                             &metadata_ptr2_));
 
@@ -826,7 +826,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyTest) {
 
   ASSERT_OK(db_copy->DropColumnFamily(copy_cfh));
   ASSERT_OK(db_copy->DestroyColumnFamilyHandle(copy_cfh));
-  delete db_copy;
+  db_copy.reset();
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
 }
 
@@ -840,12 +840,12 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyWithOverlap) {
 
   Checkpoint* checkpoint1;
   Checkpoint* checkpoint2;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint1));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint1));
   ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[1], export_files_dir_,
                                             &metadata_ptr_));
 
   // Create a new db and import the files.
-  DB* db_copy;
+  std::unique_ptr<DB> db_copy;
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
   ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
   ColumnFamilyHandle* copy_cfh = nullptr;
@@ -857,7 +857,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyWithOverlap) {
   ASSERT_OK(db_copy->Flush(FlushOptions()));
 
   // Flush again to create another L0 file. It should have higher sequencer.
-  ASSERT_OK(Checkpoint::Create(db_copy, &checkpoint2));
+  ASSERT_OK(Checkpoint::Create(db_copy.get(), &checkpoint2));
   ASSERT_OK(checkpoint2->ExportColumnFamily(copy_cfh, export_files_dir2_,
                                             &metadata_ptr2_));
 
@@ -877,7 +877,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyWithOverlap) {
 
   ASSERT_OK(db_copy->DropColumnFamily(copy_cfh));
   ASSERT_OK(db_copy->DestroyColumnFamilyHandle(copy_cfh));
-  delete db_copy;
+  db_copy.reset();
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
 }
 
@@ -1017,7 +1017,7 @@ TEST_F(ImportColumnFamilyTest, AssignEpochNumberToMultipleCF) {
   // corruption where two L0 files can have the same epoch number but
   // with overlapping key range.
   Checkpoint* checkpoint1;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint1));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint1));
   ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[1], export_files_dir_,
                                             &metadata_ptr_));
   ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[2], export_files_dir2_,
diff --git a/db/internal_stats.cc b/db/internal_stats.cc
index 8e8e6d27ef10..6b2d75385ba4 100644
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@@ -45,6 +45,8 @@ const std::map<LevelStatType, LevelStat> InternalStats::compaction_level_stats =
         {LevelStatType::RN_GB, LevelStat{"RnGB", "Rn(GB)"}},
         {LevelStatType::RNP1_GB, LevelStat{"Rnp1GB", "Rnp1(GB)"}},
         {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}},
+        {LevelStatType::WRITE_PRE_COMP_GB,
+         LevelStat{"WPreCompGB", "WPreComp(GB)"}},
         {LevelStatType::W_NEW_GB, LevelStat{"WnewGB", "Wnew(GB)"}},
         {LevelStatType::MOVED_GB, LevelStat{"MovedGB", "Moved(GB)"}},
         {LevelStatType::WRITE_AMP, LevelStat{"WriteAmp", "W-Amp"}},
@@ -100,19 +102,20 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name,
   int line_size = snprintf(
       buf + written_size, len - written_size,
       "%s    %s   %s     %s %s  %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s "
+      "%s "
       "%s\n",
       // Note that we skip COMPACTED_FILES and merge it with Files column
       group_by.c_str(), hdr(LevelStatType::NUM_FILES),
       hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE),
       hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB),
       hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB),
-      hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB),
-      hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS),
-      hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC),
-      hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT),
-      hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN),
-      hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB),
-      hdr(LevelStatType::W_BLOB_GB));
+      hdr(LevelStatType::WRITE_PRE_COMP_GB), hdr(LevelStatType::W_NEW_GB),
+      hdr(LevelStatType::MOVED_GB), hdr(LevelStatType::WRITE_AMP),
+      hdr(LevelStatType::READ_MBPS), hdr(LevelStatType::WRITE_MBPS),
+      hdr(LevelStatType::COMP_SEC), hdr(LevelStatType::COMP_CPU_SEC),
+      hdr(LevelStatType::COMP_COUNT), hdr(LevelStatType::AVG_SEC),
+      hdr(LevelStatType::KEY_IN), hdr(LevelStatType::KEY_DROP),
+      hdr(LevelStatType::R_BLOB_GB), hdr(LevelStatType::W_BLOB_GB));
 
   written_size += line_size;
   written_size = std::min(written_size, static_cast<int>(len));
@@ -140,6 +143,8 @@ void PrepareLevelStats(std::map<LevelStatType, double>* level_stats,
       stats.bytes_read_non_output_levels / kGB;
   (*level_stats)[LevelStatType::RNP1_GB] = stats.bytes_read_output_level / kGB;
   (*level_stats)[LevelStatType::WRITE_GB] = stats.bytes_written / kGB;
+  (*level_stats)[LevelStatType::WRITE_PRE_COMP_GB] =
+      stats.bytes_written_pre_comp / kGB;
   (*level_stats)[LevelStatType::W_NEW_GB] = bytes_new / kGB;
   (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB;
   (*level_stats)[LevelStatType::WRITE_AMP] = w_amp;
@@ -164,12 +169,13 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name,
       buf, len,
       "%4s "      /*  Level */
       "%6d/%-3d " /*  Files */
-      "%8s "      /*  Size */
+      "%10s "     /*  Size */
       "%5.1f "    /*  Score */
       "%8.1f "    /*  Read(GB) */
       "%7.1f "    /*  Rn(GB) */
       "%8.1f "    /*  Rnp1(GB) */
       "%9.1f "    /*  Write(GB) */
+      "%9.1f "    /*  WPreComp(GB) */
       "%8.1f "    /*  Wnew(GB) */
       "%9.1f "    /*  Moved(GB) */
       "%5.1f "    /*  W-Amp */
@@ -193,6 +199,7 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name,
       stat_value.at(LevelStatType::RN_GB),
       stat_value.at(LevelStatType::RNP1_GB),
       stat_value.at(LevelStatType::WRITE_GB),
+      stat_value.at(LevelStatType::WRITE_PRE_COMP_GB),
       stat_value.at(LevelStatType::W_NEW_GB),
       stat_value.at(LevelStatType::MOVED_GB),
       stat_value.at(LevelStatType::WRITE_AMP),
@@ -303,6 +310,7 @@ static const std::string aggregated_table_properties_at_level =
 static const std::string num_running_compactions = "num-running-compactions";
 static const std::string num_running_compaction_sorted_runs =
     "num-running-compaction-sorted-runs";
+static const std::string compaction_abort_count = "compaction-abort-count";
 static const std::string num_running_flushes = "num-running-flushes";
 static const std::string actual_delayed_write_rate =
     "actual-delayed-write-rate";
@@ -355,6 +363,8 @@ const std::string DB::Properties::kNumRunningCompactions =
     rocksdb_prefix + num_running_compactions;
 const std::string DB::Properties::kNumRunningCompactionSortedRuns =
     rocksdb_prefix + num_running_compaction_sorted_runs;
+const std::string DB::Properties::kCompactionAbortCount =
+    rocksdb_prefix + compaction_abort_count;
 const std::string DB::Properties::kNumRunningFlushes =
     rocksdb_prefix + num_running_flushes;
 const std::string DB::Properties::kBackgroundErrors =
@@ -587,6 +597,9 @@ const UnorderedMap<std::string, DBPropertyInfo>
         {DB::Properties::kNumRunningCompactionSortedRuns,
          {false, nullptr, &InternalStats::HandleNumRunningCompactionSortedRuns,
           nullptr, nullptr}},
+        {DB::Properties::kCompactionAbortCount,
+         {false, nullptr, &InternalStats::HandleCompactionAbortCount, nullptr,
+          nullptr}},
         {DB::Properties::kActualDelayedWriteRate,
          {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr,
           nullptr}},
@@ -1285,6 +1298,13 @@ bool InternalStats::HandleNumRunningCompactionSortedRuns(uint64_t* value,
   return true;
 }
 
+bool InternalStats::HandleCompactionAbortCount(uint64_t* value, DBImpl* db,
+                                               Version* /*version*/) {
+  *value = static_cast<uint64_t>(
+      db->compaction_aborted_.load(std::memory_order_acquire));
+  return true;
+}
+
 bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/,
                                            Version* /*version*/) {
   // Accumulated number of  errors in background flushes or compactions.
diff --git a/db/internal_stats.h b/db/internal_stats.h
index 7ebd406db757..347b3a617aae 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -71,6 +71,7 @@ enum class LevelStatType {
   RN_GB,
   RNP1_GB,
   WRITE_GB,
+  WRITE_PRE_COMP_GB,
   W_NEW_GB,
   MOVED_GB,
   WRITE_AMP,
@@ -153,23 +154,6 @@ class InternalStats {
 
   InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd);
 
-  // Per level compaction stats
-  struct CompactionOutputsStats {
-    uint64_t num_output_records = 0;
-    uint64_t bytes_written = 0;
-    uint64_t bytes_written_blob = 0;
-    uint64_t num_output_files = 0;
-    uint64_t num_output_files_blob = 0;
-
-    void Add(const CompactionOutputsStats& stats) {
-      this->num_output_records += stats.num_output_records;
-      this->bytes_written += stats.bytes_written;
-      this->bytes_written_blob += stats.bytes_written_blob;
-      this->num_output_files += stats.num_output_files;
-      this->num_output_files_blob += stats.num_output_files_blob;
-    }
-  };
-
   // Per level compaction stats.  comp_stats_[level] stores the stats for
   // compactions that produced data for the specified "level".
   struct CompactionStats {
@@ -196,6 +180,9 @@ class InternalStats {
     // Total number of bytes written to table files during compaction
     uint64_t bytes_written;
 
+    // Total number of bytes written pre-compression during compaction
+    uint64_t bytes_written_pre_comp;
+
     // Total number of bytes written to blob files during compaction
     uint64_t bytes_written_blob;
 
@@ -248,6 +235,7 @@ class InternalStats {
           bytes_skipped_output_level(0),
           bytes_read_blob(0),
           bytes_written(0),
+          bytes_written_pre_comp(0),
           bytes_written_blob(0),
           bytes_moved(0),
           num_input_files_in_non_output_levels(0),
@@ -275,6 +263,7 @@ class InternalStats {
           bytes_skipped_output_level(0),
           bytes_read_blob(0),
           bytes_written(0),
+          bytes_written_pre_comp(0),
           bytes_written_blob(0),
           bytes_moved(0),
           num_input_files_in_non_output_levels(0),
@@ -308,6 +297,7 @@ class InternalStats {
           bytes_skipped_output_level(c.bytes_skipped_output_level),
           bytes_read_blob(c.bytes_read_blob),
           bytes_written(c.bytes_written),
+          bytes_written_pre_comp(c.bytes_written_pre_comp),
           bytes_written_blob(c.bytes_written_blob),
           bytes_moved(c.bytes_moved),
           num_input_files_in_non_output_levels(
@@ -338,6 +328,7 @@ class InternalStats {
       bytes_skipped_output_level = c.bytes_skipped_output_level;
       bytes_read_blob = c.bytes_read_blob;
       bytes_written = c.bytes_written;
+      bytes_written_pre_comp = c.bytes_written_pre_comp;
       bytes_written_blob = c.bytes_written_blob;
       bytes_moved = c.bytes_moved;
       num_input_files_in_non_output_levels =
@@ -370,6 +361,7 @@ class InternalStats {
       this->bytes_skipped_output_level = 0;
       this->bytes_read_blob = 0;
       this->bytes_written = 0;
+      this->bytes_written_pre_comp = 0;
       this->bytes_written_blob = 0;
       this->bytes_moved = 0;
       this->num_input_files_in_non_output_levels = 0;
@@ -398,6 +390,7 @@ class InternalStats {
       this->bytes_skipped_output_level += c.bytes_skipped_output_level;
       this->bytes_read_blob += c.bytes_read_blob;
       this->bytes_written += c.bytes_written;
+      this->bytes_written_pre_comp += c.bytes_written_pre_comp;
       this->bytes_written_blob += c.bytes_written_blob;
       this->bytes_moved += c.bytes_moved;
       this->num_input_files_in_non_output_levels +=
@@ -420,15 +413,6 @@ class InternalStats {
       }
     }
 
-    void Add(const CompactionOutputsStats& stats) {
-      this->num_output_files += static_cast<int>(stats.num_output_files);
-      this->num_output_records += stats.num_output_records;
-      this->bytes_written += stats.bytes_written;
-      this->bytes_written_blob += stats.bytes_written_blob;
-      this->num_output_files_blob +=
-          static_cast<int>(stats.num_output_files_blob);
-    }
-
     void Subtract(const CompactionStats& c) {
       this->micros -= c.micros;
       this->cpu_micros -= c.cpu_micros;
@@ -439,6 +423,7 @@ class InternalStats {
       this->bytes_skipped_output_level -= c.bytes_skipped_output_level;
       this->bytes_read_blob -= c.bytes_read_blob;
       this->bytes_written -= c.bytes_written;
+      this->bytes_written_pre_comp -= c.bytes_written_pre_comp;
       this->bytes_written_blob -= c.bytes_written_blob;
       this->bytes_moved -= c.bytes_moved;
       this->num_input_files_in_non_output_levels -=
@@ -473,49 +458,51 @@ class InternalStats {
     }
   };
 
-  // Compaction stats, for per_key_placement compaction, it includes 2 levels
-  // stats: the last level and the penultimate level.
+  // Compaction internal stats, for per_key_placement compaction, it includes 2
+  // output level stats: the last level and the proximal level.
   struct CompactionStatsFull {
     // the stats for the target primary output level
-    CompactionStats stats;
+    CompactionStats output_level_stats;
 
-    // stats for penultimate level output if exist
-    bool has_penultimate_level_output = false;
-    CompactionStats penultimate_level_stats;
+    // stats for proximal level output if exist
+    bool has_proximal_level_output = false;
+    CompactionStats proximal_level_stats;
 
-    explicit CompactionStatsFull() : stats(), penultimate_level_stats() {}
+    explicit CompactionStatsFull()
+        : output_level_stats(), proximal_level_stats() {}
 
     explicit CompactionStatsFull(CompactionReason reason, int c)
-        : stats(reason, c), penultimate_level_stats(reason, c) {}
+        : output_level_stats(reason, c), proximal_level_stats(reason, c) {}
 
     uint64_t TotalBytesWritten() const {
-      uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
-      if (has_penultimate_level_output) {
-        bytes_written += penultimate_level_stats.bytes_written +
-                         penultimate_level_stats.bytes_written_blob;
+      uint64_t bytes_written = output_level_stats.bytes_written +
+                               output_level_stats.bytes_written_blob;
+      if (has_proximal_level_output) {
+        bytes_written += proximal_level_stats.bytes_written +
+                         proximal_level_stats.bytes_written_blob;
       }
       return bytes_written;
     }
 
     uint64_t DroppedRecords() {
-      uint64_t output_records = stats.num_output_records;
-      if (has_penultimate_level_output) {
-        output_records += penultimate_level_stats.num_output_records;
+      uint64_t output_records = output_level_stats.num_output_records;
+      if (has_proximal_level_output) {
+        output_records += proximal_level_stats.num_output_records;
       }
-      if (stats.num_input_records > output_records) {
-        return stats.num_input_records - output_records;
+      if (output_level_stats.num_input_records > output_records) {
+        return output_level_stats.num_input_records - output_records;
       }
       return 0;
     }
 
     void SetMicros(uint64_t val) {
-      stats.micros = val;
-      penultimate_level_stats.micros = val;
+      output_level_stats.micros = val;
+      proximal_level_stats.micros = val;
     }
 
     void AddCpuMicros(uint64_t val) {
-      stats.cpu_micros += val;
-      penultimate_level_stats.cpu_micros += val;
+      output_level_stats.cpu_micros += val;
+      proximal_level_stats.cpu_micros += val;
     }
   };
 
@@ -587,10 +574,9 @@ class InternalStats {
 
   void AddCompactionStats(int level, Env::Priority thread_pri,
                           const CompactionStatsFull& comp_stats_full) {
-    AddCompactionStats(level, thread_pri, comp_stats_full.stats);
-    if (comp_stats_full.has_penultimate_level_output) {
-      per_key_placement_comp_stats_.Add(
-          comp_stats_full.penultimate_level_stats);
+    AddCompactionStats(level, thread_pri, comp_stats_full.output_level_stats);
+    if (comp_stats_full.has_proximal_level_output) {
+      per_key_placement_comp_stats_.Add(comp_stats_full.proximal_level_stats);
     }
   }
 
@@ -722,7 +708,10 @@ class InternalStats {
   // a full cache, which would force a re-scan on the next GetStats.
   std::shared_ptr<CacheEntryStatsCollector<CacheEntryRoleStats>>
       cache_entry_stats_collector_;
-  // Per-ColumnFamily/level compaction stats
+
+  // Per-column family and level compaction statistics, including flush and file
+  // ingestion. These are treated as compactions to L0 or the level where the
+  // file was ingested.
   std::vector<CompactionStats> comp_stats_;
   std::vector<CompactionStats> comp_stats_by_pri_;
   CompactionStats per_key_placement_comp_stats_;
@@ -863,6 +852,8 @@ class InternalStats {
                                    Version* version);
   bool HandleNumRunningCompactionSortedRuns(uint64_t* value, DBImpl* db,
                                             Version* version);
+  bool HandleCompactionAbortCount(uint64_t* value, DBImpl* db,
+                                  Version* version);
   bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version);
   bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db,
                                    Version* version);
diff --git a/db/job_context.h b/db/job_context.h
index 83e9f5facafd..365a820d5f48 100644
--- a/db/job_context.h
+++ b/db/job_context.h
@@ -22,6 +22,9 @@ namespace ROCKSDB_NAMESPACE {
 class MemTable;
 struct SuperVersion;
 
+// The purpose of this struct is to simplify pushing work such as
+// allocation/construction, de-allocation/destruction, and notifications to
+// outside of holding the DB mutex.
 struct SuperVersionContext {
   struct WriteStallNotification {
     WriteStallInfo write_stall_info;
@@ -35,12 +38,6 @@ struct SuperVersionContext {
   std::unique_ptr<SuperVersion>
       new_superversion;  // if nullptr no new superversion
 
-  // If not nullptr, a new seqno to time mapping is available to be installed.
-  // Otherwise, make a shared copy of the one in the existing SuperVersion and
-  // carry it over to the new SuperVersion. This is moved to the SuperVersion
-  // during installation.
-  std::shared_ptr<const SeqnoToTimeMapping> new_seqno_to_time_mapping{nullptr};
-
   explicit SuperVersionContext(bool create_superversion = false)
       : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
 
@@ -126,7 +123,7 @@ struct JobContext {
         break;
       }
     }
-    return memtables_to_free.size() > 0 || logs_to_free.size() > 0 ||
+    return memtables_to_free.size() > 0 || wals_to_free.size() > 0 ||
            job_snapshot != nullptr || sv_have_sth;
   }
 
@@ -138,6 +135,37 @@ struct JobContext {
     return kMaxSequenceNumber;
   }
 
+  SequenceNumber GetLatestSnapshotSequence() const {
+    assert(snapshot_context_initialized);
+    if (snapshot_seqs.empty()) {
+      return 0;
+    }
+    return snapshot_seqs.back();
+  }
+
+  SequenceNumber GetEarliestSnapshotSequence() const {
+    assert(snapshot_context_initialized);
+    if (snapshot_seqs.empty()) {
+      return kMaxSequenceNumber;
+    }
+    return snapshot_seqs.front();
+  }
+
+  void InitSnapshotContext(SnapshotChecker* checker,
+                           std::unique_ptr<ManagedSnapshot> managed_snapshot,
+                           SequenceNumber earliest_write_conflict,
+                           std::vector<SequenceNumber>&& snapshots) {
+    if (snapshot_context_initialized) {
+      return;
+    }
+    snapshot_context_initialized = true;
+    snapshot_checker = checker;
+    assert(!job_snapshot);
+    job_snapshot = std::move(managed_snapshot);
+    earliest_write_conflict_snapshot = earliest_write_conflict;
+    snapshot_seqs = std::move(snapshots);
+  }
+
   // Structure to store information for candidate files to delete.
   struct CandidateFileInfo {
     std::string file_name;
@@ -149,9 +177,6 @@ struct JobContext {
     }
   };
 
-  // Unique job id
-  int job_id;
-
   // a list of all files that we'll consider deleting
   // (every once in a while this is filled up with all files
   // in the DB directory)
@@ -196,37 +221,47 @@ struct JobContext {
   // contexts for installing superversions for multiple column families
   std::vector<SuperVersionContext> superversion_contexts;
 
-  autovector<log::Writer*> logs_to_free;
+  autovector<log::Writer*> wals_to_free;
 
   // the current manifest_file_number, log_number and prev_log_number
   // that corresponds to the set of files in 'live'.
-  uint64_t manifest_file_number;
-  uint64_t pending_manifest_file_number;
+  uint64_t manifest_file_number = 0;
+  uint64_t pending_manifest_file_number = 0;
 
   // Used for remote compaction. To prevent OPTIONS files from getting
   // purged by PurgeObsoleteFiles() of the primary host
   uint64_t min_options_file_number;
-  uint64_t log_number;
-  uint64_t prev_log_number;
+  uint64_t log_number = 0;
+  uint64_t prev_log_number = 0;
 
   uint64_t min_pending_output = 0;
-  uint64_t prev_total_log_size = 0;
-  size_t num_alive_log_files = 0;
+  uint64_t prev_wals_total_size = 0;
+  size_t num_alive_wal_files = 0;
   uint64_t size_log_to_delete = 0;
 
   // Snapshot taken before flush/compaction job.
   std::unique_ptr<ManagedSnapshot> job_snapshot;
+  SnapshotChecker* snapshot_checker = nullptr;
+  std::vector<SequenceNumber> snapshot_seqs;
+  // This is the earliest snapshot that could be used for write-conflict
+  // checking by a transaction.  For any user-key newer than this snapshot, we
+  // should make sure not to remove evidence that a write occurred.
+  SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber;
+
+  // Unique job id
+  int job_id;
+
+  bool snapshot_context_initialized = false;
 
   explicit JobContext(int _job_id, bool create_superversion = false) {
     job_id = _job_id;
-    manifest_file_number = 0;
-    pending_manifest_file_number = 0;
-    log_number = 0;
-    prev_log_number = 0;
     superversion_contexts.emplace_back(
         SuperVersionContext(create_superversion));
   }
 
+  // Delete the default constructor
+  JobContext() = delete;
+
   // For non-empty JobContext Clean() has to be called at least once before
   // before destruction (see asserts in ~JobContext()). Should be called with
   // unlocked DB mutex. Destructor doesn't call Clean() to avoid accidentally
@@ -240,18 +275,18 @@ struct JobContext {
     for (auto m : memtables_to_free) {
       delete m;
     }
-    for (auto l : logs_to_free) {
+    for (auto l : wals_to_free) {
       delete l;
     }
 
     memtables_to_free.clear();
-    logs_to_free.clear();
+    wals_to_free.clear();
     job_snapshot.reset();
   }
 
   ~JobContext() {
     assert(memtables_to_free.size() == 0);
-    assert(logs_to_free.size() == 0);
+    assert(wals_to_free.size() == 0);
   }
 };
 
diff --git a/db/listener_test.cc b/db/listener_test.cc
index bfd5953668ff..989de3583c7b 100644
--- a/db/listener_test.cc
+++ b/db/listener_test.cc
@@ -105,7 +105,7 @@ class TestCompactionListener : public EventListener {
     ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size());
 
     ASSERT_TRUE(test_);
-    ASSERT_EQ(test_->db_, db);
+    ASSERT_EQ(test_->db_.get(), db);
 
     std::vector<std::vector<FileMetaData>> files_by_level;
     test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id],
@@ -163,9 +163,7 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
   options.max_bytes_for_level_base = options.target_file_size_base * 2;
   options.max_bytes_for_level_multiplier = 2;
   options.compression = kNoCompression;
-#ifdef ROCKSDB_USING_THREAD_STATUS
-  options.enable_thread_tracking = true;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = ThreadStatus::kEnabled;
   options.level0_file_num_compaction_trigger = kNumL0Files;
   options.table_properties_collector_factories.push_back(
       std::make_shared<TestPropertiesCollectorFactory>());
@@ -199,7 +197,7 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
 
   ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
   for (size_t i = 0; i < cf_names.size(); ++i) {
-    ASSERT_EQ(listener->compacted_dbs_[i], db_);
+    ASSERT_EQ(listener->compacted_dbs_[i], db_.get());
   }
 }
 
@@ -229,7 +227,7 @@ class TestFlushListener : public EventListener {
     ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
     ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
     // Verify the id of the current thread that created this table
     // file matches the id of any active flush or compaction thread.
     uint64_t thread_id = env_->GetThreadID();
@@ -246,7 +244,7 @@ class TestFlushListener : public EventListener {
       }
     }
     ASSERT_TRUE(found_match);
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
   }
 
   void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
@@ -270,7 +268,7 @@ class TestFlushListener : public EventListener {
     // that assumption does not hold (see the test case MultiDBMultiListeners
     // below).
     ASSERT_TRUE(test_);
-    if (db == test_->db_) {
+    if (db == test_->db_.get()) {
       std::vector<std::vector<FileMetaData>> files_by_level;
       ASSERT_LT(info.cf_id, test_->handles_.size());
       ASSERT_GE(info.cf_id, 0u);
@@ -310,9 +308,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
   Options options;
   options.env = CurrentOptions().env;
   options.write_buffer_size = k110KB;
-#ifdef ROCKSDB_USING_THREAD_STATUS
-  options.enable_thread_tracking = true;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = ThreadStatus::kEnabled;
   TestFlushListener* listener = new TestFlushListener(options.env, this);
   options.listeners.emplace_back(listener);
   std::vector<std::string> cf_names = {"pikachu",  "ilya",     "muromec",
@@ -347,7 +343,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
 
   // make sure callback functions are called in the right order
   for (size_t i = 0; i < cf_names.size(); ++i) {
-    ASSERT_EQ(listener->flushed_dbs_[i], db_);
+    ASSERT_EQ(listener->flushed_dbs_[i], db_.get());
     ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
   }
 }
@@ -357,9 +353,7 @@ TEST_F(EventListenerTest, MultiCF) {
     Options options;
     options.env = CurrentOptions().env;
     options.write_buffer_size = k110KB;
-#ifdef ROCKSDB_USING_THREAD_STATUS
-    options.enable_thread_tracking = true;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+    options.enable_thread_tracking = ThreadStatus::kEnabled;
     options.atomic_flush = atomic_flush;
     options.create_if_missing = true;
     DestroyAndReopen(options);
@@ -393,7 +387,7 @@ TEST_F(EventListenerTest, MultiCF) {
       // make sure callback functions are called in the right order
       if (i == 7) {
         for (size_t j = 0; j < cf_names.size(); j++) {
-          ASSERT_EQ(listener->flushed_dbs_[j], db_);
+          ASSERT_EQ(listener->flushed_dbs_[j], db_.get());
           ASSERT_EQ(listener->flushed_column_family_names_[j], cf_names[j]);
         }
       }
@@ -407,9 +401,7 @@ TEST_F(EventListenerTest, MultiCF) {
 TEST_F(EventListenerTest, MultiDBMultiListeners) {
   Options options;
   options.env = CurrentOptions().env;
-#ifdef ROCKSDB_USING_THREAD_STATUS
-  options.enable_thread_tracking = true;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = ThreadStatus::kEnabled;
   options.table_properties_collector_factories.push_back(
       std::make_shared<TestPropertiesCollectorFactory>());
   std::vector<TestFlushListener*> listeners;
@@ -430,22 +422,21 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) {
   DBOptions db_opts(options);
   ColumnFamilyOptions cf_opts(options);
 
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   std::vector<std::vector<ColumnFamilyHandle*>> vec_handles;
 
   for (int d = 0; d < kNumDBs; ++d) {
     ASSERT_OK(DestroyDB(dbname_ + std::to_string(d), options));
-    DB* db;
+    ASSERT_OK(
+        DB::Open(options, dbname_ + std::to_string(d), &dbs.emplace_back()));
     std::vector<ColumnFamilyHandle*> handles;
-    ASSERT_OK(DB::Open(options, dbname_ + std::to_string(d), &db));
     for (size_t c = 0; c < cf_names.size(); ++c) {
       ColumnFamilyHandle* handle;
-      ASSERT_OK(db->CreateColumnFamily(cf_opts, cf_names[c], &handle));
+      ASSERT_OK(dbs.back()->CreateColumnFamily(cf_opts, cf_names[c], &handle));
       handles.push_back(handle);
     }
 
     vec_handles.push_back(std::move(handles));
-    dbs.push_back(db);
   }
 
   for (int d = 0; d < kNumDBs; ++d) {
@@ -458,23 +449,23 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) {
   for (size_t c = 0; c < cf_names.size(); ++c) {
     for (int d = 0; d < kNumDBs; ++d) {
       ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
-      ASSERT_OK(
-          static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForFlushMemTable());
+      ASSERT_OK(static_cast_with_check<DBImpl>(dbs[d].get())
+                    ->TEST_WaitForFlushMemTable());
     }
   }
 
   for (int d = 0; d < kNumDBs; ++d) {
     // Ensure background work is fully finished including listener callbacks
     // before accessing listener state.
-    ASSERT_OK(
-        static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForBackgroundWork());
+    ASSERT_OK(static_cast_with_check<DBImpl>(dbs[d].get())
+                  ->TEST_WaitForBackgroundWork());
   }
 
   for (auto* listener : listeners) {
     int pos = 0;
     for (size_t c = 0; c < cf_names.size(); ++c) {
       for (int d = 0; d < kNumDBs; ++d) {
-        ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
+        ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d].get());
         ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
         pos++;
       }
@@ -489,17 +480,15 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) {
   }
   vec_handles.clear();
 
-  for (auto db : dbs) {
-    delete db;
+  for (auto& db : dbs) {
+    db.reset();
   }
 }
 
 TEST_F(EventListenerTest, DisableBGCompaction) {
   Options options;
   options.env = CurrentOptions().env;
-#ifdef ROCKSDB_USING_THREAD_STATUS
-  options.enable_thread_tracking = true;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = ThreadStatus::kEnabled;
   TestFlushListener* listener = new TestFlushListener(options.env, this);
   const int kCompactionTrigger = 1;
   const int kSlowdownTrigger = 5;
@@ -537,6 +526,47 @@ TEST_F(EventListenerTest, DisableBGCompaction) {
   ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9);
 }
 
+class TestNumInputFilesTotalInputBytesPouplatedInListener
+    : public EventListener {
+ public:
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    num_input_files = ci.stats.num_input_files;
+    total_num_of_bytes = ci.stats.total_input_bytes;
+  }
+  size_t num_input_files = 0;
+  size_t total_num_of_bytes = 0;
+  std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, NumInputFilesTotalBytesPopulated) {
+  Options options;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+  TestNumInputFilesTotalInputBytesPouplatedInListener* listener =
+      new TestNumInputFilesTotalInputBytesPouplatedInListener();
+  options.listeners.emplace_back(listener);
+
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_style = kCompactionStyleLevel;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  ASSERT_EQ(listener->num_input_files, 0);
+  ASSERT_EQ(listener->total_num_of_bytes, 0);
+  // Write 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(listener->num_input_files, 4);
+  ASSERT_NE(listener->total_num_of_bytes, 0);
+}
+
 class TestCompactionReasonListener : public EventListener {
  public:
   void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
@@ -1278,16 +1308,21 @@ class BlobDBJobLevelEventListenerTest : public EventListener {
   explicit BlobDBJobLevelEventListenerTest(EventListenerTest* test)
       : test_(test), call_count_(0) {}
 
-  const VersionStorageInfo* GetVersionStorageInfo() const {
-    VersionSet* const versions = test_->dbfull()->GetVersionSet();
+  // NOTE: it's not safe to rely on test_->db_ for these functions because
+  // the DB may be in the process of closing when these are called, and the
+  // unique_ptr is set to nullptr before invoking ~DB()
+
+  const VersionStorageInfo* GetVersionStorageInfo(DB* db) const {
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db);
+    VersionSet* const versions = db_impl->GetVersionSet();
     assert(versions);
 
     ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
     EXPECT_NE(cfd, nullptr);
 
-    test_->dbfull()->TEST_LockMutex();
+    db_impl->TEST_LockMutex();
     Version* const current = cfd->current();
-    test_->dbfull()->TEST_UnlockMutex();
+    db_impl->TEST_UnlockMutex();
     EXPECT_NE(current, nullptr);
 
     const VersionStorageInfo* const storage_info = current->storage_info();
@@ -1297,8 +1332,9 @@ class BlobDBJobLevelEventListenerTest : public EventListener {
   }
 
   void CheckBlobFileAdditions(
+      DB* db,
       const std::vector<BlobFileAdditionInfo>& blob_file_addition_infos) const {
-    const auto* vstorage = GetVersionStorageInfo();
+    const auto* vstorage = GetVersionStorageInfo(db);
 
     EXPECT_FALSE(blob_file_addition_infos.empty());
 
@@ -1326,7 +1362,7 @@ class BlobDBJobLevelEventListenerTest : public EventListener {
     return result;
   }
 
-  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
     {
       std::lock_guard<std::mutex> lock(mutex_);
       IncreaseCallCount(/*mutex_locked*/ true);
@@ -1335,16 +1371,15 @@ class BlobDBJobLevelEventListenerTest : public EventListener {
 
     EXPECT_EQ(info.blob_compression_type, kNoCompression);
 
-    CheckBlobFileAdditions(info.blob_file_addition_infos);
+    CheckBlobFileAdditions(db, info.blob_file_addition_infos);
   }
 
-  void OnCompactionCompleted(DB* /*db*/,
-                             const CompactionJobInfo& info) override {
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override {
     IncreaseCallCount(/*mutex_locked*/ false);
 
     EXPECT_EQ(info.blob_compression_type, kNoCompression);
 
-    CheckBlobFileAdditions(info.blob_file_addition_infos);
+    CheckBlobFileAdditions(db, info.blob_file_addition_infos);
 
     EXPECT_FALSE(info.blob_file_garbage_infos.empty());
 
diff --git a/db/log_reader.cc b/db/log_reader.cc
index 0f0e25033ab5..2650b4c97a9a 100644
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@@ -95,7 +95,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
   uint64_t prospective_record_offset = 0;
 
   Slice fragment;
-  while (true) {
+  for (;;) {
     uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
     size_t drop_size = 0;
     const uint8_t record_type =
@@ -140,7 +140,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
         prospective_record_offset = physical_record_offset;
         scratch->assign(fragment.data(), fragment.size());
         in_fragmented_record = true;
-        break;
+        break;  // switch
 
       case kMiddleType:
       case kRecyclableMiddleType:
@@ -153,7 +153,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           }
           scratch->append(fragment.data(), fragment.size());
         }
-        break;
+        break;  // switch
 
       case kLastType:
       case kRecyclableLastType:
@@ -171,7 +171,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           first_record_read_ = true;
           return true;
         }
-        break;
+        break;  // switch
 
       case kSetCompressionType: {
         if (compression_type_record_read_) {
@@ -193,7 +193,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
         } else {
           InitCompression(compression_record);
         }
-        break;
+        break;  // switch
       }
       case kPredecessorWALInfoType:
       case kRecyclePredecessorWALInfoType: {
@@ -210,7 +210,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           MaybeVerifyPredecessorWALInfo(wal_recovery_mode, fragment,
                                         recorded_predecessor_wal_info);
         }
-        break;
+        break;  // switch
       }
       case kUserDefinedTimestampSizeType:
       case kRecyclableUserDefinedTimestampSizeType: {
@@ -235,7 +235,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
             ReportCorruption(fragment.size(), s.getState());
           }
         }
-        break;
+        break;  // switch
       }
 
       case kBadHeader:
@@ -304,7 +304,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           in_fragmented_record = false;
           scratch->clear();
         }
-        break;
+        break;  // switch
 
       case kBadRecordLen:
         if (eof_) {
@@ -337,7 +337,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           in_fragmented_record = false;
           scratch->clear();
         }
-        break;
+        break;  // switch
 
       default: {
         if ((record_type & kRecordTypeSafeIgnoreMask) == 0) {
@@ -349,11 +349,11 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
         }
         in_fragmented_record = false;
         scratch->clear();
-        break;
+        break;  // switch
       }
     }
   }
-  return false;
+  // unreachable
 }
 
 void Reader::MaybeVerifyPredecessorWALInfo(
@@ -380,8 +380,11 @@ void Reader::MaybeVerifyPredecessorWALInfo(
   } else {
     if (observed_predecessor_wal_info_.GetLogNumber() !=
         recorded_predecessor_log_number) {
-      std::string reason = "Missing WAL of log number " +
-                           std::to_string(recorded_predecessor_log_number);
+      std::string reason =
+          "Mismatched predecessor log number of WAL file " +
+          file_->file_name() + " Recorded " +
+          std::to_string(recorded_predecessor_log_number) + ". Observed " +
+          std::to_string(observed_predecessor_wal_info_.GetLogNumber());
       ReportCorruption(fragment.size(), reason.c_str(),
                        recorded_predecessor_log_number);
     } else if (observed_predecessor_wal_info_.GetLastSeqnoRecorded() !=
diff --git a/db/log_reader.h b/db/log_reader.h
index dfcd6b7690f3..b2c43f076414 100644
--- a/db/log_reader.h
+++ b/db/log_reader.h
@@ -59,7 +59,7 @@ class Reader {
   // live while this Reader is in use.
   //
   // If "checksum" is true, verify checksums if available.
-  // TODO(hx235): seperate WAL related parameters from general `Reader`
+  // TODO(hx235): separate WAL related parameters from general `Reader`
   // parameters
   Reader(std::shared_ptr<Logger> info_log,
          std::unique_ptr<SequentialFileReader>&& file, Reporter* reporter,
@@ -155,7 +155,7 @@ class Reader {
   // which log number this is
   uint64_t const log_number_;
 
-  // See `Optinos::track_and_verify_wals`
+  // See `Options::track_and_verify_wals`
   bool track_and_verify_wals_;
   // Below variables are used for WAL verification
   // TODO(hx235): To revise `stop_replay_for_corruption_` inside `LogReader`
@@ -208,8 +208,8 @@ class Reader {
   };
 
   // Return type, or one of the preceding special values
-  // If WAL compressioned is enabled, fragment_checksum is the checksum of the
-  // fragment computed from the orginal buffer containinng uncompressed
+  // If WAL compression is enabled, fragment_checksum is the checksum of the
+  // fragment computed from the original buffer containing uncompressed
   // fragment.
   uint8_t ReadPhysicalRecord(Slice* result, size_t* drop_size,
                              uint64_t* fragment_checksum = nullptr);
diff --git a/db/log_writer.h b/db/log_writer.h
index f7aef75197d5..3a76faab771b 100644
--- a/db/log_writer.h
+++ b/db/log_writer.h
@@ -77,7 +77,7 @@ class Writer {
   // Create a writer that will append data to "*dest".
   // "*dest" must be initially empty.
   // "*dest" must remain live while this Writer is in use.
-  // TODO(hx235): seperate WAL related parameters from general `Reader`
+  // TODO(hx235): separate WAL related parameters from general `Reader`
   // parameters
   explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
                   uint64_t log_number, bool recycle_log_files,
diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc
index e84031065426..d740c7d2d630 100644
--- a/db/manual_compaction_test.cc
+++ b/db/manual_compaction_test.cc
@@ -98,7 +98,7 @@ class LogCompactionFilter : public CompactionFilter {
 
 TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
   for (int iter = 0; iter < 2; ++iter) {
-    DB* db;
+    std::unique_ptr<DB> db;
     Options options;
     if (iter == 0) {  // level compaction
       options.num_levels = 3;
@@ -128,7 +128,7 @@ TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
     delete itr;
 
     delete options.compaction_filter;
-    delete db;
+    db.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
   }
 }
@@ -137,7 +137,7 @@ TEST_F(ManualCompactionTest, Test) {
   // Open database.  Disable compression since it affects the creation
   // of layers and the code below is trying to test against a very
   // specific scenario.
-  DB* db;
+  std::unique_ptr<DB> db;
   Options db_options;
   db_options.write_buffer_size = 1024;
   db_options.create_if_missing = true;
@@ -185,12 +185,12 @@ TEST_F(ManualCompactionTest, Test) {
   ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";
 
   // close database
-  delete db;
+  db.reset();
   ASSERT_OK(DestroyDB(dbname_, Options()));
 }
 
 TEST_F(ManualCompactionTest, SkipLevel) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.level_compaction_dynamic_level_bytes = false;
   options.num_levels = 3;
@@ -298,7 +298,7 @@ TEST_F(ManualCompactionTest, SkipLevel) {
   }
 
   delete filter;
-  delete db;
+  db.reset();
   ASSERT_OK(DestroyDB(dbname_, options));
 }
 
diff --git a/db/memtable.cc b/db/memtable.cc
index 5f5450276b38..7a2b0fe6880a 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -70,7 +70,9 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
       protection_bytes_per_key(
           mutable_cf_options.memtable_protection_bytes_per_key),
       allow_data_in_errors(ioptions.allow_data_in_errors),
-      paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {}
+      paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks),
+      memtable_veirfy_per_key_checksum_on_seek(
+          mutable_cf_options.memtable_veirfy_per_key_checksum_on_seek) {}
 
 MemTable::MemTable(const InternalKeyComparator& cmp,
                    const ImmutableOptions& ioptions,
@@ -115,7 +117,13 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
       oldest_key_time_(std::numeric_limits<uint64_t>::max()),
       approximate_memory_usage_(0),
       memtable_max_range_deletions_(
-          mutable_cf_options.memtable_max_range_deletions) {
+          mutable_cf_options.memtable_max_range_deletions),
+      key_validation_callback_(
+          (moptions_.protection_bytes_per_key != 0 &&
+           moptions_.memtable_veirfy_per_key_checksum_on_seek)
+              ? std::bind(&MemTable::ValidateKey, this, std::placeholders::_1,
+                          std::placeholders::_2)
+              : std::function<Status(const char*, bool)>(nullptr)) {
   UpdateFlushState();
   // something went wrong if we need to flush before inserting anything
   assert(!ShouldScheduleFlush());
@@ -134,6 +142,16 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
   auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
   size_t size = cached_range_tombstone_.Size();
   for (size_t i = 0; i < size; ++i) {
+#if defined(__cpp_lib_atomic_shared_ptr)
+    std::atomic<std::shared_ptr<FragmentedRangeTombstoneListCache>>*
+        local_cache_ref_ptr = cached_range_tombstone_.AccessAtCore(i);
+    auto new_local_cache_ref = std::make_shared<
+        const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+    std::shared_ptr<FragmentedRangeTombstoneListCache> aliased_ptr(
+        new_local_cache_ref, new_cache.get());
+    local_cache_ref_ptr->store(std::move(aliased_ptr),
+                               std::memory_order_relaxed);
+#else
     std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
         cached_range_tombstone_.AccessAtCore(i);
     auto new_local_cache_ref = std::make_shared<
@@ -143,11 +161,11 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
         std::shared_ptr<FragmentedRangeTombstoneListCache>(new_local_cache_ref,
                                                            new_cache.get()),
         std::memory_order_relaxed);
+#endif
   }
   const Comparator* ucmp = cmp.user_comparator();
   assert(ucmp);
   ts_sz_ = ucmp->timestamp_size();
-  persist_user_defined_timestamps_ = ioptions.persist_user_defined_timestamps;
 }
 
 MemTable::~MemTable() {
@@ -169,21 +187,26 @@ size_t MemTable::ApproximateMemoryUsage() {
     }
     total_usage += usage;
   }
-  approximate_memory_usage_.store(total_usage, std::memory_order_relaxed);
+  approximate_memory_usage_.StoreRelaxed(total_usage);
   // otherwise, return the actual usage
   return total_usage;
 }
 
 bool MemTable::ShouldFlushNow() {
+  if (IsMarkedForFlush()) {
+    // TODO: dedicated flush reason when marked for flush
+    return true;
+  }
+
   // This is set if memtable_max_range_deletions is > 0,
   // and that many range deletions are done
   if (memtable_max_range_deletions_ > 0 &&
-      num_range_deletes_.load(std::memory_order_relaxed) >=
+      num_range_deletes_.LoadRelaxed() >=
           static_cast<uint64_t>(memtable_max_range_deletions_)) {
     return true;
   }
 
-  size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
+  size_t write_buffer_size = write_buffer_size_.LoadRelaxed();
   // In a lot of times, we cannot allocate arena blocks that exactly matches the
   // buffer size. Thus we have to decide if we should over-allocate or
   // under-allocate.
@@ -192,13 +215,14 @@ bool MemTable::ShouldFlushNow() {
   // allocate one more block.
   const double kAllowOverAllocationRatio = 0.6;
 
+  // range deletion use skip list which allocates all memeory through `arena_`
+  assert(range_del_table_->ApproximateMemoryUsage() == 0);
   // If arena still have room for new block allocation, we can safely say it
   // shouldn't flush.
-  auto allocated_memory = table_->ApproximateMemoryUsage() +
-                          range_del_table_->ApproximateMemoryUsage() +
-                          arena_.MemoryAllocatedBytes();
+  auto allocated_memory =
+      table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes();
 
-  approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
+  approximate_memory_usage_.StoreRelaxed(allocated_memory);
 
   // if we can still allocate one more block without exceeding the
   // over-allocation ratio, then we should not flush.
@@ -378,7 +402,11 @@ class MemTableIterator : public InternalIterator {
             !mem.GetImmutableMemTableOptions()->inplace_update_support),
         arena_mode_(arena != nullptr),
         paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
-        allow_data_in_error(mem.moptions_.allow_data_in_errors) {
+        validate_on_seek_(
+            mem.moptions_.paranoid_memory_checks ||
+            mem.moptions_.memtable_veirfy_per_key_checksum_on_seek),
+        allow_data_in_error_(mem.moptions_.allow_data_in_errors),
+        key_validation_callback_(mem.key_validation_callback_) {
     if (kind == kRangeDelEntries) {
       iter_ = mem.range_del_table_->GetIterator(arena);
     } else if (prefix_extractor_ != nullptr &&
@@ -447,8 +475,10 @@ class MemTableIterator : public InternalIterator {
         }
       }
     }
-    if (paranoid_memory_checks_) {
-      status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
+    if (validate_on_seek_) {
+      status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error_,
+                                       paranoid_memory_checks_,
+                                       key_validation_callback_);
     } else {
       iter_->Seek(k, nullptr);
     }
@@ -472,8 +502,10 @@ class MemTableIterator : public InternalIterator {
         }
       }
     }
-    if (paranoid_memory_checks_) {
-      status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
+    if (validate_on_seek_) {
+      status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error_,
+                                       paranoid_memory_checks_,
+                                       key_validation_callback_);
     } else {
       iter_->Seek(k, nullptr);
     }
@@ -502,7 +534,7 @@ class MemTableIterator : public InternalIterator {
     PERF_COUNTER_ADD(next_on_memtable_count, 1);
     assert(Valid());
     if (paranoid_memory_checks_) {
-      status_ = iter_->NextAndValidate(allow_data_in_error);
+      status_ = iter_->NextAndValidate(allow_data_in_error_);
     } else {
       iter_->Next();
       TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
@@ -524,7 +556,7 @@ class MemTableIterator : public InternalIterator {
     PERF_COUNTER_ADD(prev_on_memtable_count, 1);
     assert(Valid());
     if (paranoid_memory_checks_) {
-      status_ = iter_->PrevAndValidate(allow_data_in_error);
+      status_ = iter_->PrevAndValidate(allow_data_in_error_);
     } else {
       iter_->Prev();
     }
@@ -583,7 +615,9 @@ class MemTableIterator : public InternalIterator {
   bool value_pinned_;
   bool arena_mode_;
   const bool paranoid_memory_checks_;
-  const bool allow_data_in_error;
+  const bool validate_on_seek_;
+  const bool allow_data_in_error_;
+  const std::function<Status(const char*, bool)> key_validation_callback_;
 
   void VerifyEntryChecksum() {
     if (protection_bytes_per_key_ > 0 && Valid()) {
@@ -740,7 +774,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
     const ReadOptions& read_options, SequenceNumber read_seq,
     bool immutable_memtable) {
   if (read_options.ignore_range_deletions ||
-      is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+      is_range_del_table_empty_.LoadRelaxed()) {
     return nullptr;
   }
   return NewRangeTombstoneIteratorInternal(read_options, read_seq,
@@ -751,7 +785,7 @@ FragmentedRangeTombstoneIterator*
 MemTable::NewTimestampStrippingRangeTombstoneIterator(
     const ReadOptions& read_options, SequenceNumber read_seq, size_t ts_sz) {
   if (read_options.ignore_range_deletions ||
-      is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+      is_range_del_table_empty_.LoadRelaxed()) {
     return nullptr;
   }
   if (!timestamp_stripping_fragmented_range_tombstone_list_) {
@@ -785,8 +819,13 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
 
   // takes current cache
   std::shared_ptr<FragmentedRangeTombstoneListCache> cache =
+#if defined(__cpp_lib_atomic_shared_ptr)
+      cached_range_tombstone_.Access()->load(std::memory_order_relaxed)
+#else
       std::atomic_load_explicit(cached_range_tombstone_.Access(),
-                                std::memory_order_relaxed);
+                                std::memory_order_relaxed)
+#endif
+      ;
   // construct fragmented tombstone list if necessary
   if (!cache->initialized.load(std::memory_order_acquire)) {
     cache->reader_mutex.lock();
@@ -810,7 +849,7 @@ void MemTable::ConstructFragmentedRangeTombstones() {
   // There should be no concurrent Construction.
   // We could also check fragmented_range_tombstone_list_ to avoid repeate
   // constructions. We just construct them here again to be safe.
-  if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+  if (!is_range_del_table_empty_.LoadRelaxed()) {
     // TODO: plumb Env::IOActivity, Env::IOPriority
     auto* unfragmented_iter = new MemTableIterator(
         MemTableIterator::kRangeDelEntries, *this, ReadOptions());
@@ -833,7 +872,7 @@ ReadOnlyMemTable::MemTableStats MemTable::ApproximateStats(
   if (entry_count == 0) {
     return {0, 0};
   }
-  uint64_t n = num_entries_.load(std::memory_order_relaxed);
+  uint64_t n = num_entries_.LoadRelaxed();
   if (n == 0) {
     return {0, 0};
   }
@@ -843,7 +882,7 @@ ReadOnlyMemTable::MemTableStats MemTable::ApproximateStats(
     // the inaccuracy.
     entry_count = n;
   }
-  uint64_t data_size = data_size_.load(std::memory_order_relaxed);
+  uint64_t data_size = data_size_.LoadRelaxed();
   return {entry_count * (data_size / n), entry_count};
 }
 
@@ -973,17 +1012,14 @@ Status MemTable::Add(SequenceNumber s, ValueType type,
 
     // this is a bit ugly, but is the way to avoid locked instructions
     // when incrementing an atomic
-    num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
-                       std::memory_order_relaxed);
-    data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
-                     std::memory_order_relaxed);
+    num_entries_.StoreRelaxed(num_entries_.LoadRelaxed() + 1);
+    data_size_.StoreRelaxed(data_size_.LoadRelaxed() + encoded_len);
     if (type == kTypeDeletion || type == kTypeSingleDeletion ||
         type == kTypeDeletionWithTimestamp) {
-      num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
-                         std::memory_order_relaxed);
+      num_deletes_.StoreRelaxed(num_deletes_.LoadRelaxed() + 1);
     } else if (type == kTypeRangeDeletion) {
-      uint64_t val = num_range_deletes_.load(std::memory_order_relaxed) + 1;
-      num_range_deletes_.store(val, std::memory_order_relaxed);
+      uint64_t val = num_range_deletes_.LoadRelaxed() + 1;
+      num_range_deletes_.StoreRelaxed(val);
     }
 
     if (bloom_filter_ && prefix_extractor_ &&
@@ -1054,6 +1090,16 @@ Status MemTable::Add(SequenceNumber s, ValueType type,
       range_del_mutex_.lock();
     }
     for (size_t i = 0; i < size; ++i) {
+#if defined(__cpp_lib_atomic_shared_ptr)
+      std::atomic<std::shared_ptr<FragmentedRangeTombstoneListCache>>*
+          local_cache_ref_ptr = cached_range_tombstone_.AccessAtCore(i);
+      auto new_local_cache_ref = std::make_shared<
+          const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+      std::shared_ptr<FragmentedRangeTombstoneListCache> aliased_ptr(
+          new_local_cache_ref, new_cache.get());
+      local_cache_ref_ptr->store(std::move(aliased_ptr),
+                                 std::memory_order_relaxed);
+#else
       std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
           cached_range_tombstone_.AccessAtCore(i);
       auto new_local_cache_ref = std::make_shared<
@@ -1068,12 +1114,13 @@ Status MemTable::Add(SequenceNumber s, ValueType type,
           std::shared_ptr<FragmentedRangeTombstoneListCache>(
               new_local_cache_ref, new_cache.get()),
           std::memory_order_relaxed);
+#endif
     }
 
     if (allow_concurrent) {
       range_del_mutex_.unlock();
     }
-    is_range_del_table_empty_.store(false, std::memory_order_relaxed);
+    is_range_del_table_empty_.StoreRelaxed(false);
   }
   UpdateOldestKeyTime();
 
@@ -1464,11 +1511,13 @@ void MemTable::GetFromTable(const LookupKey& key,
   saver.allow_data_in_errors = moptions_.allow_data_in_errors;
   saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
 
-  if (!moptions_.paranoid_memory_checks) {
+  if (!moptions_.paranoid_memory_checks &&
+      !moptions_.memtable_veirfy_per_key_checksum_on_seek) {
     table_->Get(key, &saver, SaveValue);
   } else {
-    Status check_s = table_->GetAndValidate(key, &saver, SaveValue,
-                                            moptions_.allow_data_in_errors);
+    Status check_s = table_->GetAndValidate(
+        key, &saver, SaveValue, moptions_.allow_data_in_errors,
+        moptions_.paranoid_memory_checks, key_validation_callback_);
     if (check_s.IsCorruption()) {
       *(saver.status) = check_s;
       // Should stop searching the LSM.
@@ -1479,6 +1528,11 @@ void MemTable::GetFromTable(const LookupKey& key,
   *seq = saver.seq;
 }
 
+Status MemTable::ValidateKey(const char* key, bool allow_data_in_errors) {
+  return VerifyEntryChecksum(key, moptions_.protection_bytes_per_key,
+                             allow_data_in_errors);
+}
+
 void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
                         ReadCallback* callback, bool immutable_memtable) {
   // The sequence number is updated synchronously in version_set.h
@@ -1492,7 +1546,7 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
   // range tombstones. This is the simplest way to ensure range tombstones are
   // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
   bool no_range_del = read_options.ignore_range_deletions ||
-                      is_range_del_table_empty_.load(std::memory_order_relaxed);
+                      is_range_del_table_empty_.LoadRelaxed();
   MultiGetRange temp_range(*range, range->begin(), range->end());
   if (bloom_filter_ && no_range_del) {
     bool whole_key =
@@ -1801,7 +1855,7 @@ uint64_t MemTable::GetMinLogContainingPrepSection() {
 }
 
 void MemTable::MaybeUpdateNewestUDT(const Slice& user_key) {
-  if (ts_sz_ == 0 || persist_user_defined_timestamps_) {
+  if (ts_sz_ == 0) {
     return;
   }
   const Comparator* ucmp = GetInternalKeyComparator().user_comparator();
@@ -1812,9 +1866,7 @@ void MemTable::MaybeUpdateNewestUDT(const Slice& user_key) {
 }
 
 const Slice& MemTable::GetNewestUDT() const {
-  // This path should not be invoked for MemTables that does not enable the UDT
-  // in Memtable only feature.
-  assert(ts_sz_ > 0 && !persist_user_defined_timestamps_);
+  assert(ts_sz_ > 0);
   return newest_udt_;
 }
 
diff --git a/db/memtable.h b/db/memtable.h
index 7032a3af449c..fb3d2323156b 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -8,7 +8,6 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
-#include <atomic>
 #include <deque>
 #include <functional>
 #include <memory>
@@ -30,6 +29,7 @@
 #include "rocksdb/db.h"
 #include "rocksdb/memtablerep.h"
 #include "table/multiget_context.h"
+#include "util/atomic.h"
 #include "util/cast_util.h"
 #include "util/dynamic_bloom.h"
 #include "util/hash.h"
@@ -64,6 +64,7 @@ struct ImmutableMemTableOptions {
   uint32_t protection_bytes_per_key;
   bool allow_data_in_errors;
   bool paranoid_memory_checks;
+  bool memtable_veirfy_per_key_checksum_on_seek;
 };
 
 // Batched counters to updated when inserting keys in one write batch.
@@ -354,13 +355,13 @@ class ReadOnlyMemTable {
   // be flushed to storage
   // REQUIRES: external synchronization to prevent simultaneous
   // operations on the same MemTable.
-  uint64_t GetNextLogNumber() const { return mem_next_logfile_number_; }
+  uint64_t GetNextLogNumber() const { return mem_next_walfile_number_; }
 
   // Sets the next active logfile number when this memtable is about to
   // be flushed to storage
   // REQUIRES: external synchronization to prevent simultaneous
   // operations on the same MemTable.
-  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+  void SetNextLogNumber(uint64_t num) { mem_next_walfile_number_ = num; }
 
   // REQUIRES: db_mutex held.
   void SetID(uint64_t id) { id_ = id; }
@@ -496,6 +497,10 @@ class ReadOnlyMemTable {
     return false;
   }
 
+  void MarkForFlush() { marked_for_flush_.StoreRelaxed(true); }
+
+  bool IsMarkedForFlush() const { return marked_for_flush_.LoadRelaxed(); }
+
  protected:
   friend class MemTableList;
 
@@ -511,7 +516,7 @@ class ReadOnlyMemTable {
   VersionEdit edit_;
 
   // The log files earlier than this number can be deleted.
-  uint64_t mem_next_logfile_number_{0};
+  uint64_t mem_next_walfile_number_{0};
 
   // Memtable id to track flush.
   uint64_t id_ = 0;
@@ -524,6 +529,8 @@ class ReadOnlyMemTable {
 
   // Flush job info of the current memtable.
   std::unique_ptr<FlushJobInfo> flush_job_info_;
+
+  RelaxedAtomic<bool> marked_for_flush_{false};
 };
 
 class MemTable final : public ReadOnlyMemTable {
@@ -561,7 +568,7 @@ class MemTable final : public ReadOnlyMemTable {
   // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
   // require external synchronization. The value may be less accurate though
   size_t ApproximateMemoryUsageFast() const {
-    return approximate_memory_usage_.load(std::memory_order_relaxed);
+    return approximate_memory_usage_.LoadRelaxed();
   }
 
   size_t MemoryAllocatedBytes() const override {
@@ -681,49 +688,42 @@ class MemTable final : public ReadOnlyMemTable {
   // Update counters and flush status after inserting a whole write batch
   // Used in concurrent memtable inserts.
   void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
-    num_entries_.fetch_add(update_counters.num_entries,
-                           std::memory_order_relaxed);
-    data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
+    table_->BatchPostProcess();
+    num_entries_.FetchAddRelaxed(update_counters.num_entries);
+    data_size_.FetchAddRelaxed(update_counters.data_size);
     if (update_counters.num_deletes != 0) {
-      num_deletes_.fetch_add(update_counters.num_deletes,
-                             std::memory_order_relaxed);
+      num_deletes_.FetchAddRelaxed(update_counters.num_deletes);
     }
     if (update_counters.num_range_deletes > 0) {
-      num_range_deletes_.fetch_add(update_counters.num_range_deletes,
-                                   std::memory_order_relaxed);
+      num_range_deletes_.FetchAddRelaxed(update_counters.num_range_deletes);
+      // noop for skip-list memtable
+      // Besides correctness test in stress test, memtable flush record count
+      // check will catch this if it were not noop.
+      // range_del_table_->BatchPostProcess();
     }
     UpdateFlushState();
   }
 
-  uint64_t NumEntries() const override {
-    return num_entries_.load(std::memory_order_relaxed);
-  }
+  uint64_t NumEntries() const override { return num_entries_.LoadRelaxed(); }
 
-  uint64_t NumDeletion() const override {
-    return num_deletes_.load(std::memory_order_relaxed);
-  }
+  uint64_t NumDeletion() const override { return num_deletes_.LoadRelaxed(); }
 
   uint64_t NumRangeDeletion() const override {
-    return num_range_deletes_.load(std::memory_order_relaxed);
+    return num_range_deletes_.LoadRelaxed();
   }
 
-  uint64_t GetDataSize() const override {
-    return data_size_.load(std::memory_order_relaxed);
-  }
+  uint64_t GetDataSize() const override { return data_size_.LoadRelaxed(); }
 
-  size_t write_buffer_size() const {
-    return write_buffer_size_.load(std::memory_order_relaxed);
-  }
+  size_t write_buffer_size() const { return write_buffer_size_.LoadRelaxed(); }
 
   // Dynamically change the memtable's capacity. If set below the current usage,
   // the next key added will trigger a flush. Can only increase size when
   // memtable prefix bloom is disabled, since we can't easily allocate more
-  // space.
+  // space. Non-atomic update ok because this is only called with DB mutex held.
   void UpdateWriteBufferSize(size_t new_write_buffer_size) {
     if (bloom_filter_ == nullptr ||
-        new_write_buffer_size < write_buffer_size_) {
-      write_buffer_size_.store(new_write_buffer_size,
-                               std::memory_order_relaxed);
+        new_write_buffer_size < write_buffer_size_.LoadRelaxed()) {
+      write_buffer_size_.StoreRelaxed(new_write_buffer_size);
     }
   }
 
@@ -815,9 +815,11 @@ class MemTable final : public ReadOnlyMemTable {
 
   bool IsFragmentedRangeTombstonesConstructed() const override {
     return fragmented_range_tombstone_list_.get() != nullptr ||
-           is_range_del_table_empty_;
+           is_range_del_table_empty_.LoadRelaxed();
   }
 
+  //  Gets the newest user defined timestamps in the memtable. This should only
+  //  be called when user defined timestamp is enabled.
   const Slice& GetNewestUDT() const override;
 
   // Returns Corruption status if verification fails.
@@ -825,6 +827,9 @@ class MemTable final : public ReadOnlyMemTable {
                                     uint32_t protection_bytes_per_key,
                                     bool allow_data_in_errors = false);
 
+  // Validate the checksum of the key/value pair.
+  Status ValidateKey(const char* key, bool allow_data_in_errors);
+
  private:
   enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
 
@@ -839,16 +844,22 @@ class MemTable final : public ReadOnlyMemTable {
   ConcurrentArena arena_;
   std::unique_ptr<MemTableRep> table_;
   std::unique_ptr<MemTableRep> range_del_table_;
-  std::atomic_bool is_range_del_table_empty_;
+  // This is OK to be relaxed access because consistency between table_ and
+  // range_del_table_ is provided by explicit multi-versioning with sequence
+  // numbers. It's ok for stale memory to say the range_del_table_ is empty when
+  // it's actually not because if it was relevant to our read (based on sequence
+  // number), the relaxed memory read would get a sufficiently updated value
+  // because of the ordering provided by LastPublishedSequence().
+  RelaxedAtomic<bool> is_range_del_table_empty_;
 
   // Total data size of all data inserted
-  std::atomic<uint64_t> data_size_;
-  std::atomic<uint64_t> num_entries_;
-  std::atomic<uint64_t> num_deletes_;
-  std::atomic<uint64_t> num_range_deletes_;
+  RelaxedAtomic<uint64_t> data_size_;
+  RelaxedAtomic<uint64_t> num_entries_;
+  RelaxedAtomic<uint64_t> num_deletes_;
+  RelaxedAtomic<uint64_t> num_range_deletes_;
 
   // Dynamically changeable memtable option
-  std::atomic<size_t> write_buffer_size_;
+  RelaxedAtomic<size_t> write_buffer_size_;
 
   // The sequence number of the kv that was inserted first
   std::atomic<SequenceNumber> first_seqno_;
@@ -884,7 +895,7 @@ class MemTable final : public ReadOnlyMemTable {
 
   // keep track of memory usage in table_, arena_, and range_del_table_.
   // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
-  std::atomic<uint64_t> approximate_memory_usage_;
+  RelaxedAtomic<uint64_t> approximate_memory_usage_;
 
   // max range deletions in a memtable,  before automatic flushing, 0 for
   // unlimited.
@@ -893,14 +904,10 @@ class MemTable final : public ReadOnlyMemTable {
   // Size in bytes for the user-defined timestamps.
   size_t ts_sz_;
 
-  // Whether to persist user-defined timestamps
-  bool persist_user_defined_timestamps_;
-
   // Newest user-defined timestamp contained in this MemTable. For ts1, and ts2
   // if Comparator::CompareTimestamp(ts1, ts2) > 0, ts1 is considered newer than
   // ts2. We track this field for a MemTable if its column family has UDT
-  // feature enabled and the `persist_user_defined_timestamp` flag is false.
-  // Otherwise, this field just contains an empty Slice.
+  // feature enabled.
   Slice newest_udt_;
 
   // Updates flush_state_ using ShouldFlushNow()
@@ -939,14 +946,22 @@ class MemTable final : public ReadOnlyMemTable {
 
   // makes sure there is a single range tombstone writer to invalidate cache
   std::mutex range_del_mutex_;
+#if defined(__cpp_lib_atomic_shared_ptr)
+  CoreLocalArray<
+      std::atomic<std::shared_ptr<FragmentedRangeTombstoneListCache>>>
+      cached_range_tombstone_;
+#else
   CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
       cached_range_tombstone_;
 
+#endif
   void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
                            const Slice& key, const Slice& value, ValueType type,
                            SequenceNumber s, char* checksum_ptr);
 
   void MaybeUpdateNewestUDT(const Slice& user_key);
+
+  const std::function<Status(const char*, bool)> key_validation_callback_;
 };
 
 const char* EncodeKey(std::string* scratch, const Slice& target);
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
index 2643110a13c3..93d8b05f836d 100644
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@@ -51,9 +51,7 @@ void MemTableListVersion::UnrefMemTable(
 
 MemTableListVersion::MemTableListVersion(
     size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old)
-    : max_write_buffer_number_to_maintain_(
-          old.max_write_buffer_number_to_maintain_),
-      max_write_buffer_size_to_maintain_(
+    : max_write_buffer_size_to_maintain_(
           old.max_write_buffer_size_to_maintain_),
       parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {
   memlist_ = old.memlist_;
@@ -69,10 +67,8 @@ MemTableListVersion::MemTableListVersion(
 
 MemTableListVersion::MemTableListVersion(
     size_t* parent_memtable_list_memory_usage,
-    int max_write_buffer_number_to_maintain,
     int64_t max_write_buffer_size_to_maintain)
-    : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain),
-      max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain),
+    : max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain),
       parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {}
 
 void MemTableListVersion::Ref() { ++refs_; }
@@ -323,8 +319,7 @@ void MemTableListVersion::Remove(ReadOnlyMemTable* m,
   memlist_.remove(m);
 
   m->MarkFlushed();
-  if (max_write_buffer_size_to_maintain_ > 0 ||
-      max_write_buffer_number_to_maintain_ > 0) {
+  if (max_write_buffer_size_to_maintain_ > 0) {
     memlist_history_.push_front(m);
     // Unable to get size of mutable memtable at this point, pass 0 to
     // TrimHistory as a best effort.
@@ -356,9 +351,6 @@ bool MemTableListVersion::MemtableLimitExceeded(size_t usage) {
     // whether to trim history
     return MemoryAllocatedBytesExcludingLast() + usage >=
            static_cast<size_t>(max_write_buffer_size_to_maintain_);
-  } else if (max_write_buffer_number_to_maintain_ > 0) {
-    return memlist_.size() + memlist_history_.size() >
-           static_cast<size_t>(max_write_buffer_number_to_maintain_);
   } else {
     return false;
   }
@@ -382,6 +374,19 @@ bool MemTableListVersion::TrimHistory(autovector<ReadOnlyMemTable*>* to_delete,
   return ret;
 }
 
+const Slice& MemTableListVersion::GetNewestUDT() const {
+  static Slice kEmptySlice;
+  for (auto it = memlist_.begin(); it != memlist_.end(); ++it) {
+    ReadOnlyMemTable* m = *it;
+    Slice timestamp = m->GetNewestUDT();
+    assert(!timestamp.empty() || m->IsEmpty());
+    if (!timestamp.empty()) {
+      return m->GetNewestUDT();
+    }
+  }
+  return kEmptySlice;
+}
+
 // Returns true if there is at least one memtable on which flush has
 // not yet started.
 bool MemTableList::IsFlushPending() const {
diff --git a/db/memtable_list.h b/db/memtable_list.h
index 155878bdc268..eb42e1c7276a 100644
--- a/db/memtable_list.h
+++ b/db/memtable_list.h
@@ -45,7 +45,6 @@ class MemTableListVersion {
   explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
                                const MemTableListVersion& old);
   explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
-                               int max_write_buffer_number_to_maintain,
                                int64_t max_write_buffer_size_to_maintain);
 
   void Ref();
@@ -150,6 +149,12 @@ class MemTableListVersion {
 
   int NumFlushed() const { return static_cast<int>(memlist_history_.size()); }
 
+  // Gets the newest user defined timestamps from the immutable memtables.
+  // This returns the newest user defined timestamp found in the most recent
+  // immutable memtable. This should only be called when user defined timestamp
+  // is enabled.
+  const Slice& GetNewestUDT() const;
+
  private:
   friend class MemTableList;
 
@@ -209,8 +214,6 @@ class MemTableListVersion {
   // (used during Transaction validation)
   std::list<ReadOnlyMemTable*> memlist_history_;
 
-  // Maximum number of MemTables to keep in memory (including both flushed
-  const int max_write_buffer_number_to_maintain_;
   // Maximum size of MemTables to keep in memory (including both flushed
   // and not-yet-flushed tables).
   const int64_t max_write_buffer_size_to_maintain_;
@@ -238,13 +241,11 @@ class MemTableList {
  public:
   // A list of memtables.
   explicit MemTableList(int min_write_buffer_number_to_merge,
-                        int max_write_buffer_number_to_maintain,
                         int64_t max_write_buffer_size_to_maintain)
       : imm_flush_needed(false),
         imm_trim_needed(false),
         min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
         current_(new MemTableListVersion(&current_memory_usage_,
-                                         max_write_buffer_number_to_maintain,
                                          max_write_buffer_size_to_maintain)),
         num_flush_not_started_(0),
         commit_in_progress_(false),
diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc
index cefb4653d616..c5589b2643a0 100644
--- a/db/memtable_list_test.cc
+++ b/db/memtable_list_test.cc
@@ -33,12 +33,12 @@ std::string ValueWithWriteTime(std::string value, uint64_t write_time) {
 class MemTableListTest : public testing::Test {
  public:
   std::string dbname;
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   std::vector<ColumnFamilyHandle*> handles;
   std::atomic<uint64_t> file_number;
 
-  MemTableListTest() : db(nullptr), file_number(1) {
+  MemTableListTest() : file_number(1) {
     dbname = test::PerThreadDBPath("memtable_list_test");
     options.create_if_missing = true;
     EXPECT_OK(DestroyDB(dbname, options));
@@ -88,8 +88,7 @@ class MemTableListTest : public testing::Test {
         }
       }
       handles.clear();
-      delete db;
-      db = nullptr;
+      db.reset();
       EXPECT_OK(DestroyDB(dbname, options, cf_descs));
     }
   }
@@ -112,7 +111,8 @@ class MemTableListTest : public testing::Test {
     WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
     WriteController write_controller(10000000u);
 
-    VersionSet versions(dbname, &immutable_db_options, env_options,
+    VersionSet versions(dbname, &immutable_db_options,
+                        MutableDBOptions{db_options}, env_options,
                         table_cache.get(), &write_buffer_manager,
                         &write_controller, /*block_cache_tracer=*/nullptr,
                         /*io_tracer=*/nullptr, /*db_id=*/"",
@@ -163,7 +163,8 @@ class MemTableListTest : public testing::Test {
     WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
     WriteController write_controller(10000000u);
 
-    VersionSet versions(dbname, &immutable_db_options, env_options,
+    VersionSet versions(dbname, &immutable_db_options,
+                        MutableDBOptions{db_options}, env_options,
                         table_cache.get(), &write_buffer_manager,
                         &write_controller, /*block_cache_tracer=*/nullptr,
                         /*io_tracer=*/nullptr, /*db_id=*/"",
@@ -220,7 +221,7 @@ class MemTableListTest : public testing::Test {
 
 TEST_F(MemTableListTest, Empty) {
   // Create an empty MemTableList and validate basic functions.
-  MemTableList list(1, 0, 0);
+  MemTableList list(1, 0);
 
   ASSERT_EQ(0, list.NumNotFlushed());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
@@ -239,10 +240,8 @@ TEST_F(MemTableListTest, Empty) {
 TEST_F(MemTableListTest, GetTest) {
   // Create MemTableList
   int min_write_buffer_number_to_merge = 2;
-  int max_write_buffer_number_to_maintain = 0;
   int64_t max_write_buffer_size_to_maintain = 0;
   MemTableList list(min_write_buffer_number_to_merge,
-                    max_write_buffer_number_to_maintain,
                     max_write_buffer_size_to_maintain);
 
   SequenceNumber seq = 1;
@@ -407,10 +406,8 @@ TEST_F(MemTableListTest, GetTest) {
 TEST_F(MemTableListTest, GetFromHistoryTest) {
   // Create MemTableList
   int min_write_buffer_number_to_merge = 2;
-  int max_write_buffer_number_to_maintain = 2;
   int64_t max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize;
   MemTableList list(min_write_buffer_number_to_merge,
-                    max_write_buffer_number_to_maintain,
                     max_write_buffer_size_to_maintain);
 
   SequenceNumber seq = 1;
@@ -653,11 +650,9 @@ TEST_F(MemTableListTest, FlushPendingTest) {
 
   // Create MemTableList
   int min_write_buffer_number_to_merge = 3;
-  int max_write_buffer_number_to_maintain = 7;
   int64_t max_write_buffer_size_to_maintain =
       7 * static_cast<int>(options.write_buffer_size);
   MemTableList list(min_write_buffer_number_to_merge,
-                    max_write_buffer_number_to_maintain,
                     max_write_buffer_size_to_maintain);
 
   // Create some MemTables
@@ -949,13 +944,11 @@ TEST_F(MemTableListTest, AtomicFlushTest) {
 
   // Create MemTableLists
   int min_write_buffer_number_to_merge = 3;
-  int max_write_buffer_number_to_maintain = 7;
   int64_t max_write_buffer_size_to_maintain =
       7 * static_cast<int64_t>(options.write_buffer_size);
   autovector<MemTableList*> lists;
   for (int i = 0; i != num_cfs; ++i) {
     lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge,
-                                        max_write_buffer_number_to_maintain,
                                         max_write_buffer_size_to_maintain));
   }
 
@@ -1104,11 +1097,9 @@ TEST_F(MemTableListWithTimestampTest, GetTableNewestUDT) {
 
   // Create MemTableList
   int min_write_buffer_number_to_merge = 1;
-  int max_write_buffer_number_to_maintain = 4;
   int64_t max_write_buffer_size_to_maintain =
       4 * static_cast<int>(options.write_buffer_size);
   MemTableList list(min_write_buffer_number_to_merge,
-                    max_write_buffer_number_to_maintain,
                     max_write_buffer_size_to_maintain);
 
   // Create some MemTables
diff --git a/db/merge_helper.cc b/db/merge_helper.cc
index 2576aae840d7..0261ba0e27db 100644
--- a/db/merge_helper.cc
+++ b/db/merge_helper.cc
@@ -497,6 +497,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter,
           ikey.sequence <= latest_snapshot_
               ? CompactionFilter::Decision::kKeep
               : FilterMerge(orig_ikey.user_key, value_slice);
+      // FIXME: should also check for kRemove here
       if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil &&
           range_del_agg != nullptr &&
           range_del_agg->ShouldDelete(
diff --git a/db/merge_helper.h b/db/merge_helper.h
index 39bd15f60876..098b9b5baba6 100644
--- a/db/merge_helper.h
+++ b/db/merge_helper.h
@@ -250,7 +250,7 @@ class MergeHelper {
   // Parallel with keys_; stores the operands
   mutable MergeContext merge_context_;
 
-  StopWatchNano filter_timer_;
+  StopWatchNano<> filter_timer_;
   uint64_t total_filter_time_;
   Statistics* stats_;
 
@@ -307,7 +307,7 @@ class MergeOutputIterator {
 
   Slice key() { return Slice(*it_keys_); }
   Slice value() { return Slice(*it_values_); }
-  bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }
+  bool Valid() const { return it_keys_ != merge_helper_->keys().rend(); }
 
  private:
   const MergeHelper* merge_helper_;
diff --git a/db/merge_operator.cc b/db/merge_operator.cc
index bb5dbbc36533..ef12f726d393 100644
--- a/db/merge_operator.cc
+++ b/db/merge_operator.cc
@@ -32,6 +32,7 @@ bool MergeOperator::FullMergeV3(const MergeOperationInputV3& merge_in,
                                 MergeOperationOutputV3* merge_out) const {
   assert(merge_out);
 
+  Slice value_of_default;  // avoid warning about in_v2 pointing at this
   MergeOperationInput in_v2(merge_in.key, nullptr, merge_in.operand_list,
                             merge_in.logger);
 
@@ -66,7 +67,6 @@ bool MergeOperator::FullMergeV3(const MergeOperationInputV3& merge_in,
             const bool has_default_column =
                 WideColumnsHelper::HasDefaultColumn(existing_columns);
 
-            Slice value_of_default;
             if (has_default_column) {
               value_of_default = existing_columns.front().value();
             }
diff --git a/db/merge_test.cc b/db/merge_test.cc
index 0592856b7353..5f3546d6ce93 100644
--- a/db/merge_test.cc
+++ b/db/merge_test.cc
@@ -19,6 +19,7 @@
 #include "rocksdb/utilities/db_ttl.h"
 #include "rocksdb/wide_columns.h"
 #include "test_util/testharness.h"
+#include "util/cast_util.h"
 #include "util/coding.h"
 #include "utilities/merge_operators.h"
 
@@ -96,9 +97,9 @@ class EnvMergeTest : public EnvWrapper {
 uint64_t EnvMergeTest::now_nanos_count_{0};
 std::unique_ptr<EnvMergeTest> EnvMergeTest::singleton_;
 
-std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
+std::unique_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
                            const size_t max_successive_merges = 0) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   options.merge_operator = std::make_shared<CountMergeOperator>();
@@ -109,7 +110,7 @@ std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
   if (ttl) {
     DBWithTTL* db_with_ttl;
     s = DBWithTTL::Open(options, dbname, &db_with_ttl);
-    db = db_with_ttl;
+    db.reset(db_with_ttl);
   } else {
     s = DB::Open(options, dbname, &db);
   }
@@ -118,7 +119,7 @@ std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
   // Allowed to call NowNanos during DB creation (in GenerateRawUniqueId() for
   // session ID)
   EnvMergeTest::now_nanos_count_ = 0;
-  return std::shared_ptr<DB>(db);
+  return db;
 }
 
 // Imagine we are maintaining a set of uint64 counters.
@@ -128,7 +129,7 @@ std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
 // This is a quick implementation without a Merge operation.
 class Counters {
  protected:
-  std::shared_ptr<DB> db_;
+  UnownedPtr<DB> db_;
 
   WriteOptions put_option_;
   ReadOptions get_option_;
@@ -137,7 +138,7 @@ class Counters {
   uint64_t default_;
 
  public:
-  explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+  explicit Counters(UnownedPtr<DB> db, uint64_t defaultCount = 0)
       : db_(db),
         put_option_(),
         get_option_(),
@@ -242,7 +243,7 @@ class MergeBasedCounters : public Counters {
   WriteOptions merge_option_;  // for merge
 
  public:
-  explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+  explicit MergeBasedCounters(UnownedPtr<DB> db, uint64_t defaultCount = 0)
       : Counters(db, defaultCount), merge_option_() {}
 
   // mapped to a rocksdb Merge operation
@@ -261,7 +262,7 @@ class MergeBasedCounters : public Counters {
   }
 };
 
-void dumpDb(DB* db) {
+void dumpDb(const std::unique_ptr<DB>& db) {
   auto it = std::unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
   for (it->SeekToFirst(); it->Valid(); it->Next()) {
     // uint64_t value = DecodeFixed64(it->value().data());
@@ -270,7 +271,8 @@ void dumpDb(DB* db) {
   assert(it->status().ok());  // Check for any errors found during the scan
 }
 
-void testCounters(Counters& counters, DB* db, bool test_compaction) {
+void testCounters(Counters& counters, const std::unique_ptr<DB>& db,
+                  bool test_compaction) {
   FlushOptions o;
   o.wait = true;
 
@@ -320,7 +322,8 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) {
   }
 }
 
-void testCountersWithFlushAndCompaction(Counters& counters, DB* db) {
+void testCountersWithFlushAndCompaction(Counters& counters,
+                                        const std::unique_ptr<DB>& db) {
   ASSERT_OK(db->Put({}, "1", "1"));
   ASSERT_OK(db->Flush(FlushOptions()));
 
@@ -388,12 +391,12 @@ void testCountersWithFlushAndCompaction(Counters& counters, DB* db) {
   SyncPoint::GetInstance()->EnableProcessing();
 
   port::Thread set_options_thread([&]() {
-    ASSERT_OK(static_cast<DBImpl*>(db)->SetOptions(
+    ASSERT_OK(static_cast_with_check<DBImpl>(db.get())->SetOptions(
         {{"disable_auto_compactions", "false"}}));
   });
   TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact");
   port::Thread compact_thread([&]() {
-    ASSERT_OK(static_cast<DBImpl*>(db)->CompactRange(
+    ASSERT_OK(static_cast_with_check<DBImpl>(db.get())->CompactRange(
         CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr));
   });
 
@@ -440,8 +443,8 @@ void testSuccessiveMerge(Counters& counters, size_t max_num_merges,
   }
 }
 
-void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
-                      size_t min_merge, size_t count) {
+void testPartialMerge(Counters* counters, const std::unique_ptr<DB>& db,
+                      size_t max_merge, size_t min_merge, size_t count) {
   FlushOptions o;
   o.wait = true;
 
@@ -481,8 +484,8 @@ void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
   ASSERT_EQ(EnvMergeTest::now_nanos_count_, 0U);
 }
 
-void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
-                                    size_t num_merges) {
+void testSingleBatchSuccessiveMerge(const std::unique_ptr<DB>& db,
+                                    size_t max_num_merges, size_t num_merges) {
   ASSERT_GT(num_merges, max_num_merges);
 
   Slice key("BatchSuccessiveMerge");
@@ -520,13 +523,13 @@ void runTest(const std::string& dbname, const bool use_ttl = false) {
     auto db = OpenDb(dbname, use_ttl);
 
     {
-      Counters counters(db, 0);
-      testCounters(counters, db.get(), true);
+      Counters counters(db.get(), 0);
+      testCounters(counters, db, true);
     }
 
     {
-      MergeBasedCounters counters(db, 0);
-      testCounters(counters, db.get(), use_compression);
+      MergeBasedCounters counters(db.get(), 0);
+      testCounters(counters, db, use_compression);
     }
   }
 
@@ -535,10 +538,10 @@ void runTest(const std::string& dbname, const bool use_ttl = false) {
   {
     size_t max_merge = 5;
     auto db = OpenDb(dbname, use_ttl, max_merge);
-    MergeBasedCounters counters(db, 0);
-    testCounters(counters, db.get(), use_compression);
+    MergeBasedCounters counters(db.get(), 0);
+    testCounters(counters, db, use_compression);
     testSuccessiveMerge(counters, max_merge, max_merge * 2);
-    testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+    testSingleBatchSuccessiveMerge(db, 5, 7);
     ASSERT_OK(db->Close());
     ASSERT_OK(DestroyDB(dbname, Options()));
   }
@@ -549,16 +552,15 @@ void runTest(const std::string& dbname, const bool use_ttl = false) {
     uint32_t min_merge = 2;
     for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
       auto db = OpenDb(dbname, use_ttl, max_merge);
-      MergeBasedCounters counters(db, 0);
-      testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
+      MergeBasedCounters counters(db.get(), 0);
+      testPartialMerge(&counters, db, max_merge, min_merge, count);
       ASSERT_OK(db->Close());
       ASSERT_OK(DestroyDB(dbname, Options()));
     }
     {
       auto db = OpenDb(dbname, use_ttl, max_merge);
-      MergeBasedCounters counters(db, 0);
-      testPartialMerge(&counters, db.get(), max_merge, min_merge,
-                       min_merge * 10);
+      MergeBasedCounters counters(db.get(), 0);
+      testPartialMerge(&counters, db, max_merge, min_merge, min_merge * 10);
       ASSERT_OK(db->Close());
       ASSERT_OK(DestroyDB(dbname, Options()));
     }
@@ -567,18 +569,18 @@ void runTest(const std::string& dbname, const bool use_ttl = false) {
   {
     {
       auto db = OpenDb(dbname);
-      MergeBasedCounters counters(db, 0);
+      MergeBasedCounters counters(db.get(), 0);
       counters.add("test-key", 1);
       counters.add("test-key", 1);
       counters.add("test-key", 1);
       ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
     }
 
-    DB* reopen_db;
+    std::unique_ptr<DB> reopen_db;
     ASSERT_OK(DB::Open(Options(), dbname, &reopen_db));
     std::string value;
     ASSERT_NOK(reopen_db->Get(ReadOptions(), "test-key", &value));
-    delete reopen_db;
+    reopen_db.reset();
     ASSERT_OK(DestroyDB(dbname, Options()));
   }
 
@@ -587,13 +589,13 @@ void runTest(const std::string& dbname, const bool use_ttl = false) {
     std::cout << "Test merge-operator not set after reopen (recovery case)\n";
     {
       auto db = OpenDb(dbname);
-      MergeBasedCounters counters(db, 0);
+      MergeBasedCounters counters(db.get(), 0);
       counters.add("test-key", 1);
       counters.add("test-key", 1);
       counters.add("test-key", 1);
     }
 
-    DB* reopen_db;
+    std::unique_ptr<DB> reopen_db;
     ASSERT_TRUE(DB::Open(Options(), dbname, &reopen_db).IsInvalidArgument());
   }
   */
@@ -614,8 +616,8 @@ TEST_F(MergeTest, MergeWithCompactionAndFlush) {
   {
     auto db = OpenDb(dbname);
     {
-      MergeBasedCounters counters(db, 0);
-      testCountersWithFlushAndCompaction(counters, db.get());
+      MergeBasedCounters counters(db.get(), 0);
+      testCountersWithFlushAndCompaction(counters, db);
     }
   }
   ASSERT_OK(DestroyDB(dbname, Options()));
diff --git a/db/multi_scan.cc b/db/multi_scan.cc
new file mode 100644
index 000000000000..3d3855e0946d
--- /dev/null
+++ b/db/multi_scan.cc
@@ -0,0 +1,76 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using MultiScanIterator = MultiScan::MultiScanIterator;
+
+MultiScan::MultiScan(const ReadOptions& read_options,
+                     const MultiScanArgs& scan_opts, DB* db,
+                     ColumnFamilyHandle* cfh)
+    : read_options_(read_options), scan_opts_(scan_opts), db_(db), cfh_(cfh) {
+  bool slow_path = false;
+  // Setup read_options with iterate_uuper_bound based on the first scan.
+  // Subsequent scans will update and allocate a new DB iterator as necessary
+  if (scan_opts.GetScanRanges()[0].range.limit) {
+    upper_bound_ = *scan_opts.GetScanRanges()[0].range.limit;
+    read_options_.iterate_upper_bound = &upper_bound_;
+  } else {
+    read_options_.iterate_upper_bound = nullptr;
+  }
+  for (const auto& opts : scan_opts.GetScanRanges()) {
+    // Check that all the ScanOptions either specify an upper bound or not. If
+    // its mixed we take the slow path which avoids calling Prepare: we have to
+    // reallocate the Iterator with updated read_options everytime we switch
+    // between upper bound or no upper bound, which complicates Prepare.
+    if (opts.range.limit.has_value() !=
+        scan_opts.GetScanRanges()[0].range.limit.has_value()) {
+      slow_path = true;
+      break;
+    }
+  }
+  db_iter_.reset(db->NewIterator(read_options_, cfh));
+  if (!slow_path) {
+    db_iter_->Prepare(scan_opts);
+  }
+}
+
+MultiScanIterator& MultiScanIterator::operator++() {
+  status_ = db_iter_->status();
+  if (!status_.ok()) {
+    throw MultiScanException(status_);
+  }
+
+  if (idx_ >= scan_opts_.size()) {
+    throw std::logic_error("Index out of range");
+  }
+  idx_++;
+  if (idx_ < scan_opts_.size()) {
+    // Check if we need to update read_options_
+    if (scan_opts_[idx_].range.limit.has_value() !=
+        (read_options_.iterate_upper_bound != nullptr)) {
+      if (scan_opts_[idx_].range.limit) {
+        *upper_bound_ = *scan_opts_[idx_].range.limit;
+        read_options_.iterate_upper_bound = upper_bound_;
+      } else {
+        read_options_.iterate_upper_bound = nullptr;
+      }
+      db_iter_.reset(db_->NewIterator(read_options_, cfh_));
+      scan_.Reset(db_iter_.get());
+    } else if (scan_opts_[idx_].range.limit) {
+      *upper_bound_ = *scan_opts_[idx_].range.limit;
+    }
+    db_iter_->Seek(*scan_opts_[idx_].range.start);
+    status_ = db_iter_->status();
+    if (!status_.ok()) {
+      throw MultiScanException(status_);
+    }
+  }
+  return *this;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc
index eb3ed078c79e..7709a80fcc59 100644
--- a/db/obsolete_files_test.cc
+++ b/db/obsolete_files_test.cc
@@ -303,6 +303,48 @@ TEST_F(ObsoleteFilesTest, BlobFiles) {
   ASSERT_EQ(deleted_files, expected_deleted_files);
 }
 
+TEST_F(ObsoleteFilesTest, GetSortedWalFilesHangsAfterNoopPurge) {
+  // This test used to trigger a hang in `DB::GetSortedWalFiles()`, where it
+  // would wait for a no-op purge that did not signal the CV upon completion.
+
+  // Grab an iterator and flush to switch the super version. That way, when the
+  // iterator is destroyed, it will go through the purge path.
+  DB* db =
+      db_.get();  // Only using `db` makes it clear we only use DB-level APIs.
+  ASSERT_OK(db->Put(WriteOptions(), "key", "value"));
+  std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Sync points ensure `GetSortedWalFiles()` waits for a purge after
+  // `FindObsoleteFiles()` releases the mutex but before its corresponding purge
+  // completes.
+  SyncPoint::GetInstance()->SetCallBack(
+      "FindObsoleteFiles::PostMutexUnlock", [&](void* /* arg */) {
+        TEST_SYNC_POINT(
+            "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:"
+            "InCallback:1");
+        TEST_SYNC_POINT(
+            "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:"
+            "InCallback:2");
+      });
+  SyncPoint::GetInstance()->LoadDependency({
+      {"ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:InCallback:1",
+       "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:Thread:Begin"},
+      {"DBImpl::GetSortedWalFilesImpl:WaitPurge",
+       "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:InCallback:2"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread get_sorted_wal_files_thread([db]() {
+    TEST_SYNC_POINT(
+        "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:Thread:Begin");
+    VectorWalPtr files;
+    ASSERT_OK(db->GetSortedWalFiles(files));
+  });
+  iter.reset();
+  get_sorted_wal_files_thread.join();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/options_file_test.cc b/db/options_file_test.cc
index 7e48f0cf38c1..f420d0dff4df 100644
--- a/db/options_file_test.cc
+++ b/db/options_file_test.cc
@@ -66,16 +66,16 @@ TEST_F(OptionsFileTest, NumberOfOptionsFiles) {
   opt.create_if_missing = true;
   ASSERT_OK(DestroyDB(dbname_, opt));
   std::unordered_set<std::string> filename_history;
-  DB* db;
+  std::unique_ptr<DB> db;
   for (int i = 0; i < kReopenCount; ++i) {
     ASSERT_OK(DB::Open(opt, dbname_, &db));
     int num_options_files = 0;
-    UpdateOptionsFiles(db, &filename_history, &num_options_files);
+    UpdateOptionsFiles(db.get(), &filename_history, &num_options_files);
     ASSERT_GT(num_options_files, 0);
     ASSERT_LE(num_options_files, 2);
     // Make sure we always keep the latest option files.
-    VerifyOptionsFileName(db, filename_history);
-    delete db;
+    VerifyOptionsFileName(db.get(), filename_history);
+    db.reset();
   }
 }
 
diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc
index c439c1ffedf7..fbd38c6c26ee 100644
--- a/db/perf_context_test.cc
+++ b/db/perf_context_test.cc
@@ -38,8 +38,8 @@ const std::string kDbName =
 
 namespace ROCKSDB_NAMESPACE {
 
-std::shared_ptr<DB> OpenDb(bool read_only = false) {
-  DB* db;
+std::unique_ptr<DB> OpenDb(bool read_only = false) {
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   options.max_open_files = -1;
@@ -61,7 +61,7 @@ std::shared_ptr<DB> OpenDb(bool read_only = false) {
     s = DB::OpenForReadOnly(options, kDbName, &db);
   }
   EXPECT_OK(s);
-  return std::shared_ptr<DB>(db);
+  return db;
 }
 
 class PerfContextTest : public testing::Test {};
@@ -659,12 +659,11 @@ TEST_F(PerfContextTest, ToString) {
 
 TEST_F(PerfContextTest, MergeOperatorTime) {
   ASSERT_OK(DestroyDB(kDbName, Options()));
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   options.merge_operator = MergeOperators::CreateStringAppendOperator();
-  Status s = DB::Open(options, kDbName, &db);
-  EXPECT_OK(s);
+  EXPECT_OK(DB::Open(options, kDbName, &db));
 
   std::string val;
   ASSERT_OK(db->Merge(WriteOptions(), "k1", "val1"));
@@ -704,7 +703,7 @@ TEST_F(PerfContextTest, MergeOperatorTime) {
 #endif
   EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
 
-  delete db;
+  db.reset();
 }
 
 TEST_F(PerfContextTest, CopyAndMove) {
@@ -972,13 +971,12 @@ TEST_F(PerfContextTest, CPUTimer) {
 TEST_F(PerfContextTest, MergeOperandCount) {
   ASSERT_OK(DestroyDB(kDbName, Options()));
 
-  DB* db = nullptr;
   Options options;
   options.create_if_missing = true;
   options.merge_operator = MergeOperators::CreateStringAppendOperator();
 
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, kDbName, &db));
-  std::unique_ptr<DB> db_guard(db);
 
   constexpr size_t num_keys = 3;
   const std::string key_prefix("key");
@@ -1007,7 +1005,7 @@ TEST_F(PerfContextTest, MergeOperandCount) {
     for (size_t j = 0; j <= i; ++j) {
       // Take a snapshot before each Merge so they are preserved and not
       // collapsed during flush.
-      snapshots.emplace_back(db);
+      snapshots.emplace_back(db.get());
 
       ASSERT_OK(db->Merge(WriteOptions(), keys[i], value + std::to_string(j)));
     }
@@ -1124,7 +1122,7 @@ TEST_F(PerfContextTest, MergeOperandCount) {
 TEST_F(PerfContextTest, WriteMemtableTimePerfLevel) {
   // Write and check time
   ASSERT_OK(DestroyDB(kDbName, Options()));
-  std::shared_ptr<DB> db = OpenDb();
+  auto db = OpenDb();
 
   SetPerfLevel(PerfLevel::kEnableWait);
   PerfContext* perf_ctx = get_perf_context();
diff --git a/db/periodic_task_scheduler.cc b/db/periodic_task_scheduler.cc
index 2f266529c57c..ee3f07b91e73 100644
--- a/db/periodic_task_scheduler.cc
+++ b/db/periodic_task_scheduler.cc
@@ -26,6 +26,7 @@ static const std::map<PeriodicTaskType, uint64_t> kDefaultPeriodSeconds = {
     {PeriodicTaskType::kPersistStats, kInvalidPeriodSec},
     {PeriodicTaskType::kFlushInfoLog, 10},
     {PeriodicTaskType::kRecordSeqnoTime, kInvalidPeriodSec},
+    {PeriodicTaskType::kTriggerCompaction, 12 * 60 * 60}  // 12 hours
 };
 
 static const std::map<PeriodicTaskType, std::string> kPeriodicTaskTypeNames = {
@@ -33,16 +34,20 @@ static const std::map<PeriodicTaskType, std::string> kPeriodicTaskTypeNames = {
     {PeriodicTaskType::kPersistStats, "pst_st"},
     {PeriodicTaskType::kFlushInfoLog, "flush_info_log"},
     {PeriodicTaskType::kRecordSeqnoTime, "record_seq_time"},
+    {PeriodicTaskType::kTriggerCompaction, "trigger_compaction"},
 };
 
 Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
-                                       const PeriodicTaskFunc& fn) {
-  return Register(task_type, fn, kDefaultPeriodSeconds.at(task_type));
+                                       const PeriodicTaskFunc& fn,
+                                       bool run_immediately) {
+  return Register(task_type, fn, kDefaultPeriodSeconds.at(task_type),
+                  run_immediately);
 }
 
 Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
                                        const PeriodicTaskFunc& fn,
-                                       uint64_t repeat_period_seconds) {
+                                       uint64_t repeat_period_seconds,
+                                       bool run_immediately) {
   MutexLock l(&timer_mutex);
   static std::atomic<uint64_t> initial_delay(0);
 
@@ -65,10 +70,13 @@ Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
   std::string unique_id =
       kPeriodicTaskTypeNames.at(task_type) + std::to_string(id_++);
 
-  bool succeeded = timer_->Add(
-      fn, unique_id,
-      (initial_delay.fetch_add(1) % repeat_period_seconds) * kMicrosInSecond,
-      repeat_period_seconds * kMicrosInSecond);
+  uint64_t initial_delay_micros =
+      (initial_delay.fetch_add(1) % repeat_period_seconds) * kMicrosInSecond;
+  if (!run_immediately) {
+    initial_delay_micros += repeat_period_seconds * kMicrosInSecond;
+  }
+  bool succeeded = timer_->Add(fn, unique_id, initial_delay_micros,
+                               repeat_period_seconds * kMicrosInSecond);
   if (!succeeded) {
     return Status::Aborted("Failed to register periodic task");
   }
diff --git a/db/periodic_task_scheduler.h b/db/periodic_task_scheduler.h
index 3ac8a3b9cee6..8511f5f2d8e7 100644
--- a/db/periodic_task_scheduler.h
+++ b/db/periodic_task_scheduler.h
@@ -21,6 +21,7 @@ enum class PeriodicTaskType : uint8_t {
   kPersistStats,
   kFlushInfoLog,
   kRecordSeqnoTime,
+  kTriggerCompaction,
   kMax,
 };
 
@@ -42,13 +43,16 @@ class PeriodicTaskScheduler {
   PeriodicTaskScheduler& operator=(PeriodicTaskScheduler&&) = delete;
 
   // Register a task with its default repeat period. Thread safe call.
-  Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn);
+  // @param run_immediately If true, the task will run soon after it's
+  // scheduled, instead of waiting for the repeat period.
+  Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn,
+                  bool run_immediately);
 
   // Register a task with specified repeat period. 0 is an invalid argument
   // (kInvalidPeriodSec). To stop the task, please use Unregister().
   // Thread safe call.
   Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn,
-                  uint64_t repeat_period_seconds);
+                  uint64_t repeat_period_seconds, bool run_immediately);
 
   // Unregister the task. Thread safe call.
   Status Unregister(PeriodicTaskType task_type);
diff --git a/db/periodic_task_scheduler_test.cc b/db/periodic_task_scheduler_test.cc
index baf74ed15e3a..5575333b095a 100644
--- a/db/periodic_task_scheduler_test.cc
+++ b/db/periodic_task_scheduler_test.cc
@@ -56,6 +56,12 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) {
   SyncPoint::GetInstance()->SetCallBack(
       "DBImpl::FlushInfoLog:StartRunning",
       [&](void*) { flush_info_log_counter++; });
+
+  int trigger_compaction_counter = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::TriggerPeriodicCompaction:StartRunning",
+      [&](void*) { trigger_compaction_counter++; });
+
   SyncPoint::GetInstance()->EnableProcessing();
 
   Reopen(options);
@@ -70,7 +76,7 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) {
 
   const PeriodicTaskScheduler& scheduler =
       dbfull()->TEST_GetPeriodicTaskScheduler();
-  ASSERT_EQ(3, scheduler.TEST_GetValidTaskNum());
+  ASSERT_EQ((int)PeriodicTaskType::kMax - 1, scheduler.TEST_GetValidTaskNum());
 
   ASSERT_EQ(1, dump_st_counter);
   ASSERT_EQ(1, pst_st_counter);
@@ -103,14 +109,14 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) {
   ASSERT_EQ(3, pst_st_counter);
   ASSERT_EQ(4, flush_info_log_counter);
 
-  ASSERT_EQ(1u, scheduler.TEST_GetValidTaskNum());
+  ASSERT_EQ(2u, scheduler.TEST_GetValidTaskNum());
 
   // Re-enable one task
   ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "5"}}));
   ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec);
   ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec);
 
-  ASSERT_EQ(2, scheduler.TEST_GetValidTaskNum());
+  ASSERT_EQ(3, scheduler.TEST_GetValidTaskNum());
 
   dbfull()->TEST_WaitForPeriodicTaskRun(
       [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
@@ -118,6 +124,16 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) {
   ASSERT_EQ(3, pst_st_counter);
   ASSERT_EQ(5, flush_info_log_counter);
 
+  ASSERT_EQ(0, trigger_compaction_counter);
+  dbfull()->TEST_WaitForPeriodicTaskRun([&] {
+    mock_clock_->MockSleepForSeconds(static_cast<int>(12 * 60 * 60));
+  });
+  ASSERT_EQ(1, trigger_compaction_counter);
+  dbfull()->TEST_WaitForPeriodicTaskRun([&] {
+    mock_clock_->MockSleepForSeconds(static_cast<int>(12 * 60 * 60));
+  });
+  ASSERT_EQ(2, trigger_compaction_counter);
+
   Close();
 }
 
@@ -141,16 +157,18 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) {
                                         [&](void*) { pst_st_counter++; });
   SyncPoint::GetInstance()->EnableProcessing();
 
-  auto dbs = std::vector<DB*>(kInstanceNum);
+  auto dbs = std::vector<std::unique_ptr<DB>>(kInstanceNum);
   for (int i = 0; i < kInstanceNum; i++) {
     ASSERT_OK(
         DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
   }
 
-  auto dbi = static_cast_with_check<DBImpl>(dbs[kInstanceNum - 1]);
+  auto dbi = static_cast_with_check<DBImpl>(dbs[kInstanceNum - 1].get());
 
   const PeriodicTaskScheduler& scheduler = dbi->TEST_GetPeriodicTaskScheduler();
-  ASSERT_EQ(kInstanceNum * 3, scheduler.TEST_GetValidTaskNum());
+  // kRecordSeqnoTime is not registered since the feature is not enabled
+  ASSERT_EQ(kInstanceNum * ((int)PeriodicTaskType::kMax - 1),
+            scheduler.TEST_GetValidTaskNum());
 
   int expected_run = kInstanceNum;
   dbi->TEST_WaitForPeriodicTaskRun(
@@ -172,7 +190,7 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) {
 
   int half = kInstanceNum / 2;
   for (int i = 0; i < half; i++) {
-    delete dbs[i];
+    dbs[i].reset();
   }
 
   expected_run += (kInstanceNum - half) * 2;
@@ -186,7 +204,7 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) {
 
   for (int i = half; i < kInstanceNum; i++) {
     ASSERT_OK(dbs[i]->Close());
-    delete dbs[i];
+    dbs[i].reset();
   }
 }
 
@@ -211,11 +229,11 @@ TEST_F(PeriodicTaskSchedulerTest, MultiEnv) {
   options1.env = mock_env2.get();
 
   std::string dbname = test::PerThreadDBPath("multi_env_test");
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options2, dbname, &db));
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
   Close();
 }
 
diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc
index 27aa0e28d0c9..6e2909ca5159 100644
--- a/db/plain_table_db_test.cc
+++ b/db/plain_table_db_test.cc
@@ -98,7 +98,7 @@ class PlainTableDBTest : public testing::Test,
  private:
   std::string dbname_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 
   bool mmap_mode_;
   Options last_options_;
@@ -107,7 +107,7 @@ class PlainTableDBTest : public testing::Test,
   PlainTableDBTest() : env_(Env::Default()) {}
 
   ~PlainTableDBTest() override {
-    delete db_;
+    db_.reset();
     EXPECT_OK(DestroyDB(dbname_, Options()));
   }
 
@@ -115,7 +115,7 @@ class PlainTableDBTest : public testing::Test,
     mmap_mode_ = GetParam();
     dbname_ = test::PerThreadDBPath("plain_table_db_test");
     EXPECT_OK(DestroyDB(dbname_, Options()));
-    db_ = nullptr;
+    db_.reset();
     Reopen();
   }
 
@@ -144,14 +144,11 @@ class PlainTableDBTest : public testing::Test,
     return options;
   }
 
-  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
 
-  void Close() {
-    delete db_;
-    db_ = nullptr;
-  }
+  void Close() { db_.reset(); }
 
   bool mmap_mode() const { return mmap_mode_; }
 
@@ -162,24 +159,21 @@ class PlainTableDBTest : public testing::Test,
   }
 
   void Destroy(Options* options) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, *options));
   }
 
-  Status PureReopen(Options* options, DB** db) {
+  Status PureReopen(Options* options, std::unique_ptr<DB>* db) {
     return DB::Open(*options, dbname_, db);
   }
 
   Status ReopenForReadOnly(Options* options) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     return DB::OpenForReadOnly(*options, dbname_, &db_);
   }
 
   Status TryReopen(Options* options = nullptr) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     Options opts;
     if (options != nullptr) {
       opts = *options;
@@ -495,8 +489,7 @@ TEST_P(PlainTableDBTest, Flush) {
             ASSERT_GT(int_num, 0U);
 
             TablePropertiesCollection ptc;
-            ASSERT_OK(
-                static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+            ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc));
             ASSERT_EQ(1U, ptc.size());
             auto row = ptc.begin();
             auto tp = row->second;
@@ -1339,11 +1332,7 @@ TEST_P(PlainTableDBTest, AdaptiveTable) {
 INSTANTIATE_TEST_CASE_P(PlainTableDBTest, PlainTableDBTest, ::testing::Bool());
 
 TEST_P(PlainTableDBTest, DeleteRangeNotSupported) {
-  // XXX: After attempting DeleteRange with PlainTable, Writes will permanently
-  // fail. Even if re-opening the DB, if WAL is used, the WAL is not recoverable
-  // (without manual intervention). Furthermore, a partial write batch can
-  // be exposed to readers, breaking WriteBatch atomicity.
-  for (bool use_write_batch : {/*false, */ true}) {
+  for (bool use_write_batch : {false, true}) {
     DestroyAndReopen();
 
     ASSERT_OK(Put("a0001111", "1"));
@@ -1362,12 +1351,7 @@ TEST_P(PlainTableDBTest, DeleteRangeNotSupported) {
     ASSERT_EQ(Get("a0001111"), "1");
     ASSERT_EQ(Get("b0001111"), "2");
     ASSERT_EQ(Get("c0001111"), "3");
-    if (use_write_batch) {
-      // XXX: broken WriteBatch atomicity
-      ASSERT_EQ(Get("d0001111"), "4");
-    } else {
-      ASSERT_EQ(Get("d0001111"), "NOT_FOUND");
-    }
+    ASSERT_EQ(Get("d0001111"), "NOT_FOUND");  // expect WriteBatch atomicity
     ASSERT_EQ(Get("e0001111"), "NOT_FOUND");
 
     ASSERT_EQ(Put("e0001111", "5").code(), Status::Code::kNotSupported);
@@ -1377,8 +1361,14 @@ TEST_P(PlainTableDBTest, DeleteRangeNotSupported) {
     ASSERT_EQ(dbfull()->TEST_FlushMemTable().code(),
               Status::Code::kNotSupported);
 
-    // XXX: WAL is not recoverable
-    ASSERT_EQ(TryReopen().code(), Status::Code::kNotSupported);
+    // WAL is recoverable (at least in standard configurations)
+    ASSERT_OK(TryReopen());
+
+    ASSERT_EQ(Get("a0001111"), "1");
+    ASSERT_EQ(Get("b0001111"), "2");
+    ASSERT_EQ(Get("c0001111"), "3");
+    ASSERT_EQ(Get("d0001111"), "NOT_FOUND");
+    ASSERT_EQ(Get("e0001111"), "NOT_FOUND");
   }
 }
 
diff --git a/db/prefix_test.cc b/db/prefix_test.cc
index 9b1d4ed79e6a..d1559b50721b 100644
--- a/db/prefix_test.cc
+++ b/db/prefix_test.cc
@@ -220,8 +220,6 @@ class SamePrefixTransform : public SliceTransform {
     return false;
   }
 
-  bool InRange(const Slice& dst) const override { return dst == prefix_; }
-
   bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
 };
 
@@ -229,8 +227,8 @@ class SamePrefixTransform : public SliceTransform {
 
 class PrefixTest : public testing::Test {
  public:
-  std::shared_ptr<DB> OpenDb() {
-    DB* db;
+  std::unique_ptr<DB> OpenDb() {
+    std::unique_ptr<DB> db;
 
     options.create_if_missing = true;
     options.write_buffer_size = FLAGS_write_buffer_size;
@@ -251,7 +249,7 @@ class PrefixTest : public testing::Test {
 
     Status s = DB::Open(options, kDbName, &db);
     EXPECT_OK(s);
-    return std::shared_ptr<DB>(db);
+    return db;
   }
 
   void FirstOption() { option_config_ = kBegin; }
@@ -304,7 +302,7 @@ class PrefixTest : public testing::Test {
 };
 
 TEST(SamePrefixTest, InDomainTest) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   options.prefix_extractor.reset(new SamePrefixTransform("HHKB"));
@@ -331,7 +329,7 @@ TEST(SamePrefixTest, InDomainTest) {
     ASSERT_EQ(db_iter->value(), "idk");
 
     delete db_iter;
-    delete db;
+    db.reset();
     ASSERT_OK(DestroyDB(kDbName, Options()));
   }
 
@@ -348,7 +346,7 @@ TEST(SamePrefixTest, InDomainTest) {
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_OK(db_iter->status());
     delete db_iter;
-    delete db;
+    db.reset();
     ASSERT_OK(DestroyDB(kDbName, Options()));
   }
 }
diff --git a/db/repair.cc b/db/repair.cc
index 73671154ba5f..941d69dedc11 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -100,13 +100,15 @@ class Repairer {
         db_options_(SanitizeOptions(dbname_, db_options)),
         immutable_db_options_(ImmutableDBOptions(db_options_)),
         icmp_(default_cf_opts.comparator),
-        default_cf_opts_(
-            SanitizeOptions(immutable_db_options_, default_cf_opts)),
+        default_cf_opts_(SanitizeCfOptions(immutable_db_options_,
+                                           /*read_only*/ false,
+                                           default_cf_opts)),
         default_iopts_(
             ImmutableOptions(immutable_db_options_, default_cf_opts_)),
         default_mopts_(MutableCFOptions(default_cf_opts_)),
-        unknown_cf_opts_(
-            SanitizeOptions(immutable_db_options_, unknown_cf_opts)),
+        unknown_cf_opts_(SanitizeCfOptions(immutable_db_options_,
+                                           /*read_only*/ false,
+                                           unknown_cf_opts)),
         create_unknown_cfs_(create_unknown_cfs),
         raw_table_cache_(
             // TableCache can be small since we expect each table to be opened
@@ -118,8 +120,8 @@ class Repairer {
                                     /*io_tracer=*/nullptr, db_session_id_)),
         wb_(db_options_.db_write_buffer_size),
         wc_(db_options_.delayed_write_rate),
-        vset_(dbname_, &immutable_db_options_, file_options_,
-              raw_table_cache_.get(), &wb_, &wc_,
+        vset_(dbname_, &immutable_db_options_, MutableDBOptions{db_options_},
+              file_options_, raw_table_cache_.get(), &wb_, &wc_,
               /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
               /*db_id=*/"", db_session_id_, db_options.daily_offpeak_time_utc,
               /*error_handler=*/nullptr, /*read_only=*/false),
@@ -456,8 +458,9 @@ class Repairer {
       meta.file_creation_time = current_time;
       SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
 
-      auto write_hint =
-          cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0);
+      auto write_hint = cfd->current()->storage_info()->CalculateSSTWriteHint(
+          /*level=*/0, db_options_.calculate_sst_write_lifetime_hint_set);
+
       std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
           range_del_iters;
       auto range_del_iter = mem->NewRangeTombstoneIterator(
@@ -575,14 +578,7 @@ class Repairer {
           static_cast<bool>(props->user_defined_timestamps_persisted);
     }
     if (status.ok()) {
-      uint64_t tail_size = 0;
-      bool contain_no_data_blocks =
-          props->num_entries > 0 &&
-          (props->num_entries == props->num_range_deletions);
-      if (props->tail_start_offset > 0 || contain_no_data_blocks) {
-        assert(props->tail_start_offset <= file_size);
-        tail_size = file_size - props->tail_start_offset;
-      }
+      uint64_t tail_size = FileMetaData::CalculateTailSize(file_size, *props);
       t->meta.tail_size = tail_size;
     }
     ColumnFamilyData* cfd = nullptr;
@@ -708,17 +704,17 @@ class Repairer {
       VersionEdit dummy_edit;
       for (const auto* table : cf_id_and_tables.second) {
         // TODO(opt): separate out into multiple levels
+        const auto& meta = table->meta;
         dummy_edit.AddFile(
-            0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
-            table->meta.fd.GetFileSize(), table->meta.smallest,
-            table->meta.largest, table->meta.fd.smallest_seqno,
-            table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
-            table->meta.temperature, table->meta.oldest_blob_file_number,
-            table->meta.oldest_ancester_time, table->meta.file_creation_time,
-            table->meta.epoch_number, table->meta.file_checksum,
-            table->meta.file_checksum_func_name, table->meta.unique_id,
-            table->meta.compensated_range_deletion_size, table->meta.tail_size,
-            table->meta.user_defined_timestamps_persisted);
+            0, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(),
+            meta.smallest, meta.largest, meta.fd.smallest_seqno,
+            meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature,
+            meta.oldest_blob_file_number, meta.oldest_ancester_time,
+            meta.file_creation_time, meta.epoch_number, meta.file_checksum,
+            meta.file_checksum_func_name, meta.unique_id,
+            meta.compensated_range_deletion_size, meta.tail_size,
+            meta.user_defined_timestamps_persisted, meta.min_timestamp,
+            meta.max_timestamp);
       }
       s = dummy_version_builder.Apply(&dummy_edit);
       if (s.ok()) {
diff --git a/db/seqno_time_test.cc b/db/seqno_time_test.cc
index cb247edfb767..e474c583d892 100644
--- a/db/seqno_time_test.cc
+++ b/db/seqno_time_test.cc
@@ -96,7 +96,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) {
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-  // All data is hot, only output to penultimate level
+  // All data is hot, only output to proximal level
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
@@ -185,7 +185,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
   options.num_levels = kNumLevels;
   options.level_compaction_dynamic_level_bytes = true;
   // TODO(zjay): for level compaction, auto-compaction may stuck in deadloop, if
-  //  the penultimate level score > 1, but the hot is not cold enough to compact
+  //  the proximal level score > 1, but the hot is not cold enough to compact
   //  to last level, which will keep triggering compaction.
   options.disable_auto_compactions = true;
   DestroyAndReopen(options);
@@ -205,7 +205,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
 
-  // All data is hot, only output to penultimate level
+  // All data is hot, only output to proximal level
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
@@ -661,14 +661,14 @@ TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) {
   options.stats_dump_period_sec = 0;
   options.stats_persist_period_sec = 0;
 
-  auto dbs = std::vector<DB*>(kInstanceNum);
+  auto dbs = std::vector<std::unique_ptr<DB>>(kInstanceNum);
   for (int i = 0; i < kInstanceNum; i++) {
     ASSERT_OK(
         DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
   }
 
   // Make sure the second instance has the worker enabled
-  auto dbi = static_cast_with_check<DBImpl>(dbs[1]);
+  auto dbi = static_cast_with_check<DBImpl>(dbs[1].get());
   WriteOptions wo;
   for (int i = 0; i < 200; i++) {
     ASSERT_OK(dbi->Put(wo, Key(i), "value"));
@@ -680,7 +680,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) {
 
   for (int i = 0; i < kInstanceNum; i++) {
     ASSERT_OK(dbs[i]->Close());
-    delete dbs[i];
+    dbs[i].reset();
   }
 }
 
@@ -753,7 +753,7 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
-  // make sure the data is all compacted to penultimate level if the feature is
+  // make sure the data is all compacted to proximal level if the feature is
   // on, otherwise, compacted to the last level.
   if (options.preclude_last_level_data_seconds > 0) {
     ASSERT_GT(NumTableFilesAtLevel(5), 0);
@@ -792,9 +792,8 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
   }
   ASSERT_GT(num_seqno_zeroing, 0);
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
-                              std::numeric_limits<size_t>::max(),
-                              &key_versions));
+  ASSERT_OK(GetAllKeyVersions(
+      db_.get(), {}, {}, std::numeric_limits<size_t>::max(), &key_versions));
   // make sure there're more than 300 keys and first 100 keys are having seqno
   // zeroed out, the last 100 key seqno not zeroed out
   ASSERT_GT(key_versions.size(), 300);
@@ -919,10 +918,11 @@ TEST_P(SeqnoTimeTablePropTest, PrePopulateInDB) {
       ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
 
       // And even if we re-open read-write, we do not get pre-population,
-      // because that's only for new DBs.
+      // because that's only for new DBs. We just get a single bootstrap
+      // entry as a lower bound on write times of future writes.
       Reopen(track_options);
       sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
-      ASSERT_EQ(sttm.Size(), 0);
+      ASSERT_EQ(sttm.Size(), 1);
       ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
     }
   }
diff --git a/db/seqno_to_time_mapping.cc b/db/seqno_to_time_mapping.cc
index b540fd919671..36da27c5bf03 100644
--- a/db/seqno_to_time_mapping.cc
+++ b/db/seqno_to_time_mapping.cc
@@ -490,7 +490,7 @@ bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) {
   return added;
 }
 
-bool SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno,
+void SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno,
                                      SequenceNumber to_seqno,
                                      uint64_t from_time, uint64_t to_time) {
   assert(Empty());
@@ -505,8 +505,6 @@ bool SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno,
                                  (to_seqno - from_seqno);
     pairs_.emplace_back(i, t);
   }
-
-  return /*success*/ true;
 }
 
 std::string SeqnoToTimeMapping::ToHumanString() const {
diff --git a/db/seqno_to_time_mapping.h b/db/seqno_to_time_mapping.h
index 741e64369435..a74041fd9a0e 100644
--- a/db/seqno_to_time_mapping.h
+++ b/db/seqno_to_time_mapping.h
@@ -138,7 +138,7 @@ class SeqnoToTimeMapping {
   // Adds a series of mappings interpolating from from_seqno->from_time to
   // to_seqno->to_time. This can only be called on an empty object and both
   // seqno range and time range are inclusive.
-  bool PrePopulate(SequenceNumber from_seqno, SequenceNumber to_seqno,
+  void PrePopulate(SequenceNumber from_seqno, SequenceNumber to_seqno,
                    uint64_t from_time, uint64_t to_time);
 
   // Append a new entry to the list. The `seqno` should be >= all previous
@@ -148,6 +148,10 @@ class SeqnoToTimeMapping {
   // rather than creating a new entry.
   bool Append(SequenceNumber seqno, uint64_t time);
 
+  bool Append(std::pair<SequenceNumber, uint64_t> seqno_time_pair) {
+    return Append(seqno_time_pair.first, seqno_time_pair.second);
+  }
+
   // Clear all entries and (re-)enter enforced mode if not already in that
   // state. Enforced limits are unchanged.
   void Clear() {
@@ -274,6 +278,48 @@ class SeqnoToTimeMapping {
   pair_const_iterator FindGreaterEqSeqno(SequenceNumber seqno) const;
 };
 
+// A struct to help combining settings across column families
+struct MinAndMaxPreserveSeconds {
+  uint64_t min_preserve_seconds = std::numeric_limits<uint64_t>::max();
+  uint64_t max_preserve_seconds = std::numeric_limits<uint64_t>::min();
+
+  MinAndMaxPreserveSeconds() = default;
+
+  template <class CFOpts>
+  explicit MinAndMaxPreserveSeconds(const CFOpts& opts) {
+    Combine(opts);
+  }
+
+  bool IsEnabled() const {
+    return min_preserve_seconds != std::numeric_limits<uint64_t>::max();
+  }
+
+  // Incorporate another CF's settings into the result. If preserve/preclude are
+  // disabled for this CF, they are excluded from the result.
+  template <class CFOpts>
+  void Combine(const CFOpts& opts) {
+    uint64_t preserve_seconds = std::max(opts.preserve_internal_time_seconds,
+                                         opts.preclude_last_level_data_seconds);
+    if (preserve_seconds > 0) {
+      min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds);
+      max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds);
+    }
+  }
+
+  // Choose how many seconds between mapping samples
+  uint64_t GetRecodingCadence() const {
+    if (IsEnabled()) {
+      // round up to 1 when the time_duration is smaller than
+      // kMaxSeqnoTimePairsPerCF
+      return (min_preserve_seconds + kMaxSeqnoTimePairsPerCF - 1) /
+             kMaxSeqnoTimePairsPerCF;
+    } else {
+      // disabled
+      return 0;
+    }
+  }
+};
+
 // === Utility methods used for TimedPut === //
 
 // Pack a value Slice and a unix write time into buffer `buf` and return a Slice
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 773446b6a583..0e4e9f2e5155 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -98,6 +98,8 @@ Status TableCache::GetTableReader(
   std::unique_ptr<FSRandomAccessFile> file;
   FileOptions fopts = file_options;
   fopts.temperature = file_temperature;
+  fopts.file_checksum = file_meta.file_checksum;
+  fopts.file_checksum_func_name = file_meta.file_checksum_func_name;
   Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
   TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile",
                            const_cast<Status*>(&s));
@@ -113,8 +115,7 @@ Status TableCache::GetTableReader(
     Status temp_s =
         PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
     if (temp_s.ok()) {
-      temp_s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
-                                                 nullptr);
+      temp_s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
     }
     if (temp_s.ok()) {
       RecordTick(ioptions_.stats, NO_FILE_OPENS);
@@ -146,7 +147,8 @@ Status TableCache::GetTableReader(
     s = mutable_cf_options.table_factory->NewTableReader(
         ro,
         TableReaderOptions(
-            ioptions_, mutable_cf_options.prefix_extractor, file_options,
+            ioptions_, mutable_cf_options.prefix_extractor,
+            mutable_cf_options.compression_manager.get(), file_options,
             internal_comparator,
             mutable_cf_options.block_protection_bytes_per_key, skip_filters,
             immortal_tables_, false /* force_direct_prefetch */, level,
@@ -205,6 +207,7 @@ Status TableCache::FindTable(
       RecordTick(ioptions_.stats, NO_FILE_ERRORS);
       // We do not cache error results so that if the error is transient,
       // or somebody repairs the file, we recover automatically.
+      IGNORE_STATUS_IF_ERROR(s);
     } else {
       s = cache_.Insert(key, table_reader.get(), 1, handle);
       if (s.ok()) {
diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc
index 3c7d8a61d739..a3e249887ab1 100644
--- a/db/version_builder_test.cc
+++ b/db/version_builder_test.cc
@@ -76,7 +76,8 @@ class VersionBuilderTest : public testing::Test {
         oldest_blob_file_number, kUnknownOldestAncesterTime,
         kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum,
         kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
-        /* user_defined_timestamps_persisted */ true);
+        /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+        /* max timestamp */ "");
     f->compensated_file_size = file_size;
     f->num_entries = num_entries;
     f->num_deletions = num_deletions;
diff --git a/db/version_edit.cc b/db/version_edit.cc
index f666308bc071..67a6f3cc5ba3 100644
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -112,124 +112,9 @@ bool VersionEdit::EncodeTo(std::string* dst,
         f.epoch_number == kUnknownEpochNumber) {
       return false;
     }
-    PutVarint32(dst, kNewFile4);
-    PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
-    PutVarint64(dst, f.fd.GetFileSize());
-    EncodeFileBoundaries(dst, f, ts_sz.value());
-    PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
-    // Customized fields' format:
-    // +-----------------------------+
-    // | 1st field's tag (varint32)  |
-    // +-----------------------------+
-    // | 1st field's size (varint32) |
-    // +-----------------------------+
-    // |    bytes for 1st field      |
-    // |  (based on size decoded)    |
-    // +-----------------------------+
-    // |                             |
-    // |          ......             |
-    // |                             |
-    // +-----------------------------+
-    // | last field's size (varint32)|
-    // +-----------------------------+
-    // |    bytes for last field     |
-    // |  (based on size decoded)    |
-    // +-----------------------------+
-    // | terminating tag (varint32)  |
-    // +-----------------------------+
-    //
-    // Customized encoding for fields:
-    //   tag kPathId: 1 byte as path_id
-    //   tag kNeedCompaction:
-    //        now only can take one char value 1 indicating need-compaction
-    //
-    PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime);
-    std::string varint_oldest_ancester_time;
-    PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
-    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
-                             &varint_oldest_ancester_time);
-    PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
-
-    PutVarint32(dst, NewFileCustomTag::kFileCreationTime);
-    std::string varint_file_creation_time;
-    PutVarint64(&varint_file_creation_time, f.file_creation_time);
-    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
-                             &varint_file_creation_time);
-    PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
-
-    PutVarint32(dst, NewFileCustomTag::kEpochNumber);
-    std::string varint_epoch_number;
-    PutVarint64(&varint_epoch_number, f.epoch_number);
-    PutLengthPrefixedSlice(dst, Slice(varint_epoch_number));
-
-    if (f.file_checksum_func_name != kUnknownFileChecksumFuncName) {
-      PutVarint32(dst, NewFileCustomTag::kFileChecksum);
-      PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
-
-      PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName);
-      PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
-    }
-
-    if (f.fd.GetPathId() != 0) {
-      PutVarint32(dst, NewFileCustomTag::kPathId);
-      char p = static_cast<char>(f.fd.GetPathId());
-      PutLengthPrefixedSlice(dst, Slice(&p, 1));
-    }
-    if (f.temperature != Temperature::kUnknown) {
-      PutVarint32(dst, NewFileCustomTag::kTemperature);
-      char p = static_cast<char>(f.temperature);
-      PutLengthPrefixedSlice(dst, Slice(&p, 1));
-    }
-    if (f.marked_for_compaction) {
-      PutVarint32(dst, NewFileCustomTag::kNeedCompaction);
-      char p = static_cast<char>(1);
-      PutLengthPrefixedSlice(dst, Slice(&p, 1));
-    }
-    if (has_min_log_number_to_keep_ && !min_log_num_written) {
-      PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack);
-      std::string varint_log_number;
-      PutFixed64(&varint_log_number, min_log_number_to_keep_);
-      PutLengthPrefixedSlice(dst, Slice(varint_log_number));
-      min_log_num_written = true;
-    }
-    if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
-      PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber);
-      std::string oldest_blob_file_number;
-      PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
-      PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
-    }
-    UniqueId64x2 unique_id = f.unique_id;
-    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id);
-    if (unique_id != kNullUniqueId64x2) {
-      PutVarint32(dst, NewFileCustomTag::kUniqueId);
-      std::string unique_id_str = EncodeUniqueIdBytes(&unique_id);
-      PutLengthPrefixedSlice(dst, Slice(unique_id_str));
-    }
-    if (f.compensated_range_deletion_size) {
-      PutVarint32(dst, kCompensatedRangeDeletionSize);
-      std::string compensated_range_deletion_size;
-      PutVarint64(&compensated_range_deletion_size,
-                  f.compensated_range_deletion_size);
-      PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size));
-    }
-    if (f.tail_size) {
-      PutVarint32(dst, NewFileCustomTag::kTailSize);
-      std::string varint_tail_size;
-      PutVarint64(&varint_tail_size, f.tail_size);
-      PutLengthPrefixedSlice(dst, Slice(varint_tail_size));
-    }
-    if (!f.user_defined_timestamps_persisted) {
-      // The default value for the flag is true, it's only explicitly persisted
-      // when it's false. We are putting 0 as the value here to signal false
-      // (i.e. UDTS not persisted).
-      PutVarint32(dst, NewFileCustomTag::kUserDefinedTimestampsPersisted);
-      char p = static_cast<char>(0);
-      PutLengthPrefixedSlice(dst, Slice(&p, 1));
-    }
-    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
-                             dst);
-
-    PutVarint32(dst, NewFileCustomTag::kTerminate);
+    EncodeToNewFile4(f, new_files_[i].first, ts_sz.value(),
+                     has_min_log_number_to_keep_, min_log_number_to_keep_,
+                     min_log_num_written, dst);
   }
 
   for (const auto& blob_file_addition : blob_file_additions_) {
@@ -288,9 +173,151 @@ bool VersionEdit::EncodeTo(std::string* dst,
     char p = static_cast<char>(persist_user_defined_timestamps_);
     PutLengthPrefixedSlice(dst, Slice(&p, 1));
   }
+
+  if (HasSubcompactionProgress()) {
+    PutVarint32(dst, kSubcompactionProgress);
+    std::string progress_data;
+    subcompaction_progress_.EncodeTo(&progress_data);
+    PutLengthPrefixedSlice(dst, progress_data);
+  }
+
   return true;
 }
 
+void VersionEdit::EncodeToNewFile4(const FileMetaData& f, int level,
+                                   size_t ts_sz,
+                                   bool has_min_log_number_to_keep,
+                                   uint64_t min_log_number_to_keep,
+                                   bool& min_log_num_written,
+                                   std::string* dst) {
+  PutVarint32(dst, kNewFile4);
+  PutVarint32Varint64(dst, level, f.fd.GetNumber());
+  PutVarint64(dst, f.fd.GetFileSize());
+  EncodeFileBoundaries(dst, f, ts_sz);
+  PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
+  // Customized fields' format:
+  // +-----------------------------+
+  // | 1st field's tag (varint32)  |
+  // +-----------------------------+
+  // | 1st field's size (varint32) |
+  // +-----------------------------+
+  // |    bytes for 1st field      |
+  // |  (based on size decoded)    |
+  // +-----------------------------+
+  // |                             |
+  // |          ......             |
+  // |                             |
+  // +-----------------------------+
+  // | last field's size (varint32)|
+  // +-----------------------------+
+  // |    bytes for last field     |
+  // |  (based on size decoded)    |
+  // +-----------------------------+
+  // | terminating tag (varint32)  |
+  // +-----------------------------+
+  //
+  // Customized encoding for fields:
+  //   tag kPathId: 1 byte as path_id
+  //   tag kNeedCompaction:
+  //        now only can take one char value 1 indicating need-compaction
+  //
+  PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime);
+  std::string varint_oldest_ancester_time;
+  PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
+  TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
+                           &varint_oldest_ancester_time);
+  PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
+
+  PutVarint32(dst, NewFileCustomTag::kFileCreationTime);
+  std::string varint_file_creation_time;
+  PutVarint64(&varint_file_creation_time, f.file_creation_time);
+  TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
+                           &varint_file_creation_time);
+  PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
+
+  PutVarint32(dst, NewFileCustomTag::kEpochNumber);
+  std::string varint_epoch_number;
+  PutVarint64(&varint_epoch_number, f.epoch_number);
+  PutLengthPrefixedSlice(dst, Slice(varint_epoch_number));
+
+  if (f.file_checksum_func_name != kUnknownFileChecksumFuncName) {
+    PutVarint32(dst, NewFileCustomTag::kFileChecksum);
+    PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
+
+    PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName);
+    PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
+  }
+
+  if (f.fd.GetPathId() != 0) {
+    PutVarint32(dst, NewFileCustomTag::kPathId);
+    char p = static_cast<char>(f.fd.GetPathId());
+    PutLengthPrefixedSlice(dst, Slice(&p, 1));
+  }
+  if (f.temperature != Temperature::kUnknown) {
+    PutVarint32(dst, NewFileCustomTag::kTemperature);
+    char p = static_cast<char>(f.temperature);
+    PutLengthPrefixedSlice(dst, Slice(&p, 1));
+  }
+  if (f.marked_for_compaction) {
+    PutVarint32(dst, NewFileCustomTag::kNeedCompaction);
+    char p = static_cast<char>(1);
+    PutLengthPrefixedSlice(dst, Slice(&p, 1));
+  }
+  if (has_min_log_number_to_keep && !min_log_num_written) {
+    PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack);
+    std::string varint_log_number;
+    PutFixed64(&varint_log_number, min_log_number_to_keep);
+    PutLengthPrefixedSlice(dst, Slice(varint_log_number));
+    min_log_num_written = true;
+  }
+  if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+    PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber);
+    std::string oldest_blob_file_number;
+    PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
+    PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
+  }
+  UniqueId64x2 unique_id = f.unique_id;
+  TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id);
+  if (unique_id != kNullUniqueId64x2) {
+    PutVarint32(dst, NewFileCustomTag::kUniqueId);
+    std::string unique_id_str = EncodeUniqueIdBytes(&unique_id);
+    PutLengthPrefixedSlice(dst, Slice(unique_id_str));
+  }
+  if (f.compensated_range_deletion_size) {
+    PutVarint32(dst, NewFileCustomTag::kCompensatedRangeDeletionSize);
+    std::string compensated_range_deletion_size;
+    PutVarint64(&compensated_range_deletion_size,
+                f.compensated_range_deletion_size);
+    PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size));
+  }
+  if (f.tail_size) {
+    PutVarint32(dst, NewFileCustomTag::kTailSize);
+    std::string varint_tail_size;
+    PutVarint64(&varint_tail_size, f.tail_size);
+    PutLengthPrefixedSlice(dst, Slice(varint_tail_size));
+  }
+  if (!f.user_defined_timestamps_persisted) {
+    // The default value for the flag is true, it's only explicitly persisted
+    // when it's false. We are putting 0 as the value here to signal false
+    // (i.e. UDTS not persisted).
+    PutVarint32(dst, NewFileCustomTag::kUserDefinedTimestampsPersisted);
+    char p = static_cast<char>(0);
+    PutLengthPrefixedSlice(dst, Slice(&p, 1));
+  }
+  // Encode min/max timestamp if they are non-empty
+  if (!f.min_timestamp.empty()) {
+    PutVarint32(dst, NewFileCustomTag::kMinTimestamp);
+    PutLengthPrefixedSlice(dst, Slice(f.min_timestamp));
+  }
+  if (!f.max_timestamp.empty()) {
+    PutVarint32(dst, NewFileCustomTag::kMaxTimestamp);
+    PutLengthPrefixedSlice(dst, Slice(f.max_timestamp));
+  }
+  TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
+                           dst);
+
+  PutVarint32(dst, NewFileCustomTag::kTerminate);
+}
 static bool GetInternalKey(Slice* input, InternalKey* dst) {
   Slice str;
   if (GetLengthPrefixedSlice(input, &str)) {
@@ -301,12 +328,12 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) {
   }
 }
 
-bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
+bool VersionEdit::GetLevel(Slice* input, int* level, int& max_level) {
   uint32_t v = 0;
   if (GetVarint32(input, &v)) {
     *level = v;
-    if (max_level_ < *level) {
-      max_level_ = *level;
+    if (max_level < *level) {
+      max_level = *level;
     }
     return true;
   } else {
@@ -314,16 +341,18 @@ bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
   }
 }
 
-const char* VersionEdit::DecodeNewFile4From(Slice* input) {
-  const char* msg = nullptr;
+const char* VersionEdit::DecodeNewFile4From(Slice* input, int& max_level,
+                                            uint64_t& min_log_number_to_keep,
+                                            bool& has_min_log_number_to_keep,
+                                            NewFiles& new_files,
+                                            FileMetaData& f) {
   int level = 0;
-  FileMetaData f;
   uint64_t number = 0;
   uint32_t path_id = 0;
   uint64_t file_size = 0;
   SequenceNumber smallest_seqno = 0;
   SequenceNumber largest_seqno = kMaxSequenceNumber;
-  if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
+  if (GetLevel(input, &level, max_level) && GetVarint64(input, &number) &&
       GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
       GetInternalKey(input, &f.largest) &&
       GetVarint64(input, &smallest_seqno) &&
@@ -381,10 +410,10 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
         case kMinLogNumberToKeepHack:
           // This is a hack to encode kMinLogNumberToKeep in a
           // forward-compatible fashion.
-          if (!GetFixed64(&field, &min_log_number_to_keep_)) {
+          if (!GetFixed64(&field, &min_log_number_to_keep)) {
             return "deleted log number malformatted";
           }
-          has_min_log_number_to_keep_ = true;
+          has_min_log_number_to_keep = true;
           break;
         case kOldestBlobFileNumber:
           if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
@@ -396,7 +425,7 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
             return "temperature field wrong size";
           } else {
             Temperature casted_field = static_cast<Temperature>(field[0]);
-            if (casted_field <= Temperature::kCold) {
+            if (casted_field < Temperature::kLastTemperature) {
               f.temperature = casted_field;
             }
           }
@@ -423,6 +452,12 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
           }
           f.user_defined_timestamps_persisted = (field[0] == 1);
           break;
+        case kMinTimestamp:
+          f.min_timestamp = field.ToString();
+          break;
+        case kMaxTimestamp:
+          f.max_timestamp = field.ToString();
+          break;
         default:
           if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
             // Should not proceed if cannot understand it
@@ -436,13 +471,12 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
   }
   f.fd =
       FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
-  new_files_.push_back(std::make_pair(level, f));
+  new_files.emplace_back(level, f);
   return nullptr;
 }
 
 void VersionEdit::EncodeFileBoundaries(std::string* dst,
-                                       const FileMetaData& meta,
-                                       size_t ts_sz) const {
+                                       const FileMetaData& meta, size_t ts_sz) {
   if (ts_sz == 0 || meta.user_defined_timestamps_persisted) {
     PutLengthPrefixedSlice(dst, meta.smallest.Encode());
     PutLengthPrefixedSlice(dst, meta.largest.Encode());
@@ -545,7 +579,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         break;
 
       case kCompactCursor:
-        if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) {
+        if (GetLevel(&input, &level, max_level_) &&
+            GetInternalKey(&input, &key)) {
           // Here we re-use the output format of compact pointer in LevelDB
           // to persist compact_cursors_
           compact_cursors_.push_back(std::make_pair(level, key));
@@ -558,7 +593,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
 
       case kDeletedFile: {
         uint64_t number = 0;
-        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
+        if (GetLevel(&input, &level, max_level_) &&
+            GetVarint64(&input, &number)) {
           deleted_files_.insert(std::make_pair(level, number));
         } else {
           if (!msg) {
@@ -571,8 +607,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
       case kNewFile: {
         uint64_t number = 0;
         uint64_t file_size = 0;
-        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
-            GetVarint64(&input, &file_size) &&
+        if (GetLevel(&input, &level, max_level_) &&
+            GetVarint64(&input, &number) && GetVarint64(&input, &file_size) &&
             GetInternalKey(&input, &f.smallest) &&
             GetInternalKey(&input, &f.largest)) {
           f.fd = FileDescriptor(number, 0, file_size);
@@ -589,8 +625,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         uint64_t file_size = 0;
         SequenceNumber smallest_seqno = 0;
         SequenceNumber largest_seqno = kMaxSequenceNumber;
-        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
-            GetVarint64(&input, &file_size) &&
+        if (GetLevel(&input, &level, max_level_) &&
+            GetVarint64(&input, &number) && GetVarint64(&input, &file_size) &&
             GetInternalKey(&input, &f.smallest) &&
             GetInternalKey(&input, &f.largest) &&
             GetVarint64(&input, &smallest_seqno) &&
@@ -612,8 +648,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         uint64_t file_size = 0;
         SequenceNumber smallest_seqno = 0;
         SequenceNumber largest_seqno = kMaxSequenceNumber;
-        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
-            GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
+        if (GetLevel(&input, &level, max_level_) &&
+            GetVarint64(&input, &number) && GetVarint32(&input, &path_id) &&
+            GetVarint64(&input, &file_size) &&
             GetInternalKey(&input, &f.smallest) &&
             GetInternalKey(&input, &f.largest) &&
             GetVarint64(&input, &smallest_seqno) &&
@@ -630,7 +667,10 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
       }
 
       case kNewFile4: {
-        msg = DecodeNewFile4From(&input);
+        FileMetaData ignored_file;
+        msg = DecodeNewFile4From(&input, max_level_, min_log_number_to_keep_,
+                                 has_min_log_number_to_keep_, new_files_,
+                                 ignored_file);
         break;
       }
 
@@ -767,6 +807,23 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         }
         break;
 
+      case kSubcompactionProgress: {
+        Slice encoded;
+        if (!GetLengthPrefixedSlice(&input, &encoded)) {
+          msg = "SubcompactionProgress not prefixed by length";
+          break;
+        }
+
+        SubcompactionProgress progress;
+        Status s = progress.DecodeFrom(&encoded);
+        if (!s.ok()) {
+          return s;
+        }
+
+        SetSubcompactionProgress(progress);
+        break;
+      }
+
       default:
         if (tag & kTagSafeIgnoreMask) {
           // Tag from future which can be safely ignored.
@@ -933,6 +990,10 @@ std::string VersionEdit::DebugString(bool hex_key) const {
     r.append("\n FullHistoryTsLow: ");
     r.append(Slice(full_history_ts_low_).ToString(hex_key));
   }
+  if (HasSubcompactionProgress()) {
+    r.append("\n SubcompactionProgress: ");
+    r.append(subcompaction_progress_.ToString());
+  }
   r.append("\n}\n");
   return r;
 }
@@ -1082,9 +1143,301 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
     jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key);
   }
 
+  if (HasSubcompactionProgress()) {
+    jw << "SubcompactionProgress" << subcompaction_progress_.ToString();
+  }
+
   jw.EndObject();
 
   return jw.Get();
 }
 
+void SubcompactionProgressPerLevel::EncodeTo(std::string* dst) const {
+  if (num_processed_output_records_ > 0) {
+    PutVarint32(
+        dst,
+        SubcompactionProgressPerLevelCustomTag::kNumProcessedOutputRecords);
+    std::string varint_records;
+    PutVarint64(&varint_records, num_processed_output_records_);
+    PutLengthPrefixedSlice(dst, varint_records);
+  }
+
+  if (!output_files_.empty()) {
+    PutVarint32(dst, SubcompactionProgressPerLevelCustomTag::kOutputFilesDelta);
+    std::string files_data;
+    EncodeOutputFiles(&files_data);
+    PutLengthPrefixedSlice(dst, files_data);
+  }
+
+  PutVarint32(dst, SubcompactionProgressPerLevelCustomTag::
+                       kSubcompactionProgressPerLevelTerminate);
+}
+
+Status SubcompactionProgressPerLevel::DecodeFrom(Slice* input) {
+  Clear();
+
+  while (true) {
+    uint32_t tag = 0;
+    if (!GetVarint32(input, &tag)) {
+      return Status::Corruption("SubcompactionProgressPerLevel", "tag error");
+    }
+
+    if (tag == SubcompactionProgressPerLevelCustomTag::
+                   kSubcompactionProgressPerLevelTerminate) {
+      break;
+    }
+
+    Slice field;
+    if (!GetLengthPrefixedSlice(input, &field)) {
+      return Status::Corruption("SubcompactionProgressPerLevel",
+                                "field length prefixed slice error");
+    }
+
+    switch (tag) {
+      case SubcompactionProgressPerLevelCustomTag::kNumProcessedOutputRecords: {
+        if (!GetVarint64(&field, &num_processed_output_records_)) {
+          return Status::Corruption("SubcompactionProgressPerLevel",
+                                    "invalid num_processed_output_records_");
+        }
+        break;
+      }
+
+      case SubcompactionProgressPerLevelCustomTag::kOutputFilesDelta: {
+        Status s = DecodeOutputFiles(&field, output_files_);
+        if (!s.ok()) {
+          return s;
+        }
+        break;
+      }
+
+      default:
+        // Forward compatibility: Handle unknown tags
+        if ((tag & SubcompactionProgressPerLevelCustomTag::
+                       kSubcompactionProgressPerLevelCustomTagSafeIgnoreMask) !=
+            0) {
+          break;
+        } else {
+          return Status::NotSupported("SubcompactionProgress",
+                                      "unsupported critical custom field");
+        }
+    }
+  }
+
+  return Status::OK();
+}
+
+void SubcompactionProgressPerLevel::EncodeOutputFiles(std::string* dst) const {
+  size_t new_files_count =
+      output_files_.size() > last_persisted_output_files_count_
+          ? output_files_.size() - last_persisted_output_files_count_
+          : 0;
+
+  assert(new_files_count > 0);
+
+  PutVarint32(dst, static_cast<uint32_t>(new_files_count));
+
+  for (size_t i = last_persisted_output_files_count_; i < output_files_.size();
+       ++i) {
+    std::string file_dst;
+    bool ignored_min_log_written = false;
+
+    VersionEdit::EncodeToNewFile4(
+        output_files_[i], -1 /* level */, 0 /* ts_sz */,
+        false /* has_min_log_number_to_keep */, 0 /* min_log_number_to_keep */,
+        ignored_min_log_written, &file_dst);
+
+    PutLengthPrefixedSlice(dst, file_dst);
+  }
+}
+
+Status SubcompactionProgressPerLevel::DecodeOutputFiles(
+    Slice* input, autovector<FileMetaData>& output_files) {
+  uint32_t new_file_count = 0;
+  if (!GetVarint32(input, &new_file_count)) {
+    return Status::Corruption("SubcompactionProgressPerLevel",
+                              "new output file count");
+  }
+
+  assert(output_files.size() == 0);
+
+  output_files.reserve(new_file_count);
+
+  for (uint32_t i = 0; i < new_file_count; ++i) {
+    Slice file_input;
+    if (!GetLengthPrefixedSlice(input, &file_input)) {
+      return Status::Corruption("SubcompactionProgressPerLevel",
+                                "output file metadata");
+    }
+
+    uint32_t tag = 0;
+    if (!GetVarint32(&file_input, &tag) || tag != kNewFile4) {
+      return Status::Corruption("SubcompactionProgressPerLevel",
+                                "expected kNewFile4 tag");
+    }
+
+    int ignored_max_level = -1;
+    uint64_t ignored_min_log_number_to_keep = 0;
+    bool ignored_has_min_log_number_to_keep = false;
+    VersionEdit::NewFiles ignored_new_files;
+    FileMetaData file;
+
+    const char* err = VersionEdit::DecodeNewFile4From(
+        &file_input, ignored_max_level, ignored_min_log_number_to_keep,
+        ignored_has_min_log_number_to_keep, ignored_new_files, file);
+
+    if (err != nullptr) {
+      return Status::Corruption("SubcompactionProgressPerLevel", err);
+    }
+
+    output_files.push_back(std::move(file));
+  }
+
+  return Status::OK();
+}
+
+void SubcompactionProgress::EncodeTo(std::string* dst) const {
+  if (!next_internal_key_to_compact.empty()) {
+    PutVarint32(dst, SubcompactionProgressCustomTag::kNextInternalKeyToCompact);
+    PutLengthPrefixedSlice(dst, next_internal_key_to_compact);
+  }
+
+  PutVarint32(dst, SubcompactionProgressCustomTag::kNumProcessedInputRecords);
+  std::string varint_records;
+  PutVarint64(&varint_records, num_processed_input_records);
+  PutLengthPrefixedSlice(dst, varint_records);
+
+  if (output_level_progress.GetOutputFiles().size() >
+      output_level_progress.GetLastPersistedOutputFilesCount()) {
+    PutVarint32(dst, SubcompactionProgressCustomTag::kOutputLevelProgress);
+    std::string level_progress_data;
+    output_level_progress.EncodeTo(&level_progress_data);
+    PutLengthPrefixedSlice(dst, level_progress_data);
+  }
+
+  if (proximal_output_level_progress.GetOutputFiles().size() >
+      proximal_output_level_progress.GetLastPersistedOutputFilesCount()) {
+    PutVarint32(dst,
+                SubcompactionProgressCustomTag::kProximalOutputLevelProgress);
+    std::string level_progress_data;
+    proximal_output_level_progress.EncodeTo(&level_progress_data);
+    PutLengthPrefixedSlice(dst, level_progress_data);
+  }
+  PutVarint32(dst,
+              SubcompactionProgressCustomTag::kSubcompactionProgressTerminate);
+}
+
+Status SubcompactionProgress::DecodeFrom(Slice* input) {
+  Clear();
+
+  while (true) {
+    uint32_t custom_tag = 0;
+    if (!GetVarint32(input, &custom_tag)) {
+      return Status::Corruption("SubcompactionProgress",
+                                "custom field tag error");
+    }
+
+    if (custom_tag ==
+        SubcompactionProgressCustomTag::kSubcompactionProgressTerminate) {
+      break;
+    }
+
+    Slice field;
+    if (!GetLengthPrefixedSlice(input, &field)) {
+      return Status::Corruption("SubcompactionProgress",
+                                "custom field length prefixed slice error");
+    }
+
+    switch (custom_tag) {
+      case SubcompactionProgressCustomTag::kNextInternalKeyToCompact:
+        next_internal_key_to_compact = field.ToString();
+        break;
+
+      case SubcompactionProgressCustomTag::kNumProcessedInputRecords:
+        if (!GetVarint64(&field, &num_processed_input_records)) {
+          return Status::Corruption("SubcompactionProgress",
+                                    "invalid num_processed_input_records");
+        }
+        break;
+
+      case SubcompactionProgressCustomTag::kOutputLevelProgress: {
+        Status s = output_level_progress.DecodeFrom(&field);
+        if (!s.ok()) {
+          return s;
+        }
+        break;
+      }
+
+      case SubcompactionProgressCustomTag::kProximalOutputLevelProgress: {
+        Status s = proximal_output_level_progress.DecodeFrom(&field);
+        if (!s.ok()) {
+          return s;
+        }
+        break;
+      }
+
+      default:
+        if ((custom_tag & SubcompactionProgressCustomTag::
+                              kSubcompactionProgressCustomTagSafeIgnoreMask) !=
+            0) {
+          break;
+        } else {
+          return Status::NotSupported("SubcompactionProgress",
+                                      "unsupported critical custom field");
+        }
+    }
+  }
+
+  return Status::OK();
+}
+
+bool SubcompactionProgressBuilder::ProcessVersionEdit(const VersionEdit& edit) {
+  if (!edit.HasSubcompactionProgress()) {
+    return false;
+  }
+
+  const SubcompactionProgress& progress = edit.GetSubcompactionProgress();
+
+  MergeDeltaProgress(progress);
+
+  has_subcompaction_progress_ = true;
+
+  return true;
+}
+
+void SubcompactionProgressBuilder::MergeDeltaProgress(
+    const SubcompactionProgress& delta_progress) {
+  accumulated_subcompaction_progress_.next_internal_key_to_compact =
+      delta_progress.next_internal_key_to_compact;
+
+  accumulated_subcompaction_progress_.num_processed_input_records =
+      delta_progress.num_processed_input_records;
+
+  MaybeMergeDeltaProgressPerLevel(
+      accumulated_subcompaction_progress_.output_level_progress,
+      delta_progress.output_level_progress);
+
+  MaybeMergeDeltaProgressPerLevel(
+      accumulated_subcompaction_progress_.proximal_output_level_progress,
+      delta_progress.proximal_output_level_progress);
+}
+
+void SubcompactionProgressBuilder::MaybeMergeDeltaProgressPerLevel(
+    SubcompactionProgressPerLevel& accumulated_level_progress,
+    const SubcompactionProgressPerLevel& delta_level_progress) {
+  const auto& delta_files = delta_level_progress.GetOutputFiles();
+  if (delta_files.empty()) {
+    return;
+  }
+  for (const FileMetaData& file : delta_files) {
+    accumulated_level_progress.AddToOutputFiles(file);  // Stored as copy
+  }
+
+  accumulated_level_progress.SetNumProcessedOutputRecords(
+      delta_level_progress.GetNumProcessedOutputRecords());
+}
+
+void SubcompactionProgressBuilder::Clear() {
+  accumulated_subcompaction_progress_.Clear();
+  has_subcompaction_progress_ = false;
+}
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/version_edit.h b/db/version_edit.h
index 9189b4628109..ee6a6b01be43 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -25,6 +25,7 @@
 #include "rocksdb/advanced_options.h"
 #include "table/table_reader.h"
 #include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
 #include "util/autovector.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -72,6 +73,23 @@ enum Tag : uint32_t {
   kWalAddition2,
   kWalDeletion2,
   kPersistUserDefinedTimestamps,
+  kSubcompactionProgress,
+};
+
+enum SubcompactionProgressPerLevelCustomTag : uint32_t {
+  kSubcompactionProgressPerLevelTerminate = 1,  // End of fields marker
+  kOutputFilesDelta = 2,
+  kNumProcessedOutputRecords = 3,
+  kSubcompactionProgressPerLevelCustomTagSafeIgnoreMask = 1 << 16,
+};
+
+enum SubcompactionProgressCustomTag : uint32_t {
+  kSubcompactionProgressTerminate = 1,  // End of fields marker
+  kNextInternalKeyToCompact = 2,
+  kNumProcessedInputRecords = 3,
+  kOutputLevelProgress = 4,
+  kProximalOutputLevelProgress = 5,
+  kSubcompactionProgressCustomTagSafeIgnoreMask = 1 << 16,
 };
 
 enum NewFileCustomTag : uint32_t {
@@ -110,7 +128,7 @@ constexpr uint64_t kUnknownOldestAncesterTime = 0;
 constexpr uint64_t kUnknownNewestKeyTime = 0;
 constexpr uint64_t kUnknownFileCreationTime = 0;
 constexpr uint64_t kUnknownEpochNumber = 0;
-// If `Options::allow_ingest_behind` is true, this epoch number
+// If `Options::cf_allow_ingest_behind` is true, this epoch number
 // will be dedicated to files ingested behind.
 constexpr uint64_t kReservedEpochNumberForFileIngestedBehind = 1;
 
@@ -259,6 +277,14 @@ struct FileMetaData {
   // false, it's explicitly written to Manifest.
   bool user_defined_timestamps_persisted = true;
 
+  // Minimum user-defined timestamp in the file. Empty if no UDT or unknown.
+  // This is populated from the table properties "rocksdb.timestamp_min".
+  std::string min_timestamp;
+
+  // Maximum user-defined timestamp in the file. Empty if no UDT or unknown.
+  // This is populated from the table properties "rocksdb.timestamp_max".
+  std::string max_timestamp;
+
   FileMetaData() = default;
 
   FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
@@ -271,7 +297,9 @@ struct FileMetaData {
                const std::string& _file_checksum_func_name,
                UniqueId64x2 _unique_id,
                const uint64_t _compensated_range_deletion_size,
-               uint64_t _tail_size, bool _user_defined_timestamps_persisted)
+               uint64_t _tail_size, bool _user_defined_timestamps_persisted,
+               const std::string& _min_timestamp,
+               const std::string& _max_timestamp)
       : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
         smallest(smallest_key),
         largest(largest_key),
@@ -286,7 +314,9 @@ struct FileMetaData {
         file_checksum_func_name(_file_checksum_func_name),
         unique_id(std::move(_unique_id)),
         tail_size(_tail_size),
-        user_defined_timestamps_persisted(_user_defined_timestamps_persisted) {
+        user_defined_timestamps_persisted(_user_defined_timestamps_persisted),
+        min_timestamp(_min_timestamp),
+        max_timestamp(_max_timestamp) {
     TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
   }
 
@@ -369,7 +399,8 @@ struct FileMetaData {
     usage += sizeof(*this);
 #endif  // ROCKSDB_MALLOC_USABLE_SIZE
     usage += smallest.size() + largest.size() + file_checksum.size() +
-             file_checksum_func_name.size();
+             file_checksum_func_name.size() + min_timestamp.size() +
+             max_timestamp.size();
     return usage;
   }
 
@@ -380,6 +411,33 @@ struct FileMetaData {
     assert(!res || fd.smallest_seqno == fd.largest_seqno);
     return res;
   }
+
+  static uint64_t CalculateTailSize(uint64_t file_size,
+                                    const TableProperties& props) {
+#ifndef NDEBUG
+    bool skip = false;
+    TEST_SYNC_POINT_CALLBACK("FileMetaData::CalculateTailSize", &skip);
+    if (skip) {
+      return 0;
+    }
+#endif  // NDEBUG
+    uint64_t tail_size = 0;
+
+    // Differentiate between a file with no data blocks (tail_start_offset = 0)
+    // and a file with unknown tail_start_offset (also set to 0 due to
+    // non-negative integer storage limitation)
+    bool contain_no_data_blocks =
+        props.num_entries == 0 ||
+        (props.num_entries > 0 &&
+         (props.num_entries == props.num_range_deletions));
+
+    if (props.tail_start_offset > 0 || contain_no_data_blocks) {
+      assert(props.tail_start_offset <= file_size);
+      tail_size = file_size - props.tail_start_offset;
+    }
+
+    return tail_size;
+  }
 };
 
 // A compressed copy of file meta data that just contain minimum data needed
@@ -413,12 +471,194 @@ struct LevelFilesBrief {
   }
 };
 
+struct SubcompactionProgressPerLevel {
+  uint64_t GetNumProcessedOutputRecords() const {
+    return num_processed_output_records_;
+  }
+
+  void SetNumProcessedOutputRecords(uint64_t num) {
+    num_processed_output_records_ = num;
+  }
+
+  const autovector<FileMetaData>& GetOutputFiles() const {
+    return output_files_;
+  }
+
+  void AddToOutputFiles(const FileMetaData& file) {
+    output_files_.push_back(file);
+  }
+
+  size_t GetLastPersistedOutputFilesCount() const {
+    return last_persisted_output_files_count_;
+  }
+
+  void UpdateLastPersistedOutputFilesCount() {
+    last_persisted_output_files_count_ = output_files_.size();
+  }
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* input);
+
+  void Clear() {
+    num_processed_output_records_ = 0;
+    output_files_.clear();
+    last_persisted_output_files_count_ = 0;
+  }
+
+  std::string ToString() const {
+    std::ostringstream oss;
+    oss << "SubcompactionProgressPerLevel{";
+    oss << " num_processed_output_records=" << num_processed_output_records_;
+    oss << ", output_files_count=" << output_files_.size();
+    oss << ", last_persisted_output_files_count="
+        << last_persisted_output_files_count_;
+    oss << " }";
+    return oss.str();
+  }
+
+  void TEST_ClearOutputFiles() { output_files_.clear(); }
+
+ private:
+  uint64_t num_processed_output_records_ = 0;
+
+  autovector<FileMetaData> output_files_ = {};
+
+  // Number of files already persisted to help calculate the new output files to
+  // persist in the future. This is to prevent having to persist all the output
+  // files metadata so far every time of a "snapshot" of a progress is persisted
+  // which can lead to O(1+2+...+n) = O(n^2) file metadata being persisted. The
+  // current approach of persisting only the delta should always persist
+  // exactly the number (n) of output files in total.
+  size_t last_persisted_output_files_count_ = 0;
+
+  void EncodeOutputFiles(std::string* dst) const;
+
+  Status DecodeOutputFiles(Slice* input,
+                           autovector<FileMetaData>& temp_storage);
+};
+
+struct SubcompactionProgress {
+  std::string next_internal_key_to_compact;
+
+  uint64_t num_processed_input_records = 0;
+
+  SubcompactionProgressPerLevel output_level_progress;
+
+  SubcompactionProgressPerLevel proximal_output_level_progress;
+
+  SubcompactionProgress() = default;
+
+  void Clear() {
+    next_internal_key_to_compact.clear();
+    num_processed_input_records = 0;
+    output_level_progress.Clear();
+    proximal_output_level_progress.Clear();
+  }
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* input);
+
+  std::string ToString() const {
+    std::ostringstream oss;
+    oss << "SubcompactionProgress{";
+    oss << " next_internal_key_to_compact=";
+    if (next_internal_key_to_compact.empty()) {
+      oss << "";
+    } else {
+      ParsedInternalKey parsed_key;
+      Slice key_slice(next_internal_key_to_compact);
+      if (ParseInternalKey(key_slice, &parsed_key, false /* log_err_key */)
+              .ok()) {
+        oss << "user_key(hex)=" << parsed_key.user_key.ToString(true /* hex */);
+        oss << ", seq=";
+        if (parsed_key.sequence == kMaxSequenceNumber) {
+          oss << "kMaxSequenceNumber";
+        } else {
+          oss << parsed_key.sequence;
+        }
+        oss << ", type=";
+        if (parsed_key.type == kValueTypeForSeek) {
+          oss << "kValueTypeForSeek";
+        } else {
+          oss << static_cast<int>(parsed_key.type);
+        }
+      } else {
+        oss << "raw=" << key_slice.ToString(true /* hex */);
+      }
+    }
+    oss << ", num_processed_input_records=" << num_processed_input_records;
+    oss << ", output_level_progress=" << output_level_progress.ToString();
+    oss << ", proximal_output_level_progress="
+        << proximal_output_level_progress.ToString();
+    oss << " }";
+    return oss.str();
+  }
+};
+
+class VersionEdit;
+
+// Builder class to reconstruct complete subcompaction progress object
+// from multiple decoded VersionEdits containing delta output files information
+// of the same subcompaction. See
+// `SubcompactionProgressPerLevel::last_persisted_output_files_count_`'s comment
+//
+// WARNING: This class currently assumes all input VersionEdits contain progress
+// information for the SAME subcompaction. It does not validate
+// progress data from different subcompactions so mixing progress from
+// multiple subcompactions can result in corrupted state silently. The caller is
+// responsible for ensuring all VersionEdits processed by a single instance
+// of this builder correspond to the same subcompaction.
+class SubcompactionProgressBuilder {
+ public:
+  SubcompactionProgressBuilder() = default;
+
+  bool ProcessVersionEdit(const VersionEdit& edit);
+
+  const SubcompactionProgress& GetAccumulatedSubcompactionProgress() const {
+    return accumulated_subcompaction_progress_;
+  }
+
+  bool HasAccumulatedSubcompactionProgress() const {
+    return has_subcompaction_progress_;
+  }
+
+  void Clear();
+
+ private:
+  void MergeDeltaProgress(const SubcompactionProgress& delta_progress);
+
+  void MaybeMergeDeltaProgressPerLevel(
+      SubcompactionProgressPerLevel& accumulated_level_progress,
+      const SubcompactionProgressPerLevel& delta_level_progress);
+
+  SubcompactionProgress accumulated_subcompaction_progress_;
+  bool has_subcompaction_progress_ = false;
+};
+
+// Type alias for backward compatibility - vector of subcompaction progress
+using CompactionProgress = std::vector<SubcompactionProgress>;
+
 // The state of a DB at any given time is referred to as a Version.
 // Any modification to the Version is considered a Version Edit. A Version is
 // constructed by joining a sequence of Version Edits. Version Edits are written
 // to the MANIFEST file.
 class VersionEdit {
  public:
+  // Retrieve the table files added as well as their associated levels.
+  using NewFiles = std::vector<std::pair<int, FileMetaData>>;
+
+  static void EncodeToNewFile4(const FileMetaData& f, int level, size_t ts_sz,
+                               bool has_min_log_number_to_keep,
+                               uint64_t min_log_number_to_keep,
+                               bool& min_log_num_written, std::string* dst);
+
+  static const char* DecodeNewFile4From(Slice* input, int& max_level,
+                                        uint64_t& min_log_number_to_keep,
+                                        bool& has_min_log_number_to_keep,
+                                        NewFiles& new_files, FileMetaData& f);
+
   void Clear();
 
   void SetDBId(const std::string& db_id) {
@@ -511,17 +751,19 @@ class VersionEdit {
                const std::string& file_checksum_func_name,
                const UniqueId64x2& unique_id,
                const uint64_t compensated_range_deletion_size,
-               uint64_t tail_size, bool user_defined_timestamps_persisted) {
+               uint64_t tail_size, bool user_defined_timestamps_persisted,
+               const std::string& min_timestamp = "",
+               const std::string& max_timestamp = "") {
     assert(smallest_seqno <= largest_seqno);
     new_files_.emplace_back(
         level,
-        FileMetaData(file, file_path_id, file_size, smallest, largest,
-                     smallest_seqno, largest_seqno, marked_for_compaction,
-                     temperature, oldest_blob_file_number, oldest_ancester_time,
-                     file_creation_time, epoch_number, file_checksum,
-                     file_checksum_func_name, unique_id,
-                     compensated_range_deletion_size, tail_size,
-                     user_defined_timestamps_persisted));
+        FileMetaData(
+            file, file_path_id, file_size, smallest, largest, smallest_seqno,
+            largest_seqno, marked_for_compaction, temperature,
+            oldest_blob_file_number, oldest_ancester_time, file_creation_time,
+            epoch_number, file_checksum, file_checksum_func_name, unique_id,
+            compensated_range_deletion_size, tail_size,
+            user_defined_timestamps_persisted, min_timestamp, max_timestamp));
     files_to_quarantine_.push_back(file);
     if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
       SetLastSequence(largest_seqno);
@@ -537,8 +779,6 @@ class VersionEdit {
     }
   }
 
-  // Retrieve the table files added as well as their associated levels.
-  using NewFiles = std::vector<std::pair<int, FileMetaData>>;
   const NewFiles& GetNewFiles() const { return new_files_; }
 
   NewFiles& GetMutableNewFiles() { return new_files_; }
@@ -708,6 +948,22 @@ class VersionEdit {
     full_history_ts_low_ = std::move(full_history_ts_low);
   }
 
+  void SetSubcompactionProgress(const SubcompactionProgress& progress) {
+    has_subcompaction_progress_ = true;
+    subcompaction_progress_ = progress;
+  }
+
+  bool HasSubcompactionProgress() const { return has_subcompaction_progress_; }
+
+  const SubcompactionProgress& GetSubcompactionProgress() const {
+    return subcompaction_progress_;
+  }
+
+  void ClearSubcompactionProgress() {
+    has_subcompaction_progress_ = false;
+    subcompaction_progress_.Clear();
+  }
+
   // return true on success.
   // `ts_sz` is the size in bytes for the user-defined timestamp contained in
   // a user key. This argument is optional because it's only required for
@@ -730,15 +986,22 @@ class VersionEdit {
   std::string DebugJSON(int edit_num, bool hex_key = false) const;
 
  private:
-  bool GetLevel(Slice* input, int* level, const char** msg);
-
-  const char* DecodeNewFile4From(Slice* input);
-
+  // Decode level information from serialized VersionEdit data and and track the
+  // maximum level seen.
+  //
+  // Parameters:
+  //   input: Pointer to serialized data slice
+  //   level: Output parameter for the decoded level value
+  //   max_level: get updated if the decoded level is higher than passed in
+  //   value
+  //
+  // Returns: true on successful decode, false on parse error
+  static bool GetLevel(Slice* input, int* level, int& max_level);
   // Encode file boundaries `FileMetaData.smallest` and `FileMetaData.largest`.
   // User-defined timestamps in the user key will be stripped if they shouldn't
   // be persisted.
-  void EncodeFileBoundaries(std::string* dst, const FileMetaData& meta,
-                            size_t ts_sz) const;
+  static void EncodeFileBoundaries(std::string* dst, const FileMetaData& meta,
+                                   size_t ts_sz);
 
   int max_level_ = 0;
   std::string db_id_;
@@ -789,6 +1052,9 @@ class VersionEdit {
   std::string full_history_ts_low_;
   bool persist_user_defined_timestamps_ = true;
 
+  bool has_subcompaction_progress_ = false;
+  SubcompactionProgress subcompaction_progress_;
+
   // Newly created table files and blob files are eligible for deletion if they
   // are not registered as live files after the background jobs creating them
   // have finished. In case committing the VersionEdit containing such changes
diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc
index 52947c484cf6..42d83b84d627 100644
--- a/db/version_edit_handler.cc
+++ b/db/version_edit_handler.cc
@@ -117,21 +117,43 @@ Status ListColumnFamiliesHandler::ApplyVersionEdit(
   return s;
 }
 
+Status FileChecksumRetriever::FetchFileChecksumList(
+    FileChecksumList& file_checksum_list) {
+  Status s = Status::OK();
+  for (const auto& [cf, file_checksums] : cf_file_checksums_) {
+    [[maybe_unused]] const auto& _ = cf;
+    for (const auto& [file_number, info] : file_checksums) {
+      if (!(s = file_checksum_list.InsertOneFileChecksum(
+                file_number, info.first, info.second))
+               .ok()) {
+        break;
+      }
+    }
+  }
+  return s;
+}
+
 Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit,
                                                ColumnFamilyData** /*unused*/) {
+  uint32_t column_family_id = edit.GetColumnFamily();
+  if (edit.IsColumnFamilyDrop()) {
+    cf_file_checksums_.erase(column_family_id);
+  }
   for (const auto& deleted_file : edit.GetDeletedFiles()) {
-    Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second);
-    if (!s.ok()) {
-      return s;
+    if (cf_file_checksums_.find(column_family_id) == cf_file_checksums_.end()) {
+      return Status::NotFound();
     }
+    if (cf_file_checksums_[column_family_id].find(deleted_file.second) ==
+        cf_file_checksums_[column_family_id].end()) {
+      return Status::NotFound();
+    }
+    cf_file_checksums_[column_family_id].erase(deleted_file.second);
   }
   for (const auto& new_file : edit.GetNewFiles()) {
-    Status s = file_checksum_list_.InsertOneFileChecksum(
-        new_file.second.fd.GetNumber(), new_file.second.file_checksum,
-        new_file.second.file_checksum_func_name);
-    if (!s.ok()) {
-      return s;
-    }
+    cf_file_checksums_[column_family_id].emplace(
+        new_file.second.fd.GetNumber(),
+        std::make_pair(new_file.second.file_checksum,
+                       new_file.second.file_checksum_func_name));
   }
   for (const auto& new_blob_file : edit.GetBlobFileAdditions()) {
     std::string checksum_value = new_blob_file.GetChecksumValue();
@@ -141,11 +163,9 @@ Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit,
       checksum_value = kUnknownFileChecksum;
       checksum_method = kUnknownFileChecksumFuncName;
     }
-    Status s = file_checksum_list_.InsertOneFileChecksum(
-        new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method);
-    if (!s.ok()) {
-      return s;
-    }
+    cf_file_checksums_[column_family_id].emplace(
+        new_blob_file.GetBlobFileNumber(),
+        std::make_pair(checksum_value, checksum_method));
   }
   return Status::OK();
 }
@@ -408,7 +428,7 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
       if (cfd->IsDropped()) {
         continue;
       }
-      if (read_only_) {
+      if (version_set_->unchanging()) {
         cfd->table_cache()->SetTablesAreImmortal();
       }
       *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false,
@@ -471,8 +491,8 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
 ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
     const ColumnFamilyOptions& cf_options, const VersionEdit& edit) {
   uint32_t cf_id = edit.GetColumnFamily();
-  ColumnFamilyData* cfd =
-      version_set_->CreateColumnFamily(cf_options, read_options_, &edit);
+  ColumnFamilyData* cfd = version_set_->CreateColumnFamily(
+      cf_options, read_options_, &edit, read_only_);
   assert(cfd != nullptr);
   cfd->set_initialized();
   assert(builders_.find(cf_id) == builders_.end());
@@ -1135,6 +1155,15 @@ void DumpManifestHandler::CheckIterationResult(const log::Reader& reader,
     // Print out DebugStrings. Can include non-terminating null characters.
     fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char),
            cfd->current()->DebugString(hex_).size(), stdout);
+
+    fprintf(stdout,
+            "By default, manifest file dump prints LSM trees as if %d levels "
+            "were configured, "
+            "which is not necessarily true for the column family (CF) this "
+            "manifest is associated with. "
+            "Please consult other DB files, such as the OPTIONS file, to "
+            "confirm.\n",
+            cfd->ioptions().num_levels);
   }
   fprintf(stdout,
           "next_file_number %" PRIu64 " last_sequence %" PRIu64
diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h
index f3637ae73075..1d4b22e3c13e 100644
--- a/db/version_edit_handler.h
+++ b/db/version_edit_handler.h
@@ -80,19 +80,42 @@ class ListColumnFamiliesHandler : public VersionEditHandlerBase {
 
 class FileChecksumRetriever : public VersionEditHandlerBase {
  public:
-  FileChecksumRetriever(const ReadOptions& read_options, uint64_t max_read_size,
-                        FileChecksumList& file_checksum_list)
-      : VersionEditHandlerBase(read_options, max_read_size),
-        file_checksum_list_(file_checksum_list) {}
+  FileChecksumRetriever(const ReadOptions& read_options, uint64_t max_read_size)
+      : VersionEditHandlerBase(read_options, max_read_size) {}
 
   ~FileChecksumRetriever() override {}
 
+  Status FetchFileChecksumList(FileChecksumList& file_checksum_list);
+
  protected:
   Status ApplyVersionEdit(VersionEdit& edit,
                           ColumnFamilyData** /*unused*/) override;
 
  private:
-  FileChecksumList& file_checksum_list_;
+  // Map from CF to file # to string pair, where first portion of the value
+  // is checksum, and second portion of the value is checksum function name.
+  //
+  // [column family id A]
+  //      |
+  //      |-- [file #1] -> [checksum #1, checksum function name #1]
+  //      |-- [file #2] -> [checksum #2, checksum function name #2]
+  //      |
+  //     ...
+  //      |
+  //      |-- [file #N] -> [checksum #N, checksum function name #N]
+  // [column family id B]
+  //      |
+  //      |-- [file #1] -> [checksum #1, checksum function name #1]
+  //      |
+  //     ...
+  //      |
+  //      |-- [file #M] -> [checksum #M, checksum function name #M]
+  //      |
+  //     ...
+  std::unordered_map<
+      uint32_t,
+      std::unordered_map<uint64_t, std::pair<std::string, std::string>>>
+      cf_file_checksums_;
 };
 
 using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
@@ -198,7 +221,9 @@ class VersionEditHandler : public VersionEditHandlerBase {
                             bool prefetch_index_and_filter_in_cache,
                             bool is_initial_load);
 
-  virtual bool MustOpenAllColumnFamilies() const { return !read_only_; }
+  virtual bool MustOpenAllColumnFamilies() const {
+    return !version_set_->unchanging();
+  }
 
   const bool read_only_;
   std::vector<ColumnFamilyDescriptor> column_families_;
@@ -334,10 +359,10 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
                           const ReadOptions& read_options,
                           EpochNumberRequirement epoch_number_requirement =
                               EpochNumberRequirement::kMustPresent)
-      : VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
-                                      version_set, io_tracer, read_options,
-                                      /*allow_incomplete_valid_version=*/false,
-                                      epoch_number_requirement),
+      : VersionEditHandlerPointInTime(
+            /*read_only=*/true, column_families, version_set, io_tracer,
+            read_options,
+            /*allow_incomplete_valid_version=*/false, epoch_number_requirement),
         mode_(Mode::kRecovery) {}
 
   Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
index 25235206994a..d5f6beee93cc 100644
--- a/db/version_edit_test.cc
+++ b/db/version_edit_test.cc
@@ -794,6 +794,339 @@ TEST(FileMetaDataTest, UpdateBoundariesBlobIndex) {
   }
 }
 
+class SubcompactionProgressTest : public VersionEditTest {
+ protected:
+  static constexpr uint64_t kTestFileSize = 1024;
+  static constexpr SequenceNumber kTestSmallestSeq = 50;
+  static constexpr SequenceNumber kTestLargestSeq = 150;
+  static constexpr uint64_t kTestOldestAncesterTime = 12345;
+  static constexpr uint64_t kTestFileCreationTime = 67890;
+  static constexpr uint64_t kTestEpochNumber = 10;
+  static const std::string kTestChecksumFuncName;
+
+  FileMetaData CreateTestFile(uint64_t file_number, const std::string& prefix) {
+    FileMetaData file;
+    file.fd = FileDescriptor(file_number, 0, kTestFileSize, kTestSmallestSeq,
+                             kTestLargestSeq);
+    file.smallest = InternalKey(prefix + "a", kTestSmallestSeq, kTypeValue);
+    file.largest = InternalKey(prefix + "z", kTestLargestSeq, kTypeValue);
+    file.oldest_ancester_time = kTestOldestAncesterTime;
+    file.file_creation_time = kTestFileCreationTime;
+    file.epoch_number = kTestEpochNumber;
+    file.file_checksum = "checksum_" + std::to_string(file_number);
+    file.file_checksum_func_name = kTestChecksumFuncName;
+    file.marked_for_compaction = false;
+    file.temperature = Temperature::kUnknown;
+    return file;
+  }
+
+  // Store external file metadata objects for testing
+  // These simulate files owned by CompactionOutputs
+  std::vector<FileMetaData> compaction_output_files_;
+  std::vector<FileMetaData> proximal_level_compaction_output_files_;
+
+  SubcompactionProgress CreateSubcompactionProgress(
+      const std::string& next_key, uint64_t num_processed_input_records,
+      uint64_t num_processed_output_records,
+      uint64_t num_processed_proximal_level_output_records,
+      const std::vector<uint64_t>& output_file_numbers = {},
+      const std::vector<uint64_t>& proximal_file_numbers = {},
+      const std::string& file_prefix = "file_") {
+    SubcompactionProgress progress;
+    progress.next_internal_key_to_compact = next_key;
+    progress.num_processed_input_records = num_processed_input_records;
+    progress.output_level_progress.SetNumProcessedOutputRecords(
+        num_processed_output_records);
+    progress.proximal_output_level_progress.SetNumProcessedOutputRecords(
+        num_processed_proximal_level_output_records);
+
+    for (uint64_t file_num : output_file_numbers) {
+      FileMetaData file = CreateTestFile(file_num, file_prefix + "output_");
+      progress.output_level_progress.AddToOutputFiles(file);
+    }
+
+    for (uint64_t file_num : proximal_file_numbers) {
+      FileMetaData file = CreateTestFile(file_num, file_prefix + "proximal_");
+      progress.proximal_output_level_progress.AddToOutputFiles(file);
+    }
+
+    return progress;
+  }
+
+  std::pair<const VersionEdit, const SubcompactionProgress>
+  EncodeDecodeProgress(const SubcompactionProgress& progress) {
+    VersionEdit edit;
+    edit.SetSubcompactionProgress(progress);
+
+    std::string encoded;
+    EXPECT_TRUE(edit.EncodeTo(&encoded, 0 /* ts_sz */));
+
+    VersionEdit decoded_edit;
+    EXPECT_OK(decoded_edit.DecodeFrom(encoded));
+    EXPECT_TRUE(decoded_edit.HasSubcompactionProgress());
+
+    SubcompactionProgress decoded_progress =
+        decoded_edit.GetSubcompactionProgress();
+
+    return {std::move(decoded_edit), std::move(decoded_progress)};
+  }
+
+  void VerifyFileMetaDataEquality(const FileMetaData& expected,
+                                  const FileMetaData& actual) {
+    // Verify the major fields only
+    ASSERT_EQ(actual.fd.GetNumber(), expected.fd.GetNumber());
+    ASSERT_EQ(actual.fd.GetFileSize(), expected.fd.GetFileSize());
+    ASSERT_EQ(actual.smallest.Encode(), expected.smallest.Encode());
+    ASSERT_EQ(actual.largest.Encode(), expected.largest.Encode());
+    ASSERT_EQ(actual.oldest_ancester_time, expected.oldest_ancester_time);
+    ASSERT_EQ(actual.file_creation_time, expected.file_creation_time);
+    ASSERT_EQ(actual.epoch_number, expected.epoch_number);
+    ASSERT_EQ(actual.file_checksum, expected.file_checksum);
+    ASSERT_EQ(actual.file_checksum_func_name, expected.file_checksum_func_name);
+    ASSERT_EQ(actual.marked_for_compaction, expected.marked_for_compaction);
+    ASSERT_EQ(actual.temperature, expected.temperature);
+  }
+
+  void VerifyProgressEquality(const SubcompactionProgress& expected,
+                              const SubcompactionProgress& actual) {
+    ASSERT_EQ(actual.next_internal_key_to_compact,
+              expected.next_internal_key_to_compact);
+
+    ASSERT_EQ(actual.num_processed_input_records,
+              expected.num_processed_input_records);
+
+    for (const bool is_proximal_level : {false, true}) {
+      const SubcompactionProgressPerLevel&
+          actual_subcompaction_progress_by_level =
+              is_proximal_level ? actual.proximal_output_level_progress
+                                : actual.output_level_progress;
+
+      const SubcompactionProgressPerLevel&
+          expected_subcompaction_progress_by_level =
+              is_proximal_level ? expected.proximal_output_level_progress
+                                : expected.output_level_progress;
+
+      ASSERT_EQ(
+          actual_subcompaction_progress_by_level.GetNumProcessedOutputRecords(),
+          expected_subcompaction_progress_by_level
+              .GetNumProcessedOutputRecords());
+
+      ASSERT_EQ(
+          actual_subcompaction_progress_by_level.GetOutputFiles().size(),
+          expected_subcompaction_progress_by_level.GetOutputFiles().size());
+
+      for (size_t i = 0;
+           i < expected_subcompaction_progress_by_level.GetOutputFiles().size();
+           ++i) {
+        VerifyFileMetaDataEquality(
+            expected_subcompaction_progress_by_level.GetOutputFiles()[i],
+            actual_subcompaction_progress_by_level.GetOutputFiles()[i]);
+      }
+    }
+  }
+};
+
+const std::string SubcompactionProgressTest::kTestChecksumFuncName = "crc32c";
+
+TEST_F(SubcompactionProgressTest, BasicEncodeDecode) {
+  // Create progress with files for both levels
+  SubcompactionProgress progress = CreateSubcompactionProgress(
+      "key_100",  // next_internal_key_to_compact
+      500,        // num_processed_input_records
+      400,        // num_processed_output_records
+      100,        // num_processed_proximal_level_output_records
+      {1},        // output_file_numbers
+      {2},        // proximal_file_numbers
+      "test_"     // file_prefix
+  );
+
+  auto [ignored, decoded_progress] = EncodeDecodeProgress(progress);
+
+  VerifyProgressEquality(progress, decoded_progress);
+}
+
+TEST_F(SubcompactionProgressTest, OutputFilesDeltaEncodeDecode) {
+  // Test Delta Encoding/Decoding
+  SubcompactionProgress initial_progress = CreateSubcompactionProgress(
+      "key_100",  // next_internal_key_to_compact
+      100,        // num_processed_input_records
+      40,         // num_processed_output_records
+      60,         // num_processed_proximal_level_output_records
+      {1},        // output_file_numbers
+      {2},        // proximal_file_numbers
+      "initial_"  // file_prefix
+  );
+
+  auto [initial_decoded_edit, ignored_1] =
+      EncodeDecodeProgress(initial_progress);
+  initial_progress.output_level_progress.UpdateLastPersistedOutputFilesCount();
+  initial_progress.proximal_output_level_progress
+      .UpdateLastPersistedOutputFilesCount();
+
+  // Add one new output file to output and proximal level
+  SubcompactionProgress updated_progress = initial_progress;
+  updated_progress.next_internal_key_to_compact = "key_300";
+  updated_progress.num_processed_input_records = 1000;
+  updated_progress.output_level_progress.SetNumProcessedOutputRecords(400);
+  updated_progress.proximal_output_level_progress.SetNumProcessedOutputRecords(
+      600);
+
+  FileMetaData new_file = CreateTestFile(3, "new_");
+  updated_progress.output_level_progress.AddToOutputFiles(new_file);
+
+  FileMetaData new_file_proximal = CreateTestFile(4, "new_");
+  updated_progress.proximal_output_level_progress.AddToOutputFiles(
+      new_file_proximal);
+
+  auto [delta_decoded_edit, delta_decoded_progress] =
+      EncodeDecodeProgress(updated_progress);
+
+  ASSERT_EQ(delta_decoded_progress.next_internal_key_to_compact,
+            updated_progress.next_internal_key_to_compact);
+
+  ASSERT_EQ(delta_decoded_progress.num_processed_input_records,
+            updated_progress.num_processed_input_records);
+
+  for (const bool& is_proximal_level : {false, true}) {
+    const SubcompactionProgressPerLevel& delta_progress_per_level =
+        is_proximal_level
+            ? delta_decoded_progress.proximal_output_level_progress
+            : delta_decoded_progress.output_level_progress;
+
+    const SubcompactionProgressPerLevel& updated_progress_per_level =
+        is_proximal_level ? updated_progress.proximal_output_level_progress
+                          : updated_progress.output_level_progress;
+
+    ASSERT_EQ(delta_progress_per_level.GetNumProcessedOutputRecords(),
+              updated_progress_per_level.GetNumProcessedOutputRecords());
+
+    // Only the newly added file since last persistence should be present
+    ASSERT_EQ(delta_progress_per_level.GetOutputFiles().size(), 1);
+
+    ASSERT_EQ(delta_progress_per_level.GetOutputFiles()[0].fd.GetNumber(),
+              is_proximal_level ? new_file_proximal.fd.GetNumber()
+                                : new_file.fd.GetNumber());
+  }
+
+  // Test SubcompactionProgressBuilder
+  SubcompactionProgressBuilder builder;
+  ASSERT_FALSE(builder.HasAccumulatedSubcompactionProgress());
+
+  ASSERT_TRUE(builder.ProcessVersionEdit(initial_decoded_edit));
+  ASSERT_TRUE(builder.HasAccumulatedSubcompactionProgress());
+  ASSERT_TRUE(builder.ProcessVersionEdit(delta_decoded_edit));
+
+  const auto& accumulated_progress =
+      builder.GetAccumulatedSubcompactionProgress();
+
+  ASSERT_EQ(accumulated_progress.next_internal_key_to_compact,
+            updated_progress.next_internal_key_to_compact);
+
+  ASSERT_EQ(accumulated_progress.num_processed_input_records,
+            updated_progress.num_processed_input_records);
+
+  for (const bool& is_proximal_level : {false, true}) {
+    const SubcompactionProgressPerLevel& accumulated_progress_per_level =
+        is_proximal_level ? accumulated_progress.proximal_output_level_progress
+                          : accumulated_progress.output_level_progress;
+
+    const SubcompactionProgressPerLevel& updated_progress_per_level =
+        is_proximal_level ? updated_progress.proximal_output_level_progress
+                          : updated_progress.output_level_progress;
+
+    ASSERT_EQ(accumulated_progress_per_level.GetNumProcessedOutputRecords(),
+              updated_progress_per_level.GetNumProcessedOutputRecords());
+
+    ASSERT_EQ(accumulated_progress_per_level.GetOutputFiles().size(),
+              updated_progress_per_level.GetOutputFiles().size());
+
+    std::set<uint64_t> accumulated_file_numbers;
+    for (const auto& file : accumulated_progress_per_level.GetOutputFiles()) {
+      accumulated_file_numbers.insert(file.fd.GetNumber());
+    }
+
+    std::set<uint64_t> expected_file_numbers;
+    for (const auto& file : updated_progress_per_level.GetOutputFiles()) {
+      expected_file_numbers.insert(file.fd.GetNumber());
+    }
+
+    ASSERT_EQ(accumulated_file_numbers, expected_file_numbers);
+  }
+
+  // ===== PART 3: Test Builder Reset =====
+  builder.Clear();
+  ASSERT_FALSE(builder.HasAccumulatedSubcompactionProgress());
+}
+
+TEST_F(SubcompactionProgressTest, UnknownTags) {
+  SubcompactionProgress progress;
+  std::string encoded;
+
+  // 1. Test unknown ignorable tag
+  progress.next_internal_key_to_compact = "test_key";
+  progress.num_processed_input_records = 100;
+
+  PutVarint32(&encoded,
+              SubcompactionProgressCustomTag::kNextInternalKeyToCompact);
+  PutLengthPrefixedSlice(&encoded, progress.next_internal_key_to_compact);
+
+  PutVarint32(&encoded,
+              SubcompactionProgressCustomTag::kNumProcessedInputRecords);
+  std::string varint_records;
+  PutVarint64(&varint_records, progress.num_processed_input_records);
+  PutLengthPrefixedSlice(&encoded, varint_records);
+
+  // Manually encode with unknown ignorable tag (has
+  // SubcompactionProgressCustomTag::kSubcompactionProgressCustomTagSafeIgnoreMask
+  // bit set)
+  uint32_t unknown_ignorable_tag =
+      SubcompactionProgressCustomTag::
+          kSubcompactionProgressCustomTagSafeIgnoreMask +
+      1;
+  PutVarint32(&encoded, unknown_ignorable_tag);
+  PutLengthPrefixedSlice(&encoded, "future_data");
+
+  PutVarint32(&encoded,
+              SubcompactionProgressCustomTag::kSubcompactionProgressTerminate);
+
+  // Test decoding - should succeed and ignore unknown tag
+  Slice input(encoded);
+  SubcompactionProgress decoded_progress;
+  Status s = decoded_progress.DecodeFrom(&input);
+  ASSERT_OK(s);
+
+  // Verify known fields are preserved
+  ASSERT_EQ(decoded_progress.next_internal_key_to_compact,
+            progress.next_internal_key_to_compact);
+  ASSERT_EQ(decoded_progress.num_processed_input_records,
+            progress.num_processed_input_records);
+
+  // 2. Test unknown non-ignorable tag
+  encoded.clear();
+  PutVarint32(&encoded,
+              SubcompactionProgressCustomTag::kNextInternalKeyToCompact);
+  PutLengthPrefixedSlice(&encoded, "test_key");
+
+  // Manually encode with unknown non-ignorable tag (do not have
+  // SubcompactionProgressCustomTag::kSubcompactionProgressCustomTagSafeIgnoreMask
+  // bit set)
+  uint32_t unknown_critical_tag =
+      SubcompactionProgressCustomTag::
+          kSubcompactionProgressCustomTagSafeIgnoreMask -
+      1;
+  PutVarint32(&encoded, unknown_critical_tag);
+  PutLengthPrefixedSlice(&encoded, "critical_future_data");
+  PutVarint32(&encoded,
+              SubcompactionProgressCustomTag::kSubcompactionProgressTerminate);
+
+  // Test decoding - should fail on critical unknown tag
+  Slice critical_input(encoded);
+  SubcompactionProgress critical_progress;
+  Status critical_status = critical_progress.DecodeFrom(&critical_input);
+  ASSERT_NOK(critical_status);
+  ASSERT_TRUE(critical_status.IsNotSupported());
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/version_set.cc b/db/version_set.cc
index b560713cbbab..6c9cbc82a17c 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -16,6 +16,7 @@
 #include <list>
 #include <map>
 #include <set>
+#include <stdexcept>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -95,6 +96,8 @@ namespace ROCKSDB_NAMESPACE {
 
 namespace {
 
+using ScanOptionsMap = std::unordered_map<size_t, MultiScanArgs>;
+
 // Find File in LevelFilesBrief data structure
 // Within an index range defined by left and right
 int FindFileInRange(const InternalKeyComparator& icmp,
@@ -978,7 +981,8 @@ class LevelIterator final : public InternalIterator {
           nullptr,
       bool allow_unprepared_value = false,
       std::unique_ptr<TruncatedRangeDelIterator>*** range_tombstone_iter_ptr_ =
-          nullptr)
+          nullptr,
+      Statistics* db_statistics = nullptr, SystemClock* clock = nullptr)
       : table_cache_(table_cache),
         read_options_(read_options),
         file_options_(file_options),
@@ -1002,7 +1006,10 @@ class LevelIterator final : public InternalIterator {
         skip_filters_(skip_filters),
         allow_unprepared_value_(allow_unprepared_value),
         is_next_read_sequential_(false),
-        to_return_sentinel_(false) {
+        to_return_sentinel_(false),
+        scan_opts_(nullptr),
+        db_statistics_(db_statistics),
+        clock_(clock) {
     // Empty level is not supported.
     assert(flevel_ != nullptr && flevel_->num_files > 0);
     if (range_tombstone_iter_ptr_) {
@@ -1010,7 +1017,15 @@ class LevelIterator final : public InternalIterator {
     }
   }
 
-  ~LevelIterator() override { delete file_iter_.Set(nullptr); }
+  ~LevelIterator() override {
+    delete file_iter_.Set(nullptr);
+    // Clean up any prepared iterators that weren't used
+    for (auto& entry : prepared_iters_) {
+      delete entry.second;
+    }
+    prepared_iters_.clear();
+    assert(prepared_iters_.size() == 0);
+  }
 
   // Seek to the first file with a key >= target.
   // If range_tombstone_iter_ is not nullptr, then we pretend that file
@@ -1098,6 +1113,136 @@ class LevelIterator final : public InternalIterator {
     read_seq_ = read_seq;
   }
 
+  inline bool FileHasMultiScanArg(size_t file_index) {
+    if (file_to_scan_opts_.get()) {
+      auto it = file_to_scan_opts_->find(file_index);
+      if (it != file_to_scan_opts_->end()) {
+        return !it->second.empty();
+      }
+    }
+    return false;
+  }
+
+  MultiScanArgs& GetMultiScanArgForFile(size_t file_index) {
+    auto multi_scan_args_it = file_to_scan_opts_->find(file_index);
+    if (multi_scan_args_it == file_to_scan_opts_->end()) {
+      auto ret = file_to_scan_opts_->emplace(
+          file_index, MultiScanArgs(user_comparator_.user_comparator()));
+      multi_scan_args_it = ret.first;
+      assert(ret.second);
+    }
+    return multi_scan_args_it->second;
+  }
+
+  void Prepare(const MultiScanArgs* so) override {
+    // We assume here that scan_opts is sorted such that
+    // scan_opts[0].range.start < scan_opts[1].range.start, and non
+    // overlapping
+    if (so == nullptr) {
+      return;
+    }
+
+    scan_opts_ = so;
+
+    // Verify comparator is consistent
+    assert(so->GetComparator() == user_comparator_.user_comparator());
+
+    file_to_scan_opts_ = std::make_unique<ScanOptionsMap>();
+    for (size_t k = 0; k < scan_opts_->size(); k++) {
+      const ScanOptions& opt = scan_opts_->GetScanRanges().at(k);
+      auto start = opt.range.start;
+      auto end = opt.range.limit;
+
+      if (!start.has_value()) {
+        continue;
+      }
+
+      // We can capture this case in the future, but for now lets skip this.
+      if (!end.has_value()) {
+        continue;
+      }
+
+      const size_t timestamp_size =
+          user_comparator_.user_comparator()->timestamp_size();
+      InternalKey istart, iend;
+      if (timestamp_size == 0) {
+        istart =
+            InternalKey(start.value(), kMaxSequenceNumber, kValueTypeForSeek);
+        // end key is exclusive for multiscan
+        iend = InternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
+      } else {
+        std::string start_key_with_ts, end_key_with_ts;
+        AppendKeyWithMaxTimestamp(&start_key_with_ts, start.value(),
+                                  timestamp_size);
+        AppendKeyWithMaxTimestamp(&end_key_with_ts, end.value(),
+                                  timestamp_size);
+        istart = InternalKey(start_key_with_ts, kMaxSequenceNumber,
+                             kValueTypeForSeek);
+        // end key is exclusive for multiscan
+        iend =
+            InternalKey(end_key_with_ts, kMaxSequenceNumber, kValueTypeForSeek);
+      }
+
+      // TODO: This needs to be optimized, right now we iterate twice, which
+      // we dont need to. We can do this in N rather than 2N.
+      size_t fstart = FindFile(icomparator_, *flevel_, istart.Encode());
+      size_t fend = FindFile(icomparator_, *flevel_, iend.Encode());
+
+      // We need to check the relevant cases
+      // Cases:
+      // 1. [  S        E  ]
+      // 2. [  S  ]  [  E  ]
+      // 3. [  S  ] ...... [  E  ]
+      for (auto i = fstart; i <= fend; i++) {
+        if (i < flevel_->num_files) {
+          // FindFile only compares against the largest_key, so we need this
+          // additional check to ensure the scan range overlaps the file
+          if (icomparator_.InternalKeyComparator::Compare(
+                  iend.Encode(), flevel_->files[i].smallest_key) < 0) {
+            continue;
+          }
+          auto const metadata = flevel_->files[i].file_metadata;
+          if (metadata->FileIsStandAloneRangeTombstone()) {
+            // Skip stand alone range deletion files.
+            continue;
+          }
+          auto& args = GetMultiScanArgForFile(i);
+          args.insert(start.value(), end.value(), opt.property_bag);
+        }
+      }
+    }
+
+    StopWatch timer(clock_, db_statistics_, MULTISCAN_PREPARE_ITERATORS);
+
+    // Propagate multiscan configs
+    for (auto& file_to_arg : *file_to_scan_opts_) {
+      file_to_arg.second.CopyConfigFrom(*so);
+      assert(OverlapRange(*file_to_arg.second.GetScanRanges().begin(),
+                          file_to_arg.first) &&
+             OverlapRange(*file_to_arg.second.GetScanRanges().rbegin(),
+                          file_to_arg.first));
+    }
+
+    if (so->use_async_io) {
+      auto before = file_index_;
+      // Pre-create and prepare only relevant file iterators
+      for (auto& file_to_arg : *file_to_scan_opts_) {
+        size_t file_index = file_to_arg.first;
+
+        file_index_ = file_index;
+        // Create iterator for this file
+        auto iter = NewFileIterator();
+        if (iter != nullptr) {
+          // If we have async enabled, lets prepare all our iterators.
+          iter->Prepare(&file_to_arg.second);
+          // Store the prepared iterator
+          prepared_iters_[file_index] = iter;
+        }
+      }
+      file_index_ = before;
+    }
+  }
+
  private:
   // Return true if at least one invalid file is seen and skipped.
   bool SkipEmptyFileForward();
@@ -1170,6 +1315,10 @@ class LevelIterator final : public InternalIterator {
     }
   }
 
+#ifndef NDEBUG
+  bool OverlapRange(const ScanOptions& opts, size_t file_index);
+#endif
+
   TableCache* table_cache_;
   const ReadOptions& read_options_;
   const FileOptions& file_options_;
@@ -1223,6 +1372,16 @@ class LevelIterator final : public InternalIterator {
   bool prefix_exhausted_ = false;
   // Whether next/prev key is a sentinel key.
   bool to_return_sentinel_ = false;
+  const MultiScanArgs* scan_opts_ = nullptr;
+
+  Statistics* db_statistics_ = nullptr;
+  SystemClock* clock_ = nullptr;
+
+  // Our stored scan_opts for each prefix
+  std::unique_ptr<ScanOptionsMap> file_to_scan_opts_ = nullptr;
+
+  // Map to store pre-created iterators by file index
+  std::unordered_map<size_t, InternalIterator*> prepared_iters_;
 
   // Sets flags for if we should return the sentinel key next.
   // The condition for returning sentinel is reaching the end of current
@@ -1263,6 +1422,14 @@ void LevelIterator::Seek(const Slice& target) {
   }
 
   if (file_iter_.iter() != nullptr) {
+    if (scan_opts_) {
+      // At this point, we only know that the seek target is < largest_key
+      // in the file. We need to check whether there is actual overlap.
+      const FdWithKeyRange& cur_file = flevel_->files[file_index_];
+      if (KeyReachedUpperBound(cur_file.smallest_key)) {
+        return;
+      }
+    }
     file_iter_.Seek(target);
     // Status::TryAgain indicates asynchronous request for retrieval of data
     // blocks has been submitted. So it should return at this point and Seek
@@ -1485,7 +1652,31 @@ bool LevelIterator::SkipEmptyFileForward() {
     // LevelIterator::Seek*, it should also call Seek* into the corresponding
     // range tombstone iterator.
     if (file_iter_.iter() != nullptr) {
-      file_iter_.SeekToFirst();
+      // If we are doing prepared scan opts then we should seek to the values
+      // specified by the scan opts
+
+      if (scan_opts_ && FileHasMultiScanArg(file_index_)) {
+        const ScanOptions& opts =
+            GetMultiScanArgForFile(file_index_).GetScanRanges().front();
+        if (opts.range.start.has_value()) {
+          InternalKey target;
+          const size_t ts_size =
+              user_comparator_.user_comparator()->timestamp_size();
+          if (ts_size == 0) {
+            target = InternalKey(opts.range.start.value(), kMaxSequenceNumber,
+                                 kValueTypeForSeek);
+          } else {
+            std::string seek_key;
+            AppendKeyWithMaxTimestamp(&seek_key, opts.range.start.value(),
+                                      ts_size);
+            target =
+                InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek);
+          }
+          file_iter_.Seek(target.Encode());
+        }
+      } else {
+        file_iter_.SeekToFirst();
+      }
       if (range_tombstone_iter_) {
         if (*range_tombstone_iter_) {
           (*range_tombstone_iter_)->SeekToFirst();
@@ -1527,13 +1718,25 @@ void LevelIterator::SkipEmptyFileBackward() {
   }
 }
 
+#ifndef NDEBUG
+bool LevelIterator::OverlapRange(const ScanOptions& opts, size_t file_index) {
+  return (user_comparator_.CompareWithoutTimestamp(
+              opts.range.start.value(), /*a_has_ts=*/false,
+              ExtractUserKey(flevel_->files[file_index].largest_key),
+              /*b_has_ts=*/true) <= 0 &&
+          user_comparator_.CompareWithoutTimestamp(
+              opts.range.limit.value(), /*a_has_ts=*/false,
+              ExtractUserKey(flevel_->files[file_index].smallest_key),
+              /*b_has_ts=*/true) > 0);
+}
+#endif
+
 void LevelIterator::SetFileIterator(InternalIterator* iter) {
   if (pinned_iters_mgr_ && iter) {
     iter->SetPinnedItersMgr(pinned_iters_mgr_);
   }
 
   InternalIterator* old_iter = file_iter_.Set(iter);
-
   // Update the read pattern for PrefetchBuffer.
   if (is_next_read_sequential_) {
     file_iter_.UpdateReadaheadState(old_iter);
@@ -1563,11 +1766,29 @@ void LevelIterator::InitFileIterator(size_t new_file_index) {
       // no need to change anything
     } else {
       file_index_ = new_file_index;
+      if (!prepared_iters_.empty()) {
+        auto prepared_it = prepared_iters_.find(file_index_);
+        if (prepared_it != prepared_iters_.end()) {
+          InternalIterator* iter = prepared_it->second;
+          prepared_iters_.erase(prepared_it);
+          SetFileIterator(iter);
+          return;
+        }
+      }
+
       InternalIterator* iter = NewFileIterator();
+      if (FileHasMultiScanArg(file_index_)) {
+        auto& args = GetMultiScanArgForFile(file_index_);
+        assert(OverlapRange(*args.GetScanRanges().begin(), file_index_) &&
+               OverlapRange(*args.GetScanRanges().rbegin(), file_index_));
+        iter->Prepare(&args);
+      }
+
       SetFileIterator(iter);
     }
   }
 }
+
 }  // anonymous namespace
 
 Status Version::GetTableProperties(const ReadOptions& read_options,
@@ -1599,8 +1820,10 @@ Status Version::GetTableProperties(const ReadOptions& read_options,
     file_name = TableFileName(ioptions.cf_paths, file_meta->fd.GetNumber(),
                               file_meta->fd.GetPathId());
   }
-  s = ioptions.fs->NewRandomAccessFile(file_name, file_options_, &file,
-                                       nullptr);
+  FileOptions fopts = file_options_;
+  fopts.file_checksum = file_meta->file_checksum;
+  fopts.file_checksum_func_name = file_meta->file_checksum_func_name;
+  s = ioptions.fs->NewRandomAccessFile(file_name, fopts, &file, nullptr);
   if (!s.ok()) {
     return s;
   }
@@ -1627,8 +1850,8 @@ Status Version::GetTableProperties(const ReadOptions& read_options,
   return s;
 }
 
-Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options,
-                                         TablePropertiesCollection* props) {
+Status Version::GetPropertiesOfAllTables(
+    const ReadOptions& read_options, TablePropertiesCollection* props) const {
   Status s;
   for (int level = 0; level < storage_info_.num_levels_; level++) {
     s = GetPropertiesOfAllTables(read_options, props, level);
@@ -1699,7 +1922,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
 
 Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options,
                                          TablePropertiesCollection* props,
-                                         int level) {
+                                         int level) const {
   for (const auto& file_meta : storage_info_.files_[level]) {
     auto fname =
         TableFileName(cfd_->ioptions().cf_paths, file_meta->fd.GetNumber(),
@@ -1753,6 +1976,24 @@ Status Version::GetPropertiesOfTablesInRange(
   return Status::OK();
 }
 
+Status Version::GetPropertiesOfTablesByLevel(
+    const ReadOptions& read_options,
+    std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level)
+    const {
+  Status s;
+
+  props_by_level->reserve(storage_info_.num_levels_);
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    props_by_level->push_back(std::make_unique<TablePropertiesCollection>());
+    s = GetPropertiesOfAllTables(read_options, props_by_level->back().get(),
+                                 level);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
 Status Version::GetAggregatedTableProperties(
     const ReadOptions& read_options, std::shared_ptr<const TableProperties>* tp,
     int level) {
@@ -1850,6 +2091,79 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
   }
 }
 
+void Version::GetColumnFamilyMetaData(
+    const GetColumnFamilyMetaDataOptions& options,
+    ColumnFamilyMetaData* cf_meta) {
+  assert(cf_meta);
+  assert(cfd_);
+
+  cf_meta->name = cfd_->GetName();
+  cf_meta->size = 0;
+  cf_meta->file_count = 0;
+  cf_meta->levels.clear();
+  cf_meta->blob_file_size = 0;
+  cf_meta->blob_file_count = 0;
+  cf_meta->blob_files.clear();
+
+  const auto& ioptions = cfd_->ioptions();
+  auto* vstorage = storage_info();
+
+  int first_level = (options.level >= 0) ? options.level : 0;
+  int last_level =
+      (options.level >= 0) ? options.level + 1 : cfd_->NumberLevels();
+
+  InternalKey ikey_start, ikey_end;
+  const InternalKey* begin = nullptr;
+  const InternalKey* end = nullptr;
+
+  if (options.range.start.has_value()) {
+    ikey_start = InternalKey(options.range.start.value(), kMaxSequenceNumber,
+                             kValueTypeForSeek);
+    begin = &ikey_start;
+  }
+
+  if (options.range.limit.has_value()) {
+    ikey_end = InternalKey(options.range.limit.value(), kMaxSequenceNumber,
+                           kValueTypeForSeek);
+    end = &ikey_end;
+  }
+
+  for (int l = first_level; l < last_level; ++l) {
+    uint64_t level_size = 0;
+    std::vector<SstFileMetaData> files;
+    std::vector<FileMetaData*> overlapping_files;
+    vstorage->GetOverlappingInputs(l, begin, end, &overlapping_files);
+
+    for (const auto& file : overlapping_files) {
+      uint32_t path_id = file->fd.GetPathId();
+      const auto& file_path = (path_id < ioptions.cf_paths.size())
+                                  ? ioptions.cf_paths[path_id].path
+                                  : ioptions.cf_paths.back().path;
+      const uint64_t file_number = file->fd.GetNumber();
+      files.emplace_back(
+          MakeTableFileName("", file_number), file_number, file_path,
+          file->fd.GetFileSize(), file->fd.smallest_seqno,
+          file->fd.largest_seqno, file->smallest.user_key().ToString(),
+          file->largest.user_key().ToString(),
+          file->stats.num_reads_sampled.load(std::memory_order_relaxed),
+          file->being_compacted, file->temperature,
+          file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
+          file->TryGetFileCreationTime(), file->epoch_number,
+          file->file_checksum, file->file_checksum_func_name);
+      files.back().num_entries = file->num_entries;
+      files.back().num_deletions = file->num_deletions;
+      files.back().smallest = file->smallest.Encode().ToString();
+      files.back().largest = file->largest.Encode().ToString();
+      level_size += file->fd.GetFileSize();
+      cf_meta->file_count++;
+    }
+    if (!files.empty()) {
+      cf_meta->levels.emplace_back(l, level_size, std::move(files));
+      cf_meta->size += level_size;
+    }
+  }
+}
+
 uint64_t Version::GetSstFilesSize() {
   uint64_t sst_files_size = 0;
   for (int level = 0; level < storage_info_.num_levels_; level++) {
@@ -1934,7 +2248,7 @@ InternalIterator* Version::TEST_GetLevelIterator(
       cfd_->internal_stats()->GetFileReadHist(level),
       TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
       nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
-      allow_unprepared_value, &tombstone_iter_ptr);
+      allow_unprepared_value, &tombstone_iter_ptr, db_statistics_, clock_);
   if (read_options.ignore_range_deletions) {
     merge_iter_builder->AddIterator(level_iter);
   } else {
@@ -2074,7 +2388,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
         TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
         /*range_del_agg=*/nullptr,
         /*compaction_boundaries=*/nullptr, allow_unprepared_value,
-        &tombstone_iter_ptr);
+        &tombstone_iter_ptr, db_statistics_, clock_);
     if (read_options.ignore_range_deletions) {
       merge_iter_builder->AddIterator(level_iter);
     } else {
@@ -2131,7 +2445,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
         mutable_cf_options_, should_sample_file_read(),
         cfd_->internal_stats()->GetFileReadHist(level),
         TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
-        &range_del_agg, nullptr, false));
+        &range_del_agg, nullptr, false, nullptr, db_statistics_, clock_));
     status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
                                  iter.get(), overlap);
   }
@@ -2709,9 +3023,10 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
           RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
                      mget_tasks.size());
           // Collect all results so far
-          std::vector<Status> statuses = folly::coro::blockingWait(
-              folly::coro::collectAllRange(std::move(mget_tasks))
-                  .scheduleOn(&range->context()->executor()));
+          std::vector<Status> statuses =
+              folly::coro::blockingWait(co_withExecutor(
+                  &range->context()->executor(),
+                  folly::coro::collectAllRange(std::move(mget_tasks))));
           if (s.ok()) {
             for (Status stat : statuses) {
               if (!stat.ok()) {
@@ -2996,9 +3311,10 @@ Status Version::MultiGetAsync(
         assert(waiting.size());
         RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
         // Collect all results so far
-        std::vector<Status> statuses = folly::coro::blockingWait(
-            folly::coro::collectAllRange(std::move(mget_tasks))
-                .scheduleOn(&range->context()->executor()));
+        std::vector<Status> statuses =
+            folly::coro::blockingWait(co_withExecutor(
+                &range->context()->executor(),
+                folly::coro::collectAllRange(std::move(mget_tasks))));
         mget_tasks.clear();
         if (s.ok()) {
           for (Status stat : statuses) {
@@ -3130,7 +3446,7 @@ bool Version::MaybeInitializeFileMetaData(const ReadOptions& read_options,
   // Ensure new invariants on old files
   file_meta->num_deletions =
       std::max(tp->num_deletions, tp->num_range_deletions);
-  file_meta->num_entries = std::max(tp->num_entries, tp->num_deletions);
+  file_meta->num_entries = std::max(tp->num_entries, file_meta->num_deletions);
   return true;
 }
 
@@ -3423,7 +3739,8 @@ bool ShouldChangeFileTemperature(const ImmutableOptions& ioptions,
 
 void VersionStorageInfo::ComputeCompactionScore(
     const ImmutableOptions& immutable_options,
-    const MutableCFOptions& mutable_cf_options) {
+    const MutableCFOptions& mutable_cf_options,
+    const std::string& full_history_ts_low) {
   double total_downcompact_bytes = 0.0;
   // Historically, score is defined as actual bytes in a level divided by
   // the level's target size, and 1.0 is the threshold for triggering
@@ -3434,7 +3751,9 @@ void VersionStorageInfo::ComputeCompactionScore(
   // maintaining it to be over 1.0, we scale the original score by 10x
   // if it is larger than 1.0.
   const double kScoreScale = 10.0;
-  int max_output_level = MaxOutputLevel(immutable_options.allow_ingest_behind);
+  int max_output_level =
+      MaxOutputLevel(immutable_options.cf_allow_ingest_behind ||
+                     immutable_options.allow_ingest_behind);
   for (int level = 0; level <= MaxInputLevel(); level++) {
     double score;
     if (level == 0) {
@@ -3475,10 +3794,20 @@ void VersionStorageInfo::ComputeCompactionScore(
       }
 
       if (compaction_style_ == kCompactionStyleFIFO) {
-        score = static_cast<double>(total_size) /
-                mutable_cf_options.compaction_options_fifo.max_table_files_size;
-        if (score < 1 &&
-            mutable_cf_options.compaction_options_fifo.allow_compaction) {
+        const auto& fifo_opts = mutable_cf_options.compaction_options_fifo;
+        uint64_t effective_size = total_size;
+        uint64_t effective_max = fifo_opts.max_table_files_size;
+        if (fifo_opts.max_data_files_size > 0) {
+          // Blob-aware: include blob file sizes in the total
+          effective_size += GetBlobStats().total_file_size;
+          effective_max = fifo_opts.max_data_files_size;
+        }
+        if (effective_max == 0) {
+          // avoid divide 0
+          effective_max = 1;
+        }
+        score = static_cast<double>(effective_size) / effective_max;
+        if (score < 1 && fifo_opts.allow_compaction) {
           score = std::max(
               static_cast<double>(num_sorted_runs) /
                   mutable_cf_options.level0_file_num_compaction_trigger,
@@ -3614,7 +3943,9 @@ void VersionStorageInfo::ComputeCompactionScore(
   }
   ComputeFilesMarkedForCompaction(max_output_level);
   ComputeBottommostFilesMarkedForCompaction(
-      immutable_options.allow_ingest_behind);
+      immutable_options.cf_allow_ingest_behind ||
+          immutable_options.allow_ingest_behind,
+      immutable_options.user_comparator, full_history_ts_low);
   ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
   ComputeFilesMarkedForPeriodicCompaction(
       immutable_options, mutable_cf_options.periodic_compaction_seconds,
@@ -4205,17 +4536,20 @@ void VersionStorageInfo::GenerateFileLocationIndex() {
   }
 }
 
-void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum,
-                                              bool allow_ingest_behind) {
+void VersionStorageInfo::UpdateOldestSnapshot(
+    SequenceNumber seqnum, bool allow_ingest_behind, const Comparator* ucmp,
+    const std::string& full_history_ts_low) {
   assert(seqnum >= oldest_snapshot_seqnum_);
   oldest_snapshot_seqnum_ = seqnum;
   if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) {
-    ComputeBottommostFilesMarkedForCompaction(allow_ingest_behind);
+    ComputeBottommostFilesMarkedForCompaction(allow_ingest_behind, ucmp,
+                                              full_history_ts_low);
   }
 }
 
 void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction(
-    bool allow_ingest_behind) {
+    bool allow_ingest_behind, const Comparator* ucmp,
+    const std::string& full_history_ts_low) {
   bottommost_files_marked_for_compaction_.clear();
   bottommost_files_mark_threshold_ = kMaxSequenceNumber;
   if (allow_ingest_behind) {
@@ -4236,12 +4570,39 @@ void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction(
         current_time - static_cast<int64_t>(bottommost_file_compaction_delay_);
   }
 
+  // For UDT, we need to check if the file's max timestamp is below
+  // full_history_ts_low. If not, the compaction won't be able to collapse the
+  // timestamp to clean up the tombstone , so marking the file would be futile
+  // and could cause an infinite compaction loop.
+  const bool has_udt = ucmp && ucmp->timestamp_size() > 0;
+
   for (auto& level_and_file : bottommost_files_) {
     if (!level_and_file.second->being_compacted &&
         level_and_file.second->fd.largest_seqno != 0) {
       // largest_seqno might be nonzero due to containing the final key in an
       // earlier compaction, whose seqnum we didn't zero out.
       if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) {
+        if (has_udt) {
+          const std::string& max_ts = level_and_file.second->max_timestamp;
+          // If max_timestamp is empty, the file could come from very old
+          // version which does not have timestamp. In that case, we should pick
+          // the file for compaction. After compaction, the file will have
+          // max_timestamp set propertly.
+          if (!max_ts.empty()) {
+            // If full_history_ts_low is empty, it means it was never set, which
+            // means its value is 0. Therefore, it would be always smaller than
+            // max_timestamp
+            if (full_history_ts_low.empty()) {
+              continue;
+            }
+            // If max timestamp >= full_history_ts_low, skip this file
+            if (ucmp->CompareTimestamp(Slice(max_ts), full_history_ts_low) >=
+                0) {
+              continue;
+            }
+          }
+        }
+
         if (!needs_delay) {
           bottommost_files_marked_for_compaction_.push_back(level_and_file);
         } else if (creation_time_ub > 0) {
@@ -4303,7 +4664,8 @@ bool VersionStorageInfo::OverlapInLevel(int level,
 void VersionStorageInfo::GetOverlappingInputs(
     int level, const InternalKey* begin, const InternalKey* end,
     std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
-    bool expand_range, InternalKey** next_smallest) const {
+    bool expand_range, const FileMetaData* starting_l0_file,
+    InternalKey** next_smallest) const {
   if (level >= num_non_empty_levels_) {
     // this level is empty, no overlapping inputs
     return;
@@ -4336,7 +4698,19 @@ void VersionStorageInfo::GetOverlappingInputs(
 
   // index stores the file index need to check.
   std::list<size_t> index;
-  for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+  size_t start_index = 0;
+  if (starting_l0_file != nullptr) {
+    uint64_t starting_file_number = starting_l0_file->fd.GetNumber();
+    for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+      if (level_files_brief_[level].files[i].fd.GetNumber() ==
+          starting_file_number) {
+        start_index = i;
+        break;
+      }
+    }
+    assert(start_index < level_files_brief_[level].num_files);
+  }
+  for (size_t i = start_index; i < level_files_brief_[level].num_files; i++) {
     index.emplace_back(i);
   }
 
@@ -4611,8 +4985,7 @@ void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd,
   if (restart_epoch) {
     cfd->ResetNextEpochNumber();
 
-    bool reserve_epoch_num_for_file_ingested_behind =
-        cfd->ioptions().allow_ingest_behind;
+    bool reserve_epoch_num_for_file_ingested_behind = cfd->AllowIngestBehind();
     if (reserve_epoch_num_for_file_ingested_behind) {
       uint64_t reserved_epoch_number = cfd->NewEpochNumber();
       assert(reserved_epoch_number ==
@@ -4620,7 +4993,8 @@ void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd,
       ROCKS_LOG_INFO(cfd->ioptions().info_log.get(),
                      "[%s]CF has reserved epoch number %" PRIu64
                      " for files ingested "
-                     "behind since `Options::allow_ingest_behind` is true",
+                     "behind since `Options::allow_ingest_behind` or "
+                     "`Options::cf_allow_ingest_behind` is true",
                      cfd->GetName().c_str(), reserved_epoch_number);
     }
   }
@@ -4761,7 +5135,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
             cur_level_size <= base_bytes_min &&
             (options.preclude_last_level_data_seconds == 0 ||
              i < num_levels_ - 2)) {
-          // When per_key_placement is enabled, the penultimate level is
+          // When per_key_placement is enabled, the proximal level is
           // necessary.
           lowest_unnecessary_level_ = i;
         }
@@ -4903,24 +5277,38 @@ bool VersionStorageInfo::RangeMightExistAfterSortedRun(
 }
 
 Env::WriteLifeTimeHint VersionStorageInfo::CalculateSSTWriteHint(
-    int level) const {
-  if (compaction_style_ != kCompactionStyleLevel) {
+    int level, CompactionStyleSet compaction_style_set) const {
+  if (!compaction_style_set.Contains(compaction_style_)) {
     return Env::WLTH_NOT_SET;
   }
-  if (level == 0) {
-    return Env::WLTH_MEDIUM;
-  }
 
-  // L1: medium, L2: long, ...
-  if (level - base_level_ >= 2) {
-    return Env::WLTH_EXTREME;
-  } else if (level < base_level_) {
-    // There is no restriction which prevents level passed in to be smaller
-    // than base_level.
-    return Env::WLTH_MEDIUM;
+  switch (compaction_style_) {
+    case kCompactionStyleLevel:
+      if (level == 0) {
+        return Env::WLTH_MEDIUM;
+      }
+
+      // L1: medium, L2: long, ...
+      if (level - base_level_ >= 2) {
+        return Env::WLTH_EXTREME;
+      } else if (level < base_level_) {
+        // There is no restriction which prevents level passed in to be smaller
+        // than base_level.
+        return Env::WLTH_MEDIUM;
+      }
+      return static_cast<Env::WriteLifeTimeHint>(
+          level - base_level_ + static_cast<int>(Env::WLTH_MEDIUM));
+    case kCompactionStyleUniversal:
+      if (level == 0) {
+        return Env::WLTH_SHORT;
+      }
+      if (level == 1) {
+        return Env::WLTH_MEDIUM;
+      }
+      return Env::WLTH_LONG;
+    default:
+      return Env::WLTH_NOT_SET;
   }
-  return static_cast<Env::WriteLifeTimeHint>(
-      level - base_level_ + static_cast<int>(Env::WLTH_MEDIUM));
 }
 
 void Version::AddLiveFiles(std::vector<uint64_t>* live_table_files,
@@ -5109,12 +5497,13 @@ void AtomicGroupReadBuffer::Clear() {
 
 VersionSet::VersionSet(
     const std::string& dbname, const ImmutableDBOptions* _db_options,
+    const MutableDBOptions& mutable_db_options,
     const FileOptions& storage_options, Cache* table_cache,
     WriteBufferManager* write_buffer_manager, WriteController* write_controller,
     BlockCacheTracer* const block_cache_tracer,
     const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
     const std::string& db_session_id, const std::string& daily_offpeak_time_utc,
-    ErrorHandler* const error_handler, const bool read_only)
+    ErrorHandler* error_handler, bool unchanging)
     : column_family_set_(new ColumnFamilySet(
           dbname, _db_options, storage_options, table_cache,
           write_buffer_manager, write_controller, block_cache_tracer, io_tracer,
@@ -5137,18 +5526,21 @@ VersionSet::VersionSet(
       prev_log_number_(0),
       current_version_number_(0),
       manifest_file_size_(0),
+      last_compacted_manifest_file_size_(0),
       file_options_(storage_options),
       block_cache_tracer_(block_cache_tracer),
       io_tracer_(io_tracer),
       db_session_id_(db_session_id),
       offpeak_time_option_(OffpeakTimeOption(daily_offpeak_time_utc)),
       error_handler_(error_handler),
-      read_only_(read_only),
-      closed_(false) {}
+      unchanging_(unchanging),
+      closed_(false) {
+  UpdatedMutableDbOptions(mutable_db_options, /*mu=*/nullptr);
+}
 
 Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) {
   Status s;
-  if (closed_ || read_only_ || !manifest_file_number_ || !descriptor_log_) {
+  if (closed_ || unchanging_ || !manifest_file_number_ || !descriptor_log_) {
     return s;
   }
 
@@ -5218,6 +5610,15 @@ void VersionSet::Reset() {
   if (column_family_set_) {
     WriteBufferManager* wbm = column_family_set_->write_buffer_manager();
     WriteController* wc = column_family_set_->write_controller();
+
+    // Clear TableCache to prevent use-after-free: Reset() deletes old
+    // ColumnFamilySet but reuses table_cache_, which may contain
+    // BlockBasedTable entries with dangling references to deleted CFD's
+    // ioptions.
+    if (table_cache_) {
+      table_cache_->EraseUnRefEntries();
+    }
+
     // db_id becomes the source of truth after DBImpl::Recover():
     // https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527
     // Note: we may not be able to recover db_id from MANIFEST if
@@ -5240,17 +5641,45 @@ void VersionSet::Reset() {
   current_version_number_ = 0;
   manifest_writers_.clear();
   manifest_file_size_ = 0;
+  last_compacted_manifest_file_size_ = 0;
+  TuneMaxManifestFileSize();
   obsolete_files_.clear();
   obsolete_manifests_.clear();
   wals_.Reset();
 }
 
+void VersionSet::UpdatedMutableDbOptions(
+    const MutableDBOptions& updated_options, InstrumentedMutex* mu) {
+  // Must be holding mutex if not called during initialization
+  if (mu) {
+    mu->AssertHeld();
+  } else {
+    // manifest_file_size_ must be 0 if called from the constructor
+    assert(manifest_file_size_ == 0);
+  }
+  file_options_.writable_file_max_buffer_size =
+      updated_options.writable_file_max_buffer_size;
+  min_max_manifest_file_size_ = updated_options.max_manifest_file_size;
+  max_manifest_space_amp_pct_ = static_cast<unsigned>(
+      std::max(updated_options.max_manifest_space_amp_pct, 0));
+  manifest_preallocation_size_ = updated_options.manifest_preallocation_size;
+  TuneMaxManifestFileSize();
+}
+
+void VersionSet::TuneMaxManifestFileSize() {
+  tuned_max_manifest_file_size_ =
+      std::max(min_max_manifest_file_size_,
+               last_compacted_manifest_file_size_ *
+                   (100U + max_manifest_space_amp_pct_) / 100U);
+}
+
 void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
                                Version* v) {
   // compute new compaction score
   v->storage_info()->ComputeCompactionScore(
       column_family_data->ioptions(),
-      column_family_data->GetLatestMutableCFOptions());
+      column_family_data->GetLatestMutableCFOptions(),
+      column_family_data->GetFullHistoryTsLow());
 
   // Mark v finalized
   v->storage_info_.SetFinalized();
@@ -5327,8 +5756,8 @@ Status VersionSet::ProcessManifestWrites(
         // the preceding version edits in the same atomic group, and update
         // their `remaining_entries_` member variable because we are NOT going
         // to write the version edits' of dropped CF to the MANIFEST. If we
-        // don't update, then Recover can report corrupted atomic group because
-        // the `remaining_entries_` do not match.
+        // don't update, then Recover can report corrupted atomic group
+        // because the `remaining_entries_` do not match.
         if (!batch_edits.empty()) {
           if (batch_edits.back()->IsInAtomicGroup() &&
               batch_edits.back()->GetRemainingEntries() > 0) {
@@ -5488,10 +5917,11 @@ Status VersionSet::ProcessManifestWrites(
   }
 #endif  // NDEBUG
 
+  uint64_t prev_manifest_file_size = manifest_file_size_;
   assert(pending_manifest_file_number_ == 0);
   if (!skip_manifest_write &&
       (!descriptor_log_ ||
-       manifest_file_size_ > db_options_->max_manifest_file_size)) {
+       prev_manifest_file_size >= tuned_max_manifest_file_size_)) {
     TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
     new_descriptor_log = true;
   } else {
@@ -5531,6 +5961,8 @@ Status VersionSet::ProcessManifestWrites(
   IOStatus manifest_io_status;
   manifest_io_status.PermitUncheckedError();
   std::unique_ptr<log::Writer> new_desc_log_ptr;
+  // Save before releasing mu
+  uint64_t manifest_preallocation_size = manifest_preallocation_size_;
   if (skip_manifest_write) {
     if (s.ok()) {
       constexpr bool update_stats = true;
@@ -5574,16 +6006,13 @@ Status VersionSet::ProcessManifestWrites(
       // This is fine because everything inside of this block is serialized --
       // only one thread can be here at the same time
       // create new manifest file
-      ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
-                     pending_manifest_file_number_);
       std::string descriptor_fname =
           DescriptorFileName(dbname_, pending_manifest_file_number_);
       std::unique_ptr<FSWritableFile> descriptor_file;
       io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file,
                              opt_file_opts);
       if (io_s.ok()) {
-        descriptor_file->SetPreallocationBlockSize(
-            db_options_->manifest_preallocation_size);
+        descriptor_file->SetPreallocationBlockSize(manifest_preallocation_size);
         FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
         std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
             std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
@@ -5633,10 +6062,12 @@ Status VersionSet::ProcessManifestWrites(
 #ifndef NDEBUG
         if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
           TEST_SYNC_POINT_CALLBACK(
-              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:"
+              "0",
               nullptr);
           TEST_SYNC_POINT(
-              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1");
+              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:"
+              "1");
         }
         ++idx;
 #endif /* !NDEBUG */
@@ -5673,8 +6104,8 @@ Status VersionSet::ProcessManifestWrites(
           file_options_.temperature, dir_contains_current_file);
       if (!io_s.ok()) {
         s = io_s;
-        // Quarantine old manifest file in case new manifest file's CURRENT file
-        // wasn't created successfully and the old manifest is needed.
+        // Quarantine old manifest file in case new manifest file's CURRENT
+        // file wasn't created successfully and the old manifest is needed.
         limbo_descriptor_log_file_number.push_back(manifest_file_number_);
         files_to_quarantine_if_commit_fail.push_back(
             &limbo_descriptor_log_file_number);
@@ -5684,6 +6115,13 @@ Status VersionSet::ProcessManifestWrites(
     if (s.ok()) {
       // find offset in manifest file where this version is stored.
       new_manifest_file_size = raw_desc_log_ptr->file()->GetFileSize();
+      if (new_descriptor_log) {
+        ROCKS_LOG_INFO(db_options_->info_log,
+                       "Created manifest %" PRIu64
+                       ", compacted+appended from %" PRIu64 " to %" PRIu64 "\n",
+                       pending_manifest_file_number_, prev_manifest_file_size,
+                       new_manifest_file_size);
+      }
     }
 
     if (first_writer.edit_list.front()->IsColumnFamilyDrop()) {
@@ -5732,6 +6170,8 @@ Status VersionSet::ProcessManifestWrites(
     descriptor_log_ = std::move(new_desc_log_ptr);
     obsolete_manifests_.emplace_back(
         DescriptorFileName("", manifest_file_number_));
+    last_compacted_manifest_file_size_ = new_manifest_file_size;
+    TuneMaxManifestFileSize();
   }
 
   // Install the new versions
@@ -5741,7 +6181,8 @@ Status VersionSet::ProcessManifestWrites(
       assert(new_cf_options != nullptr);
       assert(max_last_sequence == descriptor_last_sequence_);
       CreateColumnFamily(*new_cf_options, read_options,
-                         first_writer.edit_list.front());
+                         first_writer.edit_list.front(),
+                         /*read_only*/ false);
     } else if (first_writer.edit_list.front()->IsColumnFamilyDrop()) {
       assert(batch_edits.size() == 1);
       assert(max_last_sequence == descriptor_last_sequence_);
@@ -5813,21 +6254,21 @@ Status VersionSet::ProcessManifestWrites(
     // that renaming tmp file to CURRENT failed.
     //
     // On local POSIX-compliant FS, the CURRENT must point to the original
-    // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
-    // keep it. Future recovery will ignore this MANIFEST. It's also ok for the
-    // process not to crash and continue using the db. Any future LogAndApply()
-    // call will switch to a new MANIFEST and update CURRENT, still ignoring
-    // this one.
+    // MANIFEST. We can delete the new MANIFEST for simplicity, but we can
+    // also keep it. Future recovery will ignore this MANIFEST. It's also ok
+    // for the process not to crash and continue using the db. Any future
+    // LogAndApply() call will switch to a new MANIFEST and update CURRENT,
+    // still ignoring this one.
     //
     // On non-local FS, it is
     // possible that the rename operation succeeded on the server (remote)
     // side, but the client somehow returns a non-ok status to RocksDB. Note
     // that this does not violate atomicity. Should we delete the new MANIFEST
     // successfully, a subsequent recovery attempt will likely see the CURRENT
-    // pointing to the new MANIFEST, thus fail. We will not be able to open the
-    // DB again. Therefore, if manifest operations succeed, we should keep the
-    // the new MANIFEST. If the process proceeds, any future LogAndApply() call
-    // will switch to a new MANIFEST and update CURRENT. If user tries to
+    // pointing to the new MANIFEST, thus fail. We will not be able to open
+    // the DB again. Therefore, if manifest operations succeed, we should keep
+    // the the new MANIFEST. If the process proceeds, any future LogAndApply()
+    // call will switch to a new MANIFEST and update CURRENT. If user tries to
     // re-open the DB,
     // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
     // b) CURRENT points to the original MANIFEST, and the original MANIFEST
@@ -5956,9 +6397,9 @@ Status VersionSet::LogAndApply(
     first_writer.cv.Wait();
   }
   if (first_writer.done) {
-    // All non-CF-manipulation operations can be grouped together and committed
-    // to MANIFEST. They should all have finished. The status code is stored in
-    // the first manifest writer.
+    // All non-CF-manipulation operations can be grouped together and
+    // committed to MANIFEST. They should all have finished. The status code
+    // is stored in the first manifest writer.
 #ifndef NDEBUG
     for (const auto& writer : writers) {
       assert(writer.done);
@@ -6012,8 +6453,8 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
   assert(!edit->HasLastSequence());
   edit->SetLastSequence(*max_last_sequence);
   if (edit->IsColumnFamilyDrop()) {
-    // if we drop column family, we have to make sure to save max column family,
-    // so that we don't reuse existing ID
+    // if we drop column family, we have to make sure to save max column
+    // family, so that we don't reuse existing ID
     edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
   }
 }
@@ -6302,7 +6743,8 @@ void VersionSet::RecoverEpochNumbers() {
 Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
                                       const std::string& dbname,
                                       FileSystem* fs) {
-  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  // Read "CURRENT" file, which contains a pointer to the current manifest
+  // file
   std::string manifest_path;
   uint64_t manifest_file_number;
   Status s = GetCurrentManifestPath(dbname, fs, /*is_retry=*/false,
@@ -6364,17 +6806,19 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
   const ReadOptions read_options;
   const WriteOptions write_options;
 
-  ImmutableDBOptions db_options(*options);
+  ImmutableDBOptions imm_db_options(*options);
+  MutableDBOptions mutable_db_options(*options);
   ColumnFamilyOptions cf_options(*options);
   std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
                                         options->table_cache_numshardbits));
   WriteController wc(options->delayed_write_rate);
   WriteBufferManager wb(options->db_write_buffer_size);
-  VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
-                      nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
+  VersionSet versions(dbname, &imm_db_options, mutable_db_options, file_options,
+                      tc.get(), &wb, &wc, nullptr /*BlockCacheTracer*/,
+                      nullptr /*IOTracer*/,
                       /*db_id*/ "",
                       /*db_session_id*/ "", options->daily_offpeak_time_utc,
-                      /*error_handler_*/ nullptr, /*read_only=*/false);
+                      /*error_handler_*/ nullptr, /*unchanging=*/false);
   Status status;
 
   std::vector<ColumnFamilyDescriptor> dummy;
@@ -6457,9 +6901,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
 
 // Get the checksum information including the checksum and checksum function
 // name of all SST and blob files in VersionSet. Store the information in
-// FileChecksumList which contains a map from file number to its checksum info.
-// If DB is not running, make sure call VersionSet::Recover() to load the file
-// metadata from Manifest to VersionSet before calling this function.
+// FileChecksumList which contains a map from file number to its checksum
+// info. If DB is not running, make sure call VersionSet::Recover() to load
+// the file metadata from Manifest to VersionSet before calling this function.
 Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
   // Clean the previously stored checksum information if any.
   Status s;
@@ -6601,8 +7045,8 @@ Status VersionSet::WriteCurrentStateToManifest(
   // WARNING: This method doesn't hold a mutex!!
 
   // This is done without DB mutex lock held, but only within single-threaded
-  // LogAndApply. Column family manipulations can only happen within LogAndApply
-  // (the same single thread), so we're safe to iterate.
+  // LogAndApply. Column family manipulations can only happen within
+  // LogAndApply (the same single thread), so we're safe to iterate.
 
   assert(io_s.ok());
   if (db_options_->write_dbid_to_manifest) {
@@ -6636,9 +7080,9 @@ Status VersionSet::WriteCurrentStateToManifest(
   }
 
   // New manifest should rollover the WAL deletion record from previous
-  // manifest. Otherwise, when an addition record of a deleted WAL gets added to
-  // this new manifest later (which can happens in e.g, SyncWAL()), this new
-  // manifest creates an illusion that such WAL hasn't been deleted.
+  // manifest. Otherwise, when an addition record of a deleted WAL gets added
+  // to this new manifest later (which can happens in e.g, SyncWAL()), this
+  // new manifest creates an illusion that such WAL hasn't been deleted.
   VersionEdit wal_deletions;
   wal_deletions.DeleteWalsBefore(min_log_number_to_keep());
   std::string wal_deletions_record;
@@ -6698,7 +7142,6 @@ Status VersionSet::WriteCurrentStateToManifest(
 
         for (const auto& f : level_files) {
           assert(f);
-
           edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
                        f->fd.GetFileSize(), f->smallest, f->largest,
                        f->fd.smallest_seqno, f->fd.largest_seqno,
@@ -6707,7 +7150,8 @@ Status VersionSet::WriteCurrentStateToManifest(
                        f->file_creation_time, f->epoch_number, f->file_checksum,
                        f->file_checksum_func_name, f->unique_id,
                        f->compensated_range_deletion_size, f->tail_size,
-                       f->user_defined_timestamps_persisted);
+                       f->user_defined_timestamps_persisted, f->min_timestamp,
+                       f->max_timestamp);
         }
       }
 
@@ -6770,9 +7214,9 @@ Status VersionSet::WriteCurrentStateToManifest(
 // TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this
 // function is called repeatedly with consecutive pairs of slices. For example
 // if the slice list is [a, b, c, d] this function is called with arguments
-// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
-// we avoid doing binary search for the keys b and c twice and instead somehow
-// maintain state of where they first appear in the files.
+// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible
+// where we avoid doing binary search for the keys b and c twice and instead
+// somehow maintain state of where they first appear in the files.
 uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
                                      const ReadOptions& read_options,
                                      Version* v, const Slice& start,
@@ -6793,19 +7237,20 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
   }
 
   // Outline of the optimization that uses options.files_size_error_margin.
-  // When approximating the files total size that is used to store a keys range,
-  // we first sum up the sizes of the files that fully fall into the range.
-  // Then we sum up the sizes of all the files that may intersect with the range
-  // (this includes all files in L0 as well). Then, if total_intersecting_size
-  // is smaller than total_full_size * options.files_size_error_margin - we can
-  // infer that the intersecting files have a sufficiently negligible
-  // contribution to the total size, and we can approximate the storage required
-  // for the keys in range as just half of the intersecting_files_size.
-  // E.g., if the value of files_size_error_margin is 0.1, then the error of the
-  // approximation is limited to only ~10% of the total size of files that fully
-  // fall into the keys range. In such case, this helps to avoid a costly
-  // process of binary searching the intersecting files that is required only
-  // for a more precise calculation of the total size.
+  // When approximating the files total size that is used to store a keys
+  // range, we first sum up the sizes of the files that fully fall into the
+  // range. Then we sum up the sizes of all the files that may intersect with
+  // the range (this includes all files in L0 as well). Then, if
+  // total_intersecting_size is smaller than total_full_size *
+  // options.files_size_error_margin - we can infer that the intersecting
+  // files have a sufficiently negligible contribution to the total size, and
+  // we can approximate the storage required for the keys in range as just
+  // half of the intersecting_files_size. E.g., if the value of
+  // files_size_error_margin is 0.1, then the error of the approximation is
+  // limited to only ~10% of the total size of files that fully fall into the
+  // keys range. In such case, this helps to avoid a costly process of binary
+  // searching the intersecting files that is required only for a more precise
+  // calculation of the total size.
 
   autovector<FdWithKeyRange*, 32> first_files;
   autovector<FdWithKeyRange*, 16> last_files;
@@ -6877,10 +7322,11 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
     total_intersecting_size += file_ptr->fd.GetFileSize();
   }
 
-  // Now scan all the first & last files at each level, and estimate their size.
-  // If the total_intersecting_size is less than X% of the total_full_size - we
-  // want to approximate the result in order to avoid the costly binary search
-  // inside ApproximateSize. We use half of file size as an approximation below.
+  // Now scan all the first & last files at each level, and estimate their
+  // size. If the total_intersecting_size is less than X% of the
+  // total_full_size - we want to approximate the result in order to avoid the
+  // costly binary search inside ApproximateSize. We use half of file size as
+  // an approximation below.
 
   const double margin = options.files_size_error_margin;
   if (margin > 0 && total_intersecting_size <
@@ -7148,7 +7594,8 @@ InternalIterator* VersionSet::MakeInputIterator(
             /*no per level latency histogram=*/nullptr,
             TableReaderCaller::kCompaction, /*skip_filters=*/false,
             /*level=*/static_cast<int>(c->level(which)), range_del_agg,
-            c->boundaries(which), false, &tombstone_iter_ptr);
+            c->boundaries(which), false, &tombstone_iter_ptr,
+            db_options_->statistics.get(), clock_);
         range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
       }
     }
@@ -7294,8 +7741,10 @@ uint64_t VersionSet::GetObsoleteSstFilesSize() const {
 
 ColumnFamilyData* VersionSet::CreateColumnFamily(
     const ColumnFamilyOptions& cf_options, const ReadOptions& read_options,
-    const VersionEdit* edit) {
+    const VersionEdit* edit, bool read_only) {
   assert(edit->IsColumnFamilyAdd());
+  // Unchanging LSM tree implies no writes to the CF
+  assert(!unchanging_ || read_only);
 
   MutableCFOptions dummy_cf_options;
   Version* dummy_versions =
@@ -7305,7 +7754,7 @@ ColumnFamilyData* VersionSet::CreateColumnFamily(
   dummy_versions->Ref();
   auto new_cfd = column_family_set_->CreateColumnFamily(
       edit->GetColumnFamilyName(), edit->GetColumnFamily(), dummy_versions,
-      cf_options);
+      cf_options, read_only);
 
   Version* v = new Version(new_cfd, this, file_options_,
                            new_cfd->GetLatestMutableCFOptions(), io_tracer_,
@@ -7421,15 +7870,16 @@ Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options,
 }
 
 ReactiveVersionSet::ReactiveVersionSet(
-    const std::string& dbname, const ImmutableDBOptions* _db_options,
+    const std::string& dbname, const ImmutableDBOptions* imm_db_options,
+    const MutableDBOptions& mutable_db_options,
     const FileOptions& _file_options, Cache* table_cache,
     WriteBufferManager* write_buffer_manager, WriteController* write_controller,
     const std::shared_ptr<IOTracer>& io_tracer)
-    : VersionSet(dbname, _db_options, _file_options, table_cache,
-                 write_buffer_manager, write_controller,
+    : VersionSet(dbname, imm_db_options, mutable_db_options, _file_options,
+                 table_cache, write_buffer_manager, write_controller,
                  /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "",
                  /*db_session_id*/ "", /*daily_offpeak_time_utc*/ "",
-                 /*error_handler=*/nullptr, /*read_only=*/true) {}
+                 /*error_handler=*/nullptr, /*unchanging=*/false) {}
 
 ReactiveVersionSet::~ReactiveVersionSet() = default;
 
@@ -7550,8 +8000,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest(
     }
   } else if (s.IsPathNotFound()) {
     // This can happen if the primary switches to a new MANIFEST after the
-    // secondary reads the CURRENT file but before the secondary actually tries
-    // to open the MANIFEST.
+    // secondary reads the CURRENT file but before the secondary actually
+    // tries to open the MANIFEST.
     s = Status::TryAgain(
         "The primary may have switched to a new MANIFEST and deleted the old "
         "one.");
diff --git a/db/version_set.h b/db/version_set.h
index 72ae58f162c8..47a677cf59e6 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -200,7 +200,8 @@ class VersionStorageInfo {
   // REQUIRES: db_mutex held!!
   // TODO find a better way to pass compaction_options_fifo.
   void ComputeCompactionScore(const ImmutableOptions& immutable_options,
-                              const MutableCFOptions& mutable_cf_options);
+                              const MutableCFOptions& mutable_cf_options,
+                              const std::string& full_history_ts_low);
 
   // Estimate est_comp_needed_bytes_
   void EstimateCompactionBytesNeeded(
@@ -230,8 +231,15 @@ class VersionStorageInfo {
   // oldest snapshot changes as that is when bottom-level files can become
   // eligible for compaction.
   //
+  // For columns with User Defined Timestamps (UDT), also checks that the
+  // file's largest timestamp is below full_history_ts_low before marking,
+  // since compaction can only collapse timestamp when it is below this
+  // threshold.
+  //
   // REQUIRES: DB mutex held
-  void ComputeBottommostFilesMarkedForCompaction(bool allow_ingest_behind);
+  void ComputeBottommostFilesMarkedForCompaction(
+      bool allow_ingest_behind, const Comparator* ucmp,
+      const std::string& full_history_ts_low);
 
   // This computes files_marked_for_forced_blob_gc_ and is called by
   // ComputeCompactionScore()
@@ -248,7 +256,8 @@ class VersionStorageInfo {
   // files marked for compaction.
   // REQUIRES: DB mutex held
   void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum,
-                            bool allow_ingest_behind);
+                            bool allow_ingest_behind, const Comparator* ucmp,
+                            const std::string& full_history_ts_low);
 
   int MaxInputLevel() const;
   int MaxOutputLevel(bool allow_ingest_behind) const;
@@ -268,8 +277,13 @@ class VersionStorageInfo {
       bool expand_range = true,   // if set, returns files which overlap the
                                   // range and overlap each other. If false,
                                   // then just files intersecting the range
-      InternalKey** next_smallest = nullptr)  // if non-null, returns the
-      const;  // smallest key of next file not included
+      const FileMetaData* starting_l0_file =
+          nullptr,  // If not null, restricts L0 file selection to only include
+                    // files at or older than starting_l0_file.
+      InternalKey** next_smallest =
+          nullptr  // if non-null, returns the
+                   // smallest key of next file not included
+  ) const;
   void GetCleanInputsWithinInterval(
       int level, const InternalKey* begin,  // nullptr means before all keys
       const InternalKey* end,               // nullptr means after all keys
@@ -286,8 +300,10 @@ class VersionStorageInfo {
       int hint_index,                // index of overlap file
       int* file_index,               // return index of overlap file
       bool within_interval = false,  // if set, force the inputs within interval
-      InternalKey** next_smallest = nullptr)  // if non-null, returns the
-      const;  // smallest key of next file not included
+      InternalKey** next_smallest =
+          nullptr  // if non-null, returns the
+                   // smallest key of next file not included
+  ) const;
 
   // Returns true iff some file in the specified level overlaps
   // some part of [*smallest_user_key,*largest_user_key].
@@ -630,7 +646,8 @@ class VersionStorageInfo {
                                      const Slice& largest_user_key,
                                      int last_level, int last_l0_idx);
 
-  Env::WriteLifeTimeHint CalculateSSTWriteHint(int level) const;
+  Env::WriteLifeTimeHint CalculateSSTWriteHint(
+      int level, CompactionStyleSet compaction_style_set) const;
 
   const Comparator* user_comparator() const { return user_comparator_; }
 
@@ -668,6 +685,8 @@ class VersionStorageInfo {
 
   // List of files per level, files in each level are arranged
   // in increasing order of keys
+  // In L0, files are ordered in decreasing epoch number, meaning
+  // more recent updates are ordered first.
   std::vector<FileMetaData*>* files_;
 
   // Map of all table files in version. Maps file number to (level, position on
@@ -993,17 +1012,21 @@ class Version {
                             const FileMetaData* file_meta,
                             const std::string* fname = nullptr) const;
 
-  // REQUIRES: lock is held
   // On success, *props will be populated with all SSTables' table properties.
   // The keys of `props` are the sst file name, the values of `props` are the
   // tables' properties, represented as std::shared_ptr.
   Status GetPropertiesOfAllTables(const ReadOptions& read_options,
-                                  TablePropertiesCollection* props);
+                                  TablePropertiesCollection* props) const;
   Status GetPropertiesOfAllTables(const ReadOptions& read_options,
-                                  TablePropertiesCollection* props, int level);
+                                  TablePropertiesCollection* props,
+                                  int level) const;
   Status GetPropertiesOfTablesInRange(const ReadOptions& read_options,
                                       const autovector<UserKeyRange>& ranges,
                                       TablePropertiesCollection* props) const;
+  Status GetPropertiesOfTablesByLevel(
+      const ReadOptions& read_options,
+      std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level)
+      const;
 
   // Print summary of range delete tombstones in SST files into out_str,
   // with maximum max_entries_to_print entries printed out.
@@ -1037,6 +1060,10 @@ class Version {
 
   void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
 
+  // Get column family metadata with optional filtering by key range and level.
+  void GetColumnFamilyMetaData(const GetColumnFamilyMetaDataOptions& options,
+                               ColumnFamilyMetaData* cf_meta);
+
   void GetSstFilesBoundaryKeys(Slice* smallest_user_key,
                                Slice* largest_user_key);
 
@@ -1174,9 +1201,14 @@ class AtomicGroupReadBuffer {
 // VersionSet is the collection of versions of all the column families of the
 // database. Each database owns one VersionSet. A VersionSet has access to all
 // column families via ColumnFamilySet, i.e. set of the column families.
+// `unchanging` means the LSM tree structure of the column families will not
+// change during the lifetime of this VersionSet (true for read-only instance,
+// but false for secondary instance or writable DB).
 class VersionSet {
  public:
-  VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
+  VersionSet(const std::string& dbname,
+             const ImmutableDBOptions* imm_db_options,
+             const MutableDBOptions& mutable_db_options,
              const FileOptions& file_options, Cache* table_cache,
              WriteBufferManager* write_buffer_manager,
              WriteController* write_controller,
@@ -1184,7 +1216,7 @@ class VersionSet {
              const std::shared_ptr<IOTracer>& io_tracer,
              const std::string& db_id, const std::string& db_session_id,
              const std::string& daily_offpeak_time_utc,
-             ErrorHandler* const error_handler, const bool read_only);
+             ErrorHandler* error_handler, bool unchanging);
   // No copying allowed
   VersionSet(const VersionSet&) = delete;
   void operator=(const VersionSet&) = delete;
@@ -1193,6 +1225,13 @@ class VersionSet {
 
   virtual Status Close(FSDirectory* db_dir, InstrumentedMutex* mu);
 
+  // Requires: already holding DB mutex `mu`, to ensure
+  // * Safely read values from `updated_options`
+  // * Safely update fields on `this` (must be read elsewhere while holding mu)
+  // except `mu` can be nullptr during initialization
+  void UpdatedMutableDbOptions(const MutableDBOptions& updated_options,
+                               InstrumentedMutex* mu);
+
   Status LogAndApplyToDefaultColumnFamily(
       const ReadOptions& read_options, const WriteOptions& write_options,
       VersionEdit* edit, InstrumentedMutex* mu,
@@ -1263,8 +1302,11 @@ class VersionSet {
   void WakeUpWaitingManifestWriters();
 
   // Recover the last saved descriptor (MANIFEST) from persistent storage.
-  // If read_only == true, Recover() will not complain if some column families
-  // are not opened
+  // Unlike `unchanging` on the VersionSet, `read_only` here and in other
+  // functions below refers to the CF receiving no writes or modifications
+  // through this VersionSet, but could through external manifest updates
+  // etc. Thus, `read_only=true` for secondary instances as well as read-only
+  // instances.
   Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
                  bool read_only = false, std::string* db_id = nullptr,
                  bool no_error_if_files_missing = false, bool is_retry = false,
@@ -1342,6 +1384,8 @@ class VersionSet {
     return min_log_number_to_keep_.load();
   }
 
+  bool unchanging() const { return unchanging_; }
+
   // Allocate and return a new file number
   uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
 
@@ -1390,6 +1434,29 @@ class VersionSet {
     return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
   }
 
+  // Sync last_sequence_ with last_allocated_sequence_. This should be called
+  // during error recovery to ensure that any sequence numbers that were
+  // allocated (written to WAL) but not yet published are accounted for when
+  // creating new memtables/WALs. This prevents the "sequence number going
+  // backwards" corruption on subsequent recovery.
+  //
+  // This is necessary because with two_write_queues=true, writes allocate
+  // sequence numbers via FetchAddLastAllocatedSequence() before the write
+  // is complete, but only publish via SetLastSequence() after success.
+  // If an error occurs and recovery creates new memtables, SwitchMemtable
+  // uses LastSequence() which may be lower than already-allocated sequences.
+  //
+  // REQUIRED: DB mutex is held and no concurrent writers are active (i.e.,
+  // after WaitForBackgroundWork() in ResumeImpl).
+  void SyncLastSequenceWithAllocated() {
+    uint64_t alloc_seq =
+        last_allocated_sequence_.load(std::memory_order_seq_cst);
+    uint64_t last_seq = last_sequence_.load(std::memory_order_acquire);
+    if (alloc_seq > last_seq) {
+      last_sequence_.store(alloc_seq, std::memory_order_release);
+    }
+  }
+
   // Mark the specified file number as used.
   // REQUIRED: this is only called during single-threaded recovery or repair.
   void MarkFileNumberUsed(uint64_t number);
@@ -1533,10 +1600,6 @@ class VersionSet {
   }
 
   const FileOptions& file_options() { return file_options_; }
-  void ChangeFileOptions(const MutableDBOptions& new_options) {
-    file_options_.writable_file_max_buffer_size =
-        new_options.writable_file_max_buffer_size;
-  }
 
   // TODO - Consider updating together when file options change in SetDBOptions
   const OffpeakTimeOption& offpeak_time_option() {
@@ -1573,6 +1636,18 @@ class VersionSet {
     AppendVersion(cfd, version);
   }
 
+  bool& TEST_unchanging() { return const_cast<bool&>(unchanging_); }
+
+  uint64_t TEST_GetMinMaxManifestFileSize() {
+    return min_max_manifest_file_size_;
+  }
+  unsigned TEST_GetMaxManifestSpaceAmpPct() {
+    return max_manifest_space_amp_pct_;
+  }
+  size_t TEST_GetManifestPreallocationSize() {
+    return manifest_preallocation_size_;
+  }
+
  protected:
   struct ManifestWriter;
 
@@ -1593,6 +1668,7 @@ class VersionSet {
     }
   };
 
+  // Revert back to a post-construction state (keep same options/settings)
   void Reset();
 
   // Returns approximated offset of a key in a file for a given version.
@@ -1625,12 +1701,17 @@ class VersionSet {
 
   ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
                                        const ReadOptions& read_options,
-                                       const VersionEdit* edit);
+                                       const VersionEdit* edit, bool read_only);
 
   Status VerifyFileMetadata(const ReadOptions& read_options,
                             ColumnFamilyData* cfd, const std::string& fpath,
                             int level, const FileMetaData& meta);
 
+  // Auto-tune next max size for the current manifest file based on its initial
+  // "compacted" size and other parameters saved in this VersionSet. Must be
+  // holding DB mutex if outside of DB startup.
+  void TuneMaxManifestFileSize();
+
   // Protected by DB mutex.
   WalSet wals_;
 
@@ -1657,6 +1738,9 @@ class VersionSet {
   // The last sequence number of data committed to the descriptor (manifest
   // file).
   SequenceNumber descriptor_last_sequence_ = 0;
+  // See write_prepared_txn.h for a more detailed description of how Write
+  // Prepared transactions work, with concrete examples.
+  //
   // The last seq that is already allocated. It is applicable only when we have
   // two write queues. In that case seq might or might not have appreated in
   // memtable but it is expected to appear in the WAL.
@@ -1682,6 +1766,20 @@ class VersionSet {
   // Current size of manifest file
   uint64_t manifest_file_size_;
 
+  // Size of the populated manifest file last time it was re-written from
+  // scratch.
+  uint64_t last_compacted_manifest_file_size_;
+
+  // Auto-tuned max allowed size for the current manifest file
+  uint64_t tuned_max_manifest_file_size_;
+
+  // Saved copy of max_manifest_file_size in (Mutable)DBOptions
+  uint64_t min_max_manifest_file_size_;
+  // Saved, sanitized copy from (Mutable)DBOptions
+  unsigned max_manifest_space_amp_pct_;
+  // Saved copy from (Mutable)DBOptions
+  size_t manifest_preallocation_size_;
+
   // Obsolete files, or during DB shutdown any files not referenced by what's
   // left of the in-memory LSM state.
   std::vector<ObsoleteFileInfo> obsolete_files_;
@@ -1722,7 +1820,7 @@ class VersionSet {
                            VersionEdit* edit, SequenceNumber* max_last_sequence,
                            InstrumentedMutex* mu);
 
-  const bool read_only_;
+  const bool unchanging_;
   bool closed_;
 };
 
@@ -1734,6 +1832,7 @@ class ReactiveVersionSet : public VersionSet {
  public:
   ReactiveVersionSet(const std::string& dbname,
                      const ImmutableDBOptions* _db_options,
+                     const MutableDBOptions& mutable_db_options,
                      const FileOptions& _file_options, Cache* table_cache,
                      WriteBufferManager* write_buffer_manager,
                      WriteController* write_controller,
diff --git a/db/version_set_test.cc b/db/version_set_test.cc
index c249fa6dafad..a4cf2698c078 100644
--- a/db/version_set_test.cc
+++ b/db/version_set_test.cc
@@ -26,6 +26,7 @@
 #include "test_util/mock_time_env.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
+#include "util/defer.h"
 #include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -55,7 +56,8 @@ class GenerateLevelFilesBriefTest : public testing::Test {
         kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
         kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum,
         kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
-        /* user_defined_timestamps_persisted */ true);
+        /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+        /* max timestamp */ "");
     files_.push_back(f);
   }
 
@@ -171,7 +173,8 @@ class VersionStorageInfoTestBase : public testing::Test {
         kUnknownOldestAncesterTime, kUnknownFileCreationTime,
         kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
         kNullUniqueId64x2, compensated_range_deletion_size, 0,
-        /* user_defined_timestamps_persisted */ true);
+        /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+        /* max timestamp */ "");
     vstorage_.AddFile(level, f);
   }
 
@@ -390,7 +393,8 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) {
   ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
   ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
 
-  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                   /*full_history_ts_low=*/"");
   // Only L0 hits compaction.
   ASSERT_EQ(vstorage_.CompactionScoreLevel(0), 0);
 }
@@ -420,7 +424,8 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) {
   ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
   ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
 
-  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                   /*full_history_ts_low=*/"");
   // Although L2 and l3 have higher unadjusted compaction score, considering
   // a relatively large L0 being compacted down soon, L4 is picked up for
   // compaction.
@@ -452,7 +457,8 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) {
   ASSERT_EQ(2, vstorage_.base_level());
   ASSERT_EQ(20000U, vstorage_.MaxBytesForLevel(2));
 
-  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                   /*full_history_ts_low=*/"");
   // Although L2 has higher unadjusted compaction score, considering
   // a relatively large L0 being compacted down soon, L3 is picked up for
   // compaction.
@@ -482,7 +488,8 @@ TEST_F(VersionStorageInfoTest, DrainUnnecessaryLevel) {
   ASSERT_EQ(1, vstorage_.base_level());
   ASSERT_EQ(1000, vstorage_.MaxBytesForLevel(1));
   ASSERT_EQ(10100, vstorage_.MaxBytesForLevel(3));
-  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                   /*full_history_ts_low=*/"");
 
   // Tests that levels 1 and 3 are eligible for compaction.
   // Levels 1 and 3 are much smaller than target size,
@@ -1158,12 +1165,12 @@ class VersionSetTestBase {
       : env_(nullptr),
         dbname_(test::PerThreadDBPath(name)),
         options_(),
-        db_options_(options_),
+        imm_db_options_(options_),
         cf_options_(options_),
-        immutable_options_(db_options_, cf_options_),
+        immutable_options_(imm_db_options_, cf_options_),
         mutable_cf_options_(cf_options_),
         table_cache_(NewLRUCache(50000, 16)),
-        write_buffer_manager_(db_options_.db_write_buffer_size),
+        write_buffer_manager_(imm_db_options_.db_write_buffer_size),
         shutting_down_(false),
         table_factory_(std::make_shared<mock::MockTableFactory>()) {
     EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_));
@@ -1177,8 +1184,8 @@ class VersionSetTestBase {
     EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr));
 
     options_.env = env_;
-    db_options_.env = env_;
-    db_options_.fs = fs_;
+    imm_db_options_.env = env_;
+    imm_db_options_.fs = fs_;
     immutable_options_.env = env_;
     immutable_options_.fs = fs_;
     immutable_options_.clock = env_->GetSystemClock().get();
@@ -1187,16 +1194,17 @@ class VersionSetTestBase {
     mutable_cf_options_.table_factory = table_factory_;
 
     versions_.reset(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*read_only=*/false));
     reactive_versions_ = std::make_shared<ReactiveVersionSet>(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_, nullptr);
-    db_options_.db_paths.emplace_back(dbname_,
-                                      std::numeric_limits<uint64_t>::max());
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
+        nullptr);
+    imm_db_options_.db_paths.emplace_back(dbname_,
+                                          std::numeric_limits<uint64_t>::max());
   }
 
   virtual ~VersionSetTestBase() {
@@ -1219,7 +1227,7 @@ class VersionSetTestBase {
     ASSERT_OK(
         SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
     VersionEdit new_db;
-    if (db_options_.write_dbid_to_manifest) {
+    if (imm_db_options_.write_dbid_to_manifest) {
       DBOptions tmp_db_options;
       tmp_db_options.env = env_;
       std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
@@ -1344,7 +1352,8 @@ class VersionSetTestBase {
           Temperature::kUnknown, info.oldest_blob_file_number, 0, 0,
           info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
           kNullUniqueId64x2, 0, 0,
-          /* user_defined_timestamps_persisted */ true);
+          /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+          /* max timestamp */ "");
       if (info.file_missing) {
         ASSERT_OK(fs_->DeleteFile(fname, IOOptions(), nullptr));
       }
@@ -1380,8 +1389,8 @@ class VersionSetTestBase {
 
   void ReopenDB() {
     versions_.reset(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*read_only=*/false));
@@ -1470,7 +1479,8 @@ class VersionSetTestBase {
   const std::string dbname_;
   EnvOptions env_options_;
   Options options_;
-  ImmutableDBOptions db_options_;
+  ImmutableDBOptions imm_db_options_;
+  MutableDBOptions mutable_db_options_;
   ColumnFamilyOptions cf_options_;
   ImmutableOptions immutable_options_;
   MutableCFOptions mutable_cf_options_;
@@ -1901,11 +1911,11 @@ TEST_F(VersionSetTest, WalAddition) {
   // Recover a new VersionSet.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 1);
@@ -1969,11 +1979,11 @@ TEST_F(VersionSetTest, WalCloseWithoutSync) {
   // Recover a new VersionSet.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 2);
@@ -2023,11 +2033,11 @@ TEST_F(VersionSetTest, WalDeletion) {
   // Recover a new VersionSet, only the non-closed WAL should show up.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 1);
@@ -2062,11 +2072,11 @@ TEST_F(VersionSetTest, WalDeletion) {
   // Recover from the new MANIFEST, only the non-closed WAL should show up.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 1);
@@ -2183,11 +2193,11 @@ TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) {
   // Recover a new VersionSet, WAL0 is deleted, WAL1 is not.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 1);
@@ -2220,11 +2230,11 @@ TEST_F(VersionSetTest, DeleteAllWals) {
   // Recover a new VersionSet, all WALs are deleted.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 0);
@@ -2263,11 +2273,11 @@ TEST_F(VersionSetTest, AtomicGroupWithWalEdits) {
   // kept.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     std::string db_id;
     ASSERT_OK(
         new_versions->Recover(column_families_, /*read_only=*/false, &db_id));
@@ -2443,11 +2453,11 @@ class VersionSetWithTimestampTest : public VersionSetTest {
 
   void VerifyFullHistoryTsLow(uint64_t expected_ts_low) {
     std::unique_ptr<VersionSet> vset(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false,
                             /*db_id=*/nullptr));
     for (auto* cfd : *(vset->GetColumnFamilySet())) {
@@ -3499,7 +3509,7 @@ class VersionSetTestEmptyDb
                        std::unique_ptr<log::Writer>* log_writer) override {
     assert(nullptr != log_writer);
     VersionEdit new_db;
-    if (db_options_.write_dbid_to_manifest) {
+    if (imm_db_options_.write_dbid_to_manifest) {
       ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_,
                                 Temperature::kUnknown));
       DBOptions tmp_db_options;
@@ -3531,7 +3541,7 @@ class VersionSetTestEmptyDb
 const std::string VersionSetTestEmptyDb::kUnknownColumnFamilyName = "unknown";
 
 TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
-  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
   PrepareManifest(nullptr, nullptr, &log_writer_);
   log_writer_.reset();
   CreateCurrentFile();
@@ -3563,7 +3573,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
 }
 
 TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
-  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
   PrepareManifest(nullptr, nullptr, &log_writer_);
   // Only a subset of column families in the MANIFEST.
   VersionEdit new_cf1;
@@ -3604,7 +3614,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
 }
 
 TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) {
-  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
   PrepareManifest(nullptr, nullptr, &log_writer_);
   // Write all column families but no log_number, next_file_number and
   // last_sequence.
@@ -3650,7 +3660,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) {
 }
 
 TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
-  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
   PrepareManifest(nullptr, nullptr, &log_writer_);
   // Write all column families but no log_number, next_file_number and
   // last_sequence.
@@ -3707,7 +3717,7 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
 }
 
 TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
-  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
   PrepareManifest(nullptr, nullptr, &log_writer_);
   // Write all column families but no log_number, next_file_number and
   // last_sequence.
@@ -3749,6 +3759,8 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
   }
   std::string db_id;
   bool has_missing_table_file = false;
+  SaveAndRestore<bool> override_unchanging(&versions_->TEST_unchanging(),
+                                           read_only);
   s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
                                            read_only, &db_id,
                                            &has_missing_table_file);
@@ -3825,7 +3837,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
     ASSERT_OK(s);
     log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
     VersionEdit new_db;
-    if (db_options_.write_dbid_to_manifest) {
+    if (imm_db_options_.write_dbid_to_manifest) {
       DBOptions tmp_db_options;
       tmp_db_options.env = env_;
       std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
@@ -3935,7 +3947,8 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
         largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
         file_num /* epoch_number */, kUnknownFileChecksum,
         kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
-        /* user_defined_timestamps_persisted */ true);
+        /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+        /* max timestamp */ "");
     added_files.emplace_back(0, meta);
   }
   WriteFileAdditionAndDeletionToManifest(
@@ -3996,7 +4009,8 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
         largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
         file_num /* epoch_number */, kUnknownFileChecksum,
         kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
-        /* user_defined_timestamps_persisted */ true);
+        /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+        /* max timestamp */ "");
     added_files.emplace_back(0, meta);
   }
   WriteFileAdditionAndDeletionToManifest(
@@ -4085,7 +4099,7 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
 }
 
 TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
-  db_options_.allow_2pc = true;
+  imm_db_options_.allow_2pc = true;
   NewDB();
 
   SstInfo sst(100, kDefaultColumnFamilyName, "a", 0 /* level */,
diff --git a/db/version_util.h b/db/version_util.h
index 2690a00f48d9..7219d11c854b 100644
--- a/db/version_util.h
+++ b/db/version_util.h
@@ -1,4 +1,4 @@
-//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
 //  This source code is licensed under both the GPLv2 (found in the
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
@@ -23,7 +23,8 @@ class OfflineManifestWriter {
         immutable_db_options_(WithDbPath(options, db_path)),
         tc_(NewLRUCache(1 << 20 /* capacity */,
                         options.table_cache_numshardbits)),
-        versions_(db_path, &immutable_db_options_, sopt_, tc_.get(), &wb_, &wc_,
+        versions_(db_path, &immutable_db_options_, MutableDBOptions{options},
+                  sopt_, tc_.get(), &wb_, &wc_,
                   /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                   /*db_id=*/"", /*db_session_id=*/"",
                   options.daily_offpeak_time_utc,
diff --git a/db/wal_manager.cc b/db/wal_manager.cc
index 60e85567be4a..67582c80552f 100644
--- a/db/wal_manager.cc
+++ b/db/wal_manager.cc
@@ -192,7 +192,13 @@ void WalManager::PurgeObsoleteWALFiles() {
                          s.ToString().c_str());
           continue;
         }
-        if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) {
+
+        // Avoid expression `now_seconds - file_m_time` when
+        // `file_m_time > now_seconds` to prevent unsigned underflow in case
+        // system clock goes backwards. Both timestamps are based on wall clock
+        // time, which is not guaranteed to be monotonic.
+        if (file_m_time <= now_seconds &&
+            now_seconds - file_m_time > db_options_.WAL_ttl_seconds) {
           s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
                            /*force_fg=*/!wal_in_db_path_);
           if (!s.ok()) {
@@ -283,6 +289,7 @@ void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) {
   // The sync point below is used in (DBTest,TransactionLogIteratorRace)
   TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1");
   Status s = env_->RenameFile(fname, archived_log_name);
+  IGNORE_STATUS_IF_ERROR(s);
   // The sync point below is used in (DBTest,TransactionLogIteratorRace)
   TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2");
   // The sync point below is used in
diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc
index 5b5ba7c0a872..e674e7b778c9 100644
--- a/db/wal_manager_test.cc
+++ b/db/wal_manager_test.cc
@@ -19,6 +19,7 @@
 #include "rocksdb/write_batch.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/mock_table.h"
+#include "test_util/mock_time_env.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
 #include "util/string_util.h"
@@ -39,7 +40,7 @@ class WalManagerTest : public testing::Test {
     EXPECT_OK(DestroyDB(dbname_, Options()));
   }
 
-  void Init() {
+  void Init(SystemClock* clock_override) {
     ASSERT_OK(env_->CreateDirIfMissing(dbname_));
     ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_)));
     db_options_.db_paths.emplace_back(dbname_,
@@ -47,11 +48,15 @@ class WalManagerTest : public testing::Test {
     db_options_.wal_dir = dbname_;
     db_options_.env = env_.get();
     db_options_.fs = env_->GetFileSystem();
-    db_options_.clock = env_->GetSystemClock().get();
+    if (clock_override == nullptr) {
+      db_options_.clock = env_->GetSystemClock().get();
+    } else {
+      db_options_.clock = clock_override;
+    }
 
     versions_.reset(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &db_options_, MutableDBOptions{}, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*read_only=*/false));
@@ -124,7 +129,7 @@ class WalManagerTest : public testing::Test {
 };
 
 TEST_F(WalManagerTest, ReadFirstRecordCache) {
-  Init();
+  Init(nullptr /* clock_override */);
   std::string path = dbname_ + "/000001.log";
   std::unique_ptr<FSWritableFile> file;
   ASSERT_OK(env_->GetFileSystem()->NewWritableFile(path, FileOptions(), &file,
@@ -221,7 +226,7 @@ int CountRecords(TransactionLogIterator* iter) {
 TEST_F(WalManagerTest, WALArchivalSizeLimit) {
   db_options_.WAL_ttl_seconds = 0;
   db_options_.WAL_size_limit_MB = 1000;
-  Init();
+  Init(nullptr /* clock_override */);
 
   // TEST : Create WalManager with huge size limit and no ttl.
   // Create some archived files and call PurgeObsoleteWALFiles().
@@ -258,7 +263,7 @@ TEST_F(WalManagerTest, WALArchivalSizeLimit) {
 
 TEST_F(WalManagerTest, WALArchivalTtl) {
   db_options_.WAL_ttl_seconds = 1000;
-  Init();
+  Init(nullptr /* clock_override */);
 
   // TEST : Create WalManager with a ttl and no size limit.
   // Create some archived log files and call PurgeObsoleteWALFiles().
@@ -282,8 +287,41 @@ TEST_F(WalManagerTest, WALArchivalTtl) {
   ASSERT_TRUE(log_files.empty());
 }
 
+TEST_F(WalManagerTest, WALArchivalTtlClockGoesBackwards) {
+  // This test used to trigger an unsigned underflow bug, where WAL files were
+  // incorrectly deleted when the system time moved backwards between writing
+  // to a WAL and running `WalManager::PurgeObsoleteWALFiles()`.
+  constexpr int kNumLogs = 5;
+  constexpr int kEntriesPerLog = 100;
+
+  db_options_.WAL_ttl_seconds = 86400;  // One day
+
+  // Configure mock clock to lag one second behind system time. That way, the
+  // WAL file's mtime will appear to be in the future when
+  // `WalManager::PurgeObsoleteWALFiles()` runs.
+  int64_t now_seconds;
+  ASSERT_OK(env_->GetSystemClock()->GetCurrentTime(&now_seconds));
+  auto mock_clock = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+  mock_clock->SetCurrentTime(static_cast<uint64_t>(now_seconds - 1));
+  db_options_.clock = mock_clock.get();
+
+  Init(mock_clock.get() /* clock */);
+
+  CreateArchiveLogs(kNumLogs, kEntriesPerLog);
+
+  const std::string archive_dir = ArchivalDirectory(dbname_);
+  ASSERT_EQ(kNumLogs,
+            ListSpecificFiles(env_.get(), archive_dir, kWalFile).size());
+
+  wal_manager_->PurgeObsoleteWALFiles();
+
+  // All files must still be present because TTL has not elapsed.
+  ASSERT_EQ(kNumLogs,
+            ListSpecificFiles(env_.get(), archive_dir, kWalFile).size());
+}
+
 TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) {
-  Init();
+  Init(nullptr /* clock_override */);
   RollTheLog(false);
   Put("key1", std::string(1024, 'a'));
   // Create a zero record WAL file.
@@ -297,7 +335,7 @@ TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) {
 }
 
 TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
-  Init();
+  Init(nullptr /* clock_override */);
   RollTheLog(false);
   auto iter = OpenTransactionLogIter(0);
   // Check that an empty iterator is returned
@@ -305,7 +343,7 @@ TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
 }
 
 TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) {
-  Init();
+  Init(nullptr /* clock_override */);
   CreateArchiveLogs(2, 100);
   auto iter = OpenTransactionLogIter(0);
   CreateArchiveLogs(1, 100);
diff --git a/db/wide/db_wide_basic_test.cc b/db/wide/db_wide_basic_test.cc
index 886f71d7452f..5c46c3c6443f 100644
--- a/db/wide/db_wide_basic_test.cc
+++ b/db/wide/db_wide_basic_test.cc
@@ -714,7 +714,7 @@ TEST_F(DBWideBasicTest, MergePlainKeyValue) {
     // snapshot in between to make sure they do not get reconciled during the
     // subsequent flush)
     write_base();
-    ManagedSnapshot snapshot(db_);
+    ManagedSnapshot snapshot(db_.get());
     write_merge();
     verify();
 
@@ -958,7 +958,7 @@ TEST_F(DBWideBasicTest, MergeEntity) {
     // between to make sure they do not get reconciled during the subsequent
     // flush)
     write_base();
-    ManagedSnapshot snapshot(db_);
+    ManagedSnapshot snapshot(db_.get());
     write_merge();
     verify_basic();
     verify_merge_ops_pre_compaction();
@@ -1033,7 +1033,7 @@ class DBWideMergeV3Test : public DBWideBasicTest {
                              third_key,
                              third_columns));  // wide-column base value
 
-    snapshots_.emplace_back(db_);
+    snapshots_.emplace_back(db_.get());
 
     // First round of merge operands
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
@@ -1043,7 +1043,7 @@ class DBWideMergeV3Test : public DBWideBasicTest {
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key,
                          third_merge_op1));
 
-    snapshots_.emplace_back(db_);
+    snapshots_.emplace_back(db_.get());
 
     // Second round of merge operands
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
@@ -1053,7 +1053,7 @@ class DBWideMergeV3Test : public DBWideBasicTest {
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key,
                          third_merge_op2));
 
-    snapshots_.emplace_back(db_);
+    snapshots_.emplace_back(db_.get());
   }
 
   void VerifyKeyValues(const WideColumns& first_expected,
diff --git a/db/wide/wide_column_serialization.cc b/db/wide/wide_column_serialization.cc
index 0366a5db977d..8371b7cbbd30 100644
--- a/db/wide/wide_column_serialization.cc
+++ b/db/wide/wide_column_serialization.cc
@@ -5,10 +5,12 @@
 
 #include "db/wide/wide_column_serialization.h"
 
-#include <algorithm>
 #include <cassert>
-#include <limits>
+#include <cstring>
 
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
 #include "db/wide/wide_columns_helper.h"
 #include "rocksdb/slice.h"
 #include "util/autovector.h"
@@ -16,15 +18,46 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+Status WideColumnSerialization::BuildBlobIndexMap(
+    size_t num_columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+    std::vector<const BlobIndex*>& blob_index_map) {
+  if (Status s = ValidateWideColumnLimit(num_columns, "Too many wide columns");
+      !s.ok()) {
+    return s;
+  }
+
+  blob_index_map.assign(num_columns, nullptr);
+  for (const auto& blob_col : blob_columns) {
+    if (blob_col.first >= blob_index_map.size()) {
+      return Status::InvalidArgument("Blob column index out of range");
+    }
+    blob_index_map[blob_col.first] = &blob_col.second;
+  }
+
+  return Status::OK();
+}
+
+bool WideColumnSerialization::ContainsBlobType(const char* type_bytes,
+                                               uint32_t num_columns) {
+  for (uint32_t i = 0; i < num_columns; ++i) {
+    if (static_cast<uint8_t>(type_bytes[i]) == kTypeBlobIndex) {
+      return true;
+    }
+  }
+  return false;
+}
+
 Status WideColumnSerialization::Serialize(const WideColumns& columns,
                                           std::string& output) {
   const size_t num_columns = columns.size();
 
-  if (num_columns > static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
-    return Status::InvalidArgument("Too many wide columns");
+  if (Status sv = ValidateWideColumnLimit(num_columns, "Too many wide columns");
+      !sv.ok()) {
+    return sv;
   }
 
-  PutVarint32(&output, kCurrentVersion);
+  PutVarint32(&output, kVersion1);
 
   PutVarint32(&output, static_cast<uint32_t>(num_columns));
 
@@ -34,19 +67,23 @@ Status WideColumnSerialization::Serialize(const WideColumns& columns,
     const WideColumn& column = columns[i];
 
     const Slice& name = column.name();
-    if (name.size() >
-        static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
-      return Status::InvalidArgument("Wide column name too long");
+    if (Status s_name =
+            ValidateWideColumnLimit(name.size(), "Wide column name too long");
+        !s_name.ok()) {
+      return s_name;
     }
 
-    if (prev_name && prev_name->compare(name) >= 0) {
-      return Status::Corruption("Wide columns out of order");
+    if (prev_name) {
+      if (Status so = ValidateColumnOrder(*prev_name, name); !so.ok()) {
+        return so;
+      }
     }
 
     const Slice& value = column.value();
-    if (value.size() >
-        static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
-      return Status::InvalidArgument("Wide column value too long");
+    if (Status s_val =
+            ValidateWideColumnLimit(value.size(), "Wide column value too long");
+        !s_val.ok()) {
+      return s_val;
     }
 
     PutLengthPrefixedSlice(&output, name);
@@ -64,28 +101,151 @@ Status WideColumnSerialization::Serialize(const WideColumns& columns,
   return Status::OK();
 }
 
-Status WideColumnSerialization::Deserialize(Slice& input,
-                                            WideColumns& columns) {
-  assert(columns.empty());
-
-  uint32_t version = 0;
-  if (!GetVarint32(&input, &version)) {
-    return Status::Corruption("Error decoding wide column version");
+template <typename GetName, typename GetValue>
+Status WideColumnSerialization::SerializeV2Impl(
+    size_t num_columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+    std::string& output, GetName get_name, GetValue get_value) {
+  std::vector<const BlobIndex*> blob_index_map;
+  if (Status s = BuildBlobIndexMap(num_columns, blob_columns, blob_index_map);
+      !s.ok()) {
+    return s;
   }
+  assert(blob_index_map.size() == num_columns);
+
+  // First pass: validate column ordering, compute sizes, serialize blob
+  // indices, and build column types.
+  std::vector<std::string> serialized_blob_indices(num_columns);
+  std::vector<uint32_t> name_sizes(num_columns);
+  std::vector<uint32_t> value_sizes(num_columns);
+  std::string column_types;
+  column_types.reserve(num_columns);
+
+  Slice prev_name_storage;
+  bool has_prev = false;
+  uint32_t name_sizes_bytes = 0;
+  uint32_t names_bytes = 0;
+  uint32_t total_value_sizes_bytes = 0;
+  uint32_t total_values_bytes = 0;
+
+  for (size_t i = 0; i < num_columns; ++i) {
+    const Slice name = get_name(i);
+    const Slice value = get_value(i);
+
+    if (Status sn =
+            ValidateWideColumnLimit(name.size(), "Wide column name too long");
+        !sn.ok()) {
+      return sn;
+    }
 
-  if (version > kCurrentVersion) {
-    return Status::NotSupported("Unsupported wide column version");
-  }
+    if (has_prev) {
+      if (Status so = ValidateColumnOrder(prev_name_storage, name); !so.ok()) {
+        return so;
+      }
+    }
 
-  uint32_t num_columns = 0;
-  if (!GetVarint32(&input, &num_columns)) {
-    return Status::Corruption("Error decoding number of wide columns");
+    name_sizes[i] = static_cast<uint32_t>(name.size());
+    name_sizes_bytes += VarintLength(name_sizes[i]);
+    names_bytes += name_sizes[i];
+
+    if (blob_index_map[i] != nullptr) {
+      const BlobIndex* blob_idx = blob_index_map[i];
+      blob_idx->EncodeTo(&serialized_blob_indices[i]);
+      value_sizes[i] = static_cast<uint32_t>(serialized_blob_indices[i].size());
+      column_types.push_back(static_cast<char>(kTypeBlobIndex));
+    } else {
+      if (Status svl = ValidateWideColumnLimit(value.size(),
+                                               "Wide column value too long");
+          !svl.ok()) {
+        return svl;
+      }
+      value_sizes[i] = static_cast<uint32_t>(value.size());
+      column_types.push_back(static_cast<char>(kTypeValue));
+    }
+
+    total_value_sizes_bytes += VarintLength(value_sizes[i]);
+    total_values_bytes += value_sizes[i];
+
+    prev_name_storage = name;
+    has_prev = true;
   }
 
-  if (!num_columns) {
+  // Second pass: write all V2 sections to output.
+  // Pre-allocate output string.
+  const size_t total_size =
+      VarintLength(kVersion2) +
+      VarintLength(static_cast<uint32_t>(num_columns)) +
+      num_columns +  // column types
+      VarintLength(name_sizes_bytes) + VarintLength(total_value_sizes_bytes) +
+      VarintLength(names_bytes) + name_sizes_bytes + total_value_sizes_bytes +
+      names_bytes + total_values_bytes;
+
+  const size_t base_offset = output.size();
+  output.reserve(base_offset + total_size);
+
+  // Sections 1-3: header, skip info, column types
+  PutVarint32(&output, kVersion2);
+  PutVarint32(&output, static_cast<uint32_t>(num_columns));
+  PutVarint32(&output, name_sizes_bytes);
+  PutVarint32(&output, total_value_sizes_bytes);
+  PutVarint32(&output, names_bytes);
+  output.append(column_types);
+
+  // Sections 4-7: resize to final size, then write all 4 sections in a
+  // single loop using independent pointers. Each section's start offset is
+  // known from the sizes computed in the first pass.
+  if (num_columns == 0) {
     return Status::OK();
   }
 
+  const size_t sec4_offset = output.size();
+  output.resize(base_offset + total_size);
+
+  char* s4 = &output[sec4_offset];          // section 4: name sizes
+  char* s5 = s4 + name_sizes_bytes;         // section 5: value sizes
+  char* s6 = s5 + total_value_sizes_bytes;  // section 6: names
+  char* s7 = s6 + names_bytes;              // section 7: values
+
+  for (size_t i = 0; i < num_columns; ++i) {
+    s4 = EncodeVarint32(s4, name_sizes[i]);
+    s5 = EncodeVarint32(s5, value_sizes[i]);
+
+    memcpy(s6, get_name(i).data(), name_sizes[i]);
+    s6 += name_sizes[i];
+
+    if (blob_index_map[i] != nullptr) {
+      memcpy(s7, serialized_blob_indices[i].data(), value_sizes[i]);
+    } else {
+      memcpy(s7, get_value(i).data(), value_sizes[i]);
+    }
+    s7 += value_sizes[i];
+  }
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::SerializeV2(
+    const std::vector<std::pair<std::string, std::string>>& columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+    std::string& output) {
+  return SerializeV2Impl(
+      columns.size(), blob_columns, output,
+      [&](size_t i) { return Slice(columns[i].first); },
+      [&](size_t i) { return Slice(columns[i].second); });
+}
+
+Status WideColumnSerialization::SerializeV2(
+    const WideColumns& columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+    std::string& output) {
+  return SerializeV2Impl(
+      columns.size(), blob_columns, output,
+      [&](size_t i) { return columns[i].name(); },
+      [&](size_t i) { return columns[i].value(); });
+}
+
+Status WideColumnSerialization::DeserializeV1(
+    Slice& input, uint32_t num_columns, std::vector<WideColumn>& columns) {
   columns.reserve(num_columns);
 
   autovector<uint32_t, 16> column_value_sizes;
@@ -97,8 +257,11 @@ Status WideColumnSerialization::Deserialize(Slice& input,
       return Status::Corruption("Error decoding wide column name");
     }
 
-    if (!columns.empty() && columns.back().name().compare(name) >= 0) {
-      return Status::Corruption("Wide columns out of order");
+    if (!columns.empty()) {
+      if (Status so = ValidateColumnOrder(columns.back().name(), name);
+          !so.ok()) {
+        return so;
+      }
     }
 
     columns.emplace_back(name, Slice());
@@ -129,12 +292,324 @@ Status WideColumnSerialization::Deserialize(Slice& input,
   return Status::OK();
 }
 
+Status WideColumnSerialization::DeserializeV2Impl(
+    Slice& input, uint32_t num_columns, std::vector<WideColumn>& columns,
+    std::vector<ValueType>& column_types) {
+  // Section 2: SKIP INFO (3 varints)
+  uint32_t name_sizes_bytes = 0;
+  uint32_t value_sizes_bytes = 0;
+  uint32_t names_bytes = 0;
+  if (!GetVarint32(&input, &name_sizes_bytes)) {
+    return Status::Corruption("Error decoding wide column name sizes bytes");
+  }
+  if (!GetVarint32(&input, &value_sizes_bytes)) {
+    return Status::Corruption("Error decoding wide column value sizes bytes");
+  }
+  if (!GetVarint32(&input, &names_bytes)) {
+    return Status::Corruption("Error decoding wide column names bytes");
+  }
+
+  // Section 3: COLUMN TYPES (N bytes, each is a ValueType)
+  if (input.size() < num_columns) {
+    return Status::Corruption("Error decoding wide column types");
+  }
+  column_types.resize(num_columns);
+  for (uint32_t i = 0; i < num_columns; ++i) {
+    column_types[i] = static_cast<ValueType>(input[i]);
+    if (!IsValidColumnValueType(column_types[i])) {
+      return Status::Corruption("Unsupported wide column ValueType");
+    }
+  }
+  input.remove_prefix(num_columns);
+
+  // Validate that sections 4-6 fit in the remaining input
+  const size_t metadata_size =
+      name_sizes_bytes + value_sizes_bytes + names_bytes;
+  if (input.size() < metadata_size) {
+    return Status::Corruption("Error decoding wide column sections");
+  }
+
+  // Set up 4 pointers into sections 4-7 for single-loop parsing.
+  // Skip info gives us exact boundaries for each section.
+  const char* s4 = input.data();  // section 4: name sizes
+  const char* s4_limit = s4 + name_sizes_bytes;
+  const char* s5 = s4_limit;  // section 5: value sizes
+  const char* s5_limit = s5 + value_sizes_bytes;
+  const char* s6 = s5_limit;          // section 6: names
+  const char* s7 = s6 + names_bytes;  // section 7: values
+  const char* input_end = input.data() + input.size();
+
+  columns.reserve(num_columns);
+  size_t name_pos = 0;
+  size_t value_pos = 0;
+
+  for (uint32_t i = 0; i < num_columns; ++i) {
+    // Decode name size from section 4
+    uint32_t ns = 0;
+    const char* s4_next = GetVarint32Ptr(s4, s4_limit, &ns);
+    if (s4_next == nullptr) {
+      return Status::Corruption("Error decoding wide column name size");
+    }
+    s4 = s4_next;
+
+    // Decode value size from section 5
+    uint32_t vs = 0;
+    const char* s5_next = GetVarint32Ptr(s5, s5_limit, &vs);
+    if (s5_next == nullptr) {
+      return Status::Corruption("Error decoding wide column value size");
+    }
+    s5 = s5_next;
+
+    // Read name from section 6
+    if (name_pos + ns > names_bytes) {
+      return Status::Corruption("Error decoding wide column name");
+    }
+    Slice name(s6 + name_pos, ns);
+
+    if (!columns.empty()) {
+      if (Status so = ValidateColumnOrder(columns.back().name(), name);
+          !so.ok()) {
+        return so;
+      }
+    }
+
+    // Read value from section 7
+    if (s7 + value_pos + vs > input_end) {
+      return Status::Corruption("Error decoding wide column value payload");
+    }
+
+    columns.emplace_back(name, Slice(s7 + value_pos, vs));
+    name_pos += ns;
+    value_pos += vs;
+  }
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::Deserialize(Slice& input,
+                                            WideColumns& columns) {
+  assert(columns.empty());
+
+  // Reuse DeserializeV2, then reject any blob references.
+  std::vector<std::pair<size_t, BlobIndex>> blob_columns;
+  if (Status s = DeserializeV2(input, columns, blob_columns); !s.ok()) {
+    return s;
+  }
+
+  if (!blob_columns.empty()) {
+    return Status::NotSupported(
+        "Wide column contains blob references. Use DeserializeV2.");
+  }
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::DeserializeV2(
+    Slice& input, std::vector<WideColumn>& columns,
+    std::vector<std::pair<size_t, BlobIndex>>& blob_columns) {
+  assert(columns.empty());
+  assert(blob_columns.empty());
+
+  uint32_t version = 0;
+  if (!GetVarint32(&input, &version)) {
+    return Status::Corruption("Error decoding wide column version");
+  }
+
+  if (version > kVersion2) {
+    return Status::NotSupported("Unsupported wide column version");
+  }
+
+  uint32_t num_columns = 0;
+  if (!GetVarint32(&input, &num_columns)) {
+    return Status::Corruption("Error decoding number of wide columns");
+  }
+
+  if (!num_columns) {
+    return Status::OK();
+  }
+
+  if (version >= kVersion2) {
+    // V2 layout: parse columns and extract blob column info
+    std::vector<ValueType> column_types;
+
+    if (Status s = DeserializeV2Impl(input, num_columns, columns, column_types);
+        !s.ok()) {
+      return s;
+    }
+    assert(column_types.size() == num_columns);
+    assert(columns.size() == num_columns);
+
+    // Decode blob indices from value data
+    for (uint32_t i = 0; i < num_columns; ++i) {
+      if (column_types[i] == kTypeBlobIndex) {
+        BlobIndex blob_idx;
+        Slice blob_slice = columns[i].value();
+        if (Status bs = blob_idx.DecodeFrom(blob_slice); !bs.ok()) {
+          return Status::Corruption("Error decoding blob index in wide column");
+        }
+        blob_columns.emplace_back(i, blob_idx);
+      }
+    }
+  } else {
+    return DeserializeV1(input, num_columns, columns);
+  }
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::HasBlobColumns(const Slice& input,
+                                               bool& has_blob_columns) {
+  has_blob_columns = false;
+
+  Slice input_ref = input;
+
+  uint32_t version = 0;
+  if (!GetVarint32(&input_ref, &version)) {
+    return Status::Corruption("Error decoding wide column version");
+  }
+
+  // Version 1 never has blob columns
+  if (version < kVersion2) {
+    return Status::OK();
+  }
+
+  uint32_t num_columns = 0;
+  if (!GetVarint32(&input_ref, &num_columns)) {
+    return Status::Corruption("Error decoding number of wide columns");
+  }
+
+  if (!num_columns) {
+    return Status::OK();
+  }
+
+  // V2: Skip over SKIP INFO (3 varints) to reach COLUMN TYPES section.
+  uint32_t unused_name_sizes_bytes = 0;
+  uint32_t unused_value_sizes_bytes = 0;
+  uint32_t unused_names_bytes = 0;
+  if (!GetVarint32(&input_ref, &unused_name_sizes_bytes) ||
+      !GetVarint32(&input_ref, &unused_value_sizes_bytes) ||
+      !GetVarint32(&input_ref, &unused_names_bytes)) {
+    return Status::Corruption("Error decoding wide column skip info");
+  }
+  if (input_ref.size() < num_columns) {
+    return Status::Corruption("Error decoding wide column types");
+  }
+  has_blob_columns = ContainsBlobType(input_ref.data(), num_columns);
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::GetVersion(const Slice& input,
+                                           uint32_t& version) {
+  Slice input_ref = input;
+
+  version = 0;
+  if (!GetVarint32(&input_ref, &version)) {
+    return Status::Corruption("Error decoding wide column version");
+  }
+
+  return Status::OK();
+}
+
 Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input,
                                                         Slice& value) {
+  Slice input_ref = input;
+
+  uint32_t version = 0;
+  if (!GetVarint32(&input_ref, &version)) {
+    return Status::Corruption("Error decoding wide column version");
+  }
+
+  if (version > kVersion2) {
+    return Status::NotSupported("Unsupported wide column version");
+  }
+
+  uint32_t num_columns = 0;
+  if (!GetVarint32(&input_ref, &num_columns)) {
+    return Status::Corruption("Error decoding number of wide columns");
+  }
+
+  if (!num_columns) {
+    value.clear();
+    return Status::OK();
+  }
+
+  if (version >= kVersion2) {
+    // V2 fast path: use skip info to jump directly to values without
+    // scanning through variable-length sections.
+
+    // Read SKIP INFO (3 varints, immediately after header)
+    uint32_t name_sizes_bytes = 0;
+    uint32_t value_sizes_bytes = 0;
+    uint32_t names_bytes = 0;
+    if (!GetVarint32(&input_ref, &name_sizes_bytes)) {
+      return Status::Corruption("Error decoding wide column name sizes bytes");
+    }
+    if (!GetVarint32(&input_ref, &value_sizes_bytes)) {
+      return Status::Corruption("Error decoding wide column value sizes bytes");
+    }
+    if (!GetVarint32(&input_ref, &names_bytes)) {
+      return Status::Corruption("Error decoding wide column names bytes");
+    }
+
+    // Read COLUMN TYPES (N bytes)
+    if (input_ref.size() < num_columns) {
+      return Status::Corruption("Error decoding wide column types");
+    }
+    // Check if default column (index 0) is a blob reference
+    if (static_cast<uint8_t>(input_ref[0]) == kTypeBlobIndex) {
+      return Status::NotSupported(
+          "Wide column contains blob references. Use DeserializeV2.");
+    }
+    input_ref.remove_prefix(num_columns);
+
+    // Peek first name size from NAME SIZES section
+    if (input_ref.size() < name_sizes_bytes) {
+      return Status::Corruption("Error decoding wide column name sizes");
+    }
+    Slice name_sizes_section(input_ref.data(), name_sizes_bytes);
+    uint32_t first_name_size = 0;
+    if (!GetVarint32(&name_sizes_section, &first_name_size)) {
+      return Status::Corruption("Error decoding wide column name size");
+    }
+    input_ref.remove_prefix(name_sizes_bytes);
+
+    // Peek first value size from VALUE SIZES section
+    if (input_ref.size() < value_sizes_bytes) {
+      return Status::Corruption("Error decoding wide column value sizes");
+    }
+    Slice value_sizes_section(input_ref.data(), value_sizes_bytes);
+    uint32_t first_value_size = 0;
+    if (!GetVarint32(&value_sizes_section, &first_value_size)) {
+      return Status::Corruption("Error decoding wide column value size");
+    }
+    // Skip entire VALUE SIZES section using value_sizes_bytes
+    input_ref.remove_prefix(value_sizes_bytes);
+
+    // Check if the first column is the default column (empty name)
+    if (first_name_size != 0) {
+      value.clear();
+      return Status::OK();
+    }
+
+    // Skip NAMES section
+    if (input_ref.size() < names_bytes) {
+      return Status::Corruption("Error decoding wide column names");
+    }
+    input_ref.remove_prefix(names_bytes);
+
+    // Read the first value from VALUES section
+    if (input_ref.size() < first_value_size) {
+      return Status::Corruption("Error decoding wide column value payload");
+    }
+    value = Slice(input_ref.data(), first_value_size);
+    return Status::OK();
+  }
+
+  // V1 fallback: full deserialization
   WideColumns columns;
 
-  const Status s = Deserialize(input, columns);
-  if (!s.ok()) {
+  if (Status s = Deserialize(input, columns); !s.ok()) {
     return s;
   }
 
@@ -148,4 +623,145 @@ Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input,
   return Status::OK();
 }
 
+Status WideColumnSerialization::ResolveEntityBlobColumns(
+    const Slice& entity_value, const Slice& user_key,
+    const BlobFetcher* blob_fetcher, PrefetchBufferCollection* prefetch_buffers,
+    std::string& resolved_entity, bool& resolved, uint64_t* total_bytes_read,
+    uint64_t* num_blobs_resolved) {
+  assert(blob_fetcher);
+
+  resolved = false;
+
+  std::vector<WideColumn> columns;
+  std::vector<std::pair<size_t, BlobIndex>> blob_columns;
+
+  Slice input_copy = entity_value;
+  if (Status s = DeserializeV2(input_copy, columns, blob_columns); !s.ok()) {
+    return s;
+  }
+
+  if (blob_columns.empty()) {
+    return Status::OK();
+  }
+
+  resolved = true;
+
+  // Fetch each blob value
+  std::vector<std::string> resolved_blob_values;
+  resolved_blob_values.reserve(blob_columns.size());
+
+  for (const auto& blob_col : blob_columns) {
+    const BlobIndex& blob_idx = blob_col.second;
+
+    if (blob_idx.IsInlined()) {
+      resolved_blob_values.emplace_back(blob_idx.value().data(),
+                                        blob_idx.value().size());
+      continue;
+    }
+
+    FilePrefetchBuffer* prefetch_buffer =
+        prefetch_buffers ? prefetch_buffers->GetOrCreatePrefetchBuffer(
+                               blob_idx.file_number())
+                         : nullptr;
+
+    uint64_t bytes_read = 0;
+
+    PinnableSlice blob_value;
+    const Status fetch_s = blob_fetcher->FetchBlob(
+        user_key, blob_idx, prefetch_buffer, &blob_value, &bytes_read);
+    if (!fetch_s.ok()) {
+      return fetch_s;
+    }
+
+    resolved_blob_values.emplace_back(blob_value.data(), blob_value.size());
+
+    if (total_bytes_read) {
+      *total_bytes_read += bytes_read;
+    }
+  }
+
+  if (num_blobs_resolved) {
+    *num_blobs_resolved += blob_columns.size();
+  }
+
+  return SerializeResolvedEntity(columns, blob_columns, resolved_blob_values,
+                                 resolved_entity);
+}
+
+Status WideColumnSerialization::GetValueOfDefaultColumnResolvingBlobs(
+    const Slice& entity_value, const Slice& user_key,
+    const BlobFetcher* blob_fetcher, PinnableSlice& result, bool& resolved) {
+  assert(blob_fetcher);
+
+  resolved = false;
+
+  std::vector<WideColumn> columns;
+  std::vector<std::pair<size_t, BlobIndex>> blob_columns;
+
+  Slice input_copy = entity_value;
+  if (Status s = DeserializeV2(input_copy, columns, blob_columns); !s.ok()) {
+    return s;
+  }
+
+  // The default column (empty name) is always at index 0 when present
+  // (columns are sorted by name).
+  if (columns.empty() || columns[0].name() != kDefaultWideColumnName) {
+    result.PinSelf(Slice());
+    return Status::OK();
+  }
+
+  // Check if the default column (index 0) is a blob reference
+  for (const auto& blob_col : blob_columns) {
+    if (blob_col.first == 0) {
+      const BlobIndex& blob_idx = blob_col.second;
+
+      resolved = true;
+
+      if (blob_idx.IsInlined()) {
+        result.PinSelf(blob_idx.value());
+        return Status::OK();
+      }
+
+      return blob_fetcher->FetchBlob(user_key, blob_idx,
+                                     nullptr /* prefetch_buffer */, &result,
+                                     nullptr /* bytes_read */);
+    }
+  }
+
+  // Default column is inline
+  result.PinSelf(columns[0].value());
+  return Status::OK();
+}
+
+Status WideColumnSerialization::SerializeResolvedEntity(
+    const std::vector<WideColumn>& columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+    const std::vector<std::string>& resolved_blob_values, std::string& output) {
+  assert(blob_columns.size() == resolved_blob_values.size());
+
+  // blob_columns is sorted by column index and typically small, so use a
+  // linear scan with a cursor instead of an unordered_map.
+  size_t blob_cursor = 0;
+
+  // Build result columns with resolved blob values
+  WideColumns result_columns;
+  result_columns.reserve(columns.size());
+
+  for (size_t i = 0; i < columns.size(); ++i) {
+    if (blob_cursor < blob_columns.size() &&
+        blob_columns[blob_cursor].first == i) {
+      // This is a blob column - use the resolved value
+      result_columns.emplace_back(columns[i].name(),
+                                  Slice(resolved_blob_values[blob_cursor]));
+      ++blob_cursor;
+    } else {
+      // This is an inline column - use the original value
+      result_columns.emplace_back(columns[i].name(), columns[i].value());
+    }
+  }
+
+  // Serialize using V1 format (all values inline)
+  return Serialize(result_columns, output);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/wide/wide_column_serialization.h b/db/wide/wide_column_serialization.h
index 4a97f6a78690..0a819907ae7e 100644
--- a/db/wide/wide_column_serialization.h
+++ b/db/wide/wide_column_serialization.h
@@ -6,18 +6,28 @@
 #pragma once
 
 #include <cstdint>
+#include <limits>
 #include <string>
+#include <utility>
+#include <vector>
 
+#include "db/dbformat.h"
 #include "rocksdb/rocksdb_namespace.h"
 #include "rocksdb/status.h"
 #include "rocksdb/wide_columns.h"
 
 namespace ROCKSDB_NAMESPACE {
 
+class BlobFetcher;
+class BlobIndex;
+class FilePrefetchBuffer;
+class PinnableSlice;
+class PrefetchBufferCollection;
 class Slice;
 
 // Wide-column serialization/deserialization primitives.
 //
+// Version 1 Layout:
 // The two main parts of the layout are 1) a sorted index containing the column
 // names and column value sizes and 2) the column values themselves. Keeping the
 // index and the values separate will enable selectively reading column values
@@ -40,16 +50,224 @@ class Slice;
 //          ...---+----------+-------+----------+-------+---...---+-------+
 //                | varint32 | bytes | varint32 | bytes |         | bytes |
 //          ...---+----------+-------+----------+-------+---...---+-------+
+//
+// Version 2 Layout (with blob index support):
+// Groups all metadata upfront before variable-length data. This enables
+// efficient access patterns: index-based value access skips name data
+// entirely, default column access is O(1), and type checks are O(1).
+//
+// Legend: cn = column name, cv = column value, cns = column name size,
+//         cvs = column value size, ct = column type.
+//
+// Section 1: HEADER (2 varints)
+//   +----------+--------------+
+//   | version  | # of columns |
+//   | varint32 |   varint32   |
+//   +----------+--------------+
+//
+// Section 2: SKIP INFO (3 varints)
+//   +-------------------+---------------------+------------------+
+//   | name_sizes_bytes  | value_sizes_bytes   | names_bytes      |
+//   | varint32          | varint32            | varint32         |
+//   +-------------------+---------------------+------------------+
+//   name_sizes_bytes  = byte size of NAME SIZES section (section 4)
+//   value_sizes_bytes = byte size of VALUE SIZES section (section 5)
+//   names_bytes       = byte size of NAMES section (section 6)
+//
+//   Placed immediately after the header so that header + skip info form
+//   a contiguous varint sequence (5 varints), enabling future SIMD-based
+//   varint decoding.
+//
+// Section 3: COLUMN TYPES (N bytes, fixed-size)
+//   +------+------+---...---+--------+
+//   | ct_0 | ct_1 |         | ct_N-1 |
+//   | byte | byte |         |  byte  |
+//   +------+------+---...---+--------+
+//   ct values are ValueType entries from db/dbformat.h, e.g.:
+//     kTypeValue (0x01) = inline value
+//     kTypeBlobIndex (0x11) = blob index reference
+//   Future per-column types (kTypeMerge, kTypeDeletion, etc.) can be
+//   added without format changes.
+//
+// Section 4: NAME SIZES (N varints)
+//   +----------+----------+---...---+------------+
+//   | cns_0    | cns_1    |         | cns_{N-1}  |
+//   | varint32 | varint32 |         | varint32   |
+//   +----------+----------+---...---+------------+
+//
+// Section 5: VALUE SIZES (N varints)
+//   +----------+----------+---...---+------------+
+//   | cvs_0    | cvs_1    |         | cvs_{N-1}  |
+//   | varint32 | varint32 |         | varint32   |
+//   +----------+----------+---...---+------------+
+//
+// Section 6: COLUMN NAMES (concatenated, sorted)
+//   +------+------+---...---+--------+
+//   | cn_0 | cn_1 |         | cn_N-1 |
+//   | bytes| bytes|         | bytes  |
+//   +------+------+---...---+--------+
+//
+// Section 7: COLUMN VALUES (concatenated)
+//   +------+------+---...---+--------+
+//   | cv_0 | cv_1 |         | cv_N-1 |
+//   | bytes| bytes|         | bytes  |
+//   +------+------+---...---+--------+
+//
+// When ct = kTypeBlobIndex, the cv contains a serialized BlobIndex.
 
 class WideColumnSerialization {
  public:
+  // Version constants for wide column serialization format.
+  // - kVersion1: Original format with inline column values only.
+  // - kVersion2: Extended format that supports blob index references in
+  //              columns. Used when large column values are stored in blob
+  //              files.
+  static constexpr uint32_t kVersion1 = 1;
+  static constexpr uint32_t kVersion2 = 2;
+
+  // Serialize columns using version 1 format (no blob support)
   static Status Serialize(const WideColumns& columns, std::string& output);
 
+  // Serialize columns with some columns replaced by blob indices (version 2)
+  // columns: vector of (column_name, column_value) pairs
+  // blob_columns: vector of (column_index, blob_index) pairs indicating which
+  //               columns should be stored as blob references
+  static Status SerializeV2(
+      const std::vector<std::pair<std::string, std::string>>& columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      std::string& output);
+
+  // Overload that takes Slice-based WideColumns directly, avoiding the
+  // need to copy column names and values into string pairs.
+  static Status SerializeV2(
+      const WideColumns& columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      std::string& output);
+
+  // Deserialize columns (version 1 format only)
   static Status Deserialize(Slice& input, WideColumns& columns);
 
+  // Deserialize columns and separate inline columns from blob columns
+  // columns: receives inline column values
+  // blob_columns: receives (column_index, blob_index) pairs for blob references
+  static Status DeserializeV2(
+      Slice& input, std::vector<WideColumn>& columns,
+      std::vector<std::pair<size_t, BlobIndex>>& blob_columns);
+
+  // Check if the serialized entity has any blob column references.
+  // Sets *has_blob_columns to true if version >= 2 and at least one column
+  // has blob type; false otherwise.
+  // Returns Status::Corruption on decode errors.
+  static Status HasBlobColumns(const Slice& input, bool& has_blob_columns);
+
   static Status GetValueOfDefaultColumn(Slice& input, Slice& value);
 
-  static constexpr uint32_t kCurrentVersion = 1;
+  // Resolves all blob column references in a V2 wide-column entity,
+  // fetches the blob values, and re-serializes as a V1 entity (all inline).
+  // Handles inlined blobs (IsInlined) defensively.
+  //
+  // Used by the read path (GetContext, DBIter) when a V2 entity with blob
+  // column references needs to be converted to V1 format for consumption by
+  // APIs that only support V1 (e.g., TimedFullMerge,
+  // PinnableWideColumns::SetWideColumnValue).
+  //
+  // Sets *resolved to false and leaves resolved_entity unchanged when
+  // no blob columns are present.
+  //
+  // Optional parameters:
+  //   prefetch_buffers - for prefetch optimization (nullptr = no prefetch)
+  //   total_bytes_read - accumulates bytes read from blob files (nullptr =
+  //   skip) num_blobs_resolved - count of blob columns resolved (nullptr =
+  //   skip)
+  static Status ResolveEntityBlobColumns(
+      const Slice& entity_value, const Slice& user_key,
+      const BlobFetcher* blob_fetcher,
+      PrefetchBufferCollection* prefetch_buffers, std::string& resolved_entity,
+      bool& resolved, uint64_t* total_bytes_read, uint64_t* num_blobs_resolved);
+
+  // Extracts the default column value from a V2 entity, resolving its
+  // blob reference if needed. The default column (empty name) is always
+  // at index 0 when present (columns are sorted).
+  //
+  // Sets result to the resolved default column value (fetching from blob
+  // file if it's a blob reference). If there is no default column, result
+  // is set to empty. Sets *resolved to true if a blob was found for the
+  // default column, false otherwise.
+  static Status GetValueOfDefaultColumnResolvingBlobs(
+      const Slice& entity_value, const Slice& user_key,
+      const BlobFetcher* blob_fetcher, PinnableSlice& result, bool& resolved);
+
+ private:
+  friend class WideColumnSerializationTest;
+  // Get the serialization version from the input.
+  // Sets *version to the version number.
+  // Returns Status::Corruption on decode errors.
+  static Status GetVersion(const Slice& input, uint32_t& version);
+
+  // Merges deserialized columns with resolved blob values and serializes
+  // the result using version 1 format (all values inline).
+  static Status SerializeResolvedEntity(
+      const std::vector<WideColumn>& columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      const std::vector<std::string>& resolved_blob_values,
+      std::string& output);
+
+  // Returns InvalidArgument with the given message if size exceeds uint32_t.
+  static Status ValidateWideColumnLimit(size_t size, const char* msg) {
+    if (size > static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+      return Status::InvalidArgument(msg);
+    }
+    return Status::OK();
+  }
+
+  // Returns Corruption if prev_name >= name (columns must be strictly ordered).
+  static Status ValidateColumnOrder(const Slice& prev_name, const Slice& name) {
+    if (prev_name.compare(name) >= 0) {
+      return Status::Corruption("Wide columns out of order");
+    }
+    return Status::OK();
+  }
+
+  // Shared implementation for both SerializeV2 overloads.
+  // get_name(i): returns Slice for column i's name
+  // get_value(i): returns Slice for column i's inline value
+  template <typename GetName, typename GetValue>
+  static Status SerializeV2Impl(
+      size_t num_columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      std::string& output, GetName get_name, GetValue get_value);
+
+  // Validates num_columns limit and builds a per-column lookup map from
+  // blob_columns. Returns InvalidArgument on validation failure.
+  static Status BuildBlobIndexMap(
+      size_t num_columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      std::vector<const BlobIndex*>& blob_index_map);
+
+  // Parses V1 layout (interleaved name/value_size pairs followed by values)
+  // into columns. Used by both Deserialize and DeserializeV2 to avoid
+  // code duplication.
+  static Status DeserializeV1(Slice& input, uint32_t num_columns,
+                              std::vector<WideColumn>& columns);
+
+  // Parses V2 layout sections 2-7 (skip info through values) into columns and
+  // column types. Used by both Deserialize and DeserializeV2 to avoid
+  // code duplication.
+  static Status DeserializeV2Impl(Slice& input, uint32_t num_columns,
+                                  std::vector<WideColumn>& columns,
+                                  std::vector<ValueType>& column_types);
+
+  // Returns true if t is a supported per-column ValueType. Currently only
+  // kTypeValue (inline) and kTypeBlobIndex are supported. Notably,
+  // kTypeWideColumnEntity is rejected to prevent recursive nesting.
+  static bool IsValidColumnValueType(ValueType t) {
+    return t == kTypeValue || t == kTypeBlobIndex;
+  }
+
+  // Returns true if any of the first num_columns type bytes equals
+  // kTypeBlobIndex. Typical entities have <10 columns, so a linear
+  // scan is sufficient; SIMD could be considered if column counts grow.
+  static bool ContainsBlobType(const char* type_bytes, uint32_t num_columns);
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/wide/wide_column_serialization_test.cc b/db/wide/wide_column_serialization_test.cc
index 83a849da9eb3..018324d855e8 100644
--- a/db/wide/wide_column_serialization_test.cc
+++ b/db/wide/wide_column_serialization_test.cc
@@ -5,13 +5,35 @@
 
 #include "db/wide/wide_column_serialization.h"
 
+#include <chrono>
+#include <limits>
+
+#include "db/blob/blob_index.h"
 #include "db/wide/wide_columns_helper.h"
 #include "test_util/testharness.h"
 #include "util/coding.h"
+#include "util/random.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-TEST(WideColumnSerializationTest, Construct) {
+class WideColumnSerializationTest : public testing::Test {
+ protected:
+  // Wrappers for private methods accessible via friend declaration.
+  static Status GetVersion(const Slice& input, uint32_t& version) {
+    return WideColumnSerialization::GetVersion(input, version);
+  }
+
+  static Status SerializeResolvedEntity(
+      const std::vector<WideColumn>& columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      const std::vector<std::string>& resolved_blob_values,
+      std::string& output) {
+    return WideColumnSerialization::SerializeResolvedEntity(
+        columns, blob_columns, resolved_blob_values, output);
+  }
+};
+
+TEST_F(WideColumnSerializationTest, Construct) {
   constexpr char foo[] = "foo";
   constexpr char bar[] = "bar";
 
@@ -87,7 +109,7 @@ TEST(WideColumnSerializationTest, Construct) {
   }
 }
 
-TEST(WideColumnSerializationTest, SerializeDeserialize) {
+TEST_F(WideColumnSerializationTest, SerializeDeserialize) {
   WideColumns columns{{"foo", "bar"}, {"hello", "world"}};
   std::string output;
 
@@ -126,7 +148,7 @@ TEST(WideColumnSerializationTest, SerializeDeserialize) {
   }
 }
 
-TEST(WideColumnSerializationTest, SerializeDuplicateError) {
+TEST_F(WideColumnSerializationTest, SerializeDuplicateError) {
   WideColumns columns{{"foo", "bar"}, {"foo", "baz"}};
   std::string output;
 
@@ -134,7 +156,7 @@ TEST(WideColumnSerializationTest, SerializeDuplicateError) {
       WideColumnSerialization::Serialize(columns, output).IsCorruption());
 }
 
-TEST(WideColumnSerializationTest, SerializeOutOfOrderError) {
+TEST_F(WideColumnSerializationTest, SerializeOutOfOrderError) {
   WideColumns columns{{"hello", "world"}, {"foo", "bar"}};
   std::string output;
 
@@ -142,7 +164,7 @@ TEST(WideColumnSerializationTest, SerializeOutOfOrderError) {
       WideColumnSerialization::Serialize(columns, output).IsCorruption());
 }
 
-TEST(WideColumnSerializationTest, DeserializeVersionError) {
+TEST_F(WideColumnSerializationTest, DeserializeVersionError) {
   // Can't decode version
 
   std::string buf;
@@ -155,7 +177,7 @@ TEST(WideColumnSerializationTest, DeserializeVersionError) {
   ASSERT_TRUE(std::strstr(s.getState(), "version"));
 }
 
-TEST(WideColumnSerializationTest, DeserializeUnsupportedVersion) {
+TEST_F(WideColumnSerializationTest, DeserializeUnsupportedVersion) {
   // Unsupported version
   constexpr uint32_t future_version = 1000;
 
@@ -170,11 +192,11 @@ TEST(WideColumnSerializationTest, DeserializeUnsupportedVersion) {
   ASSERT_TRUE(std::strstr(s.getState(), "version"));
 }
 
-TEST(WideColumnSerializationTest, DeserializeNumberOfColumnsError) {
+TEST_F(WideColumnSerializationTest, DeserializeNumberOfColumnsError) {
   // Can't decode number of columns
 
   std::string buf;
-  PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+  PutVarint32(&buf, WideColumnSerialization::kVersion1);
 
   Slice input(buf);
   WideColumns columns;
@@ -184,10 +206,10 @@ TEST(WideColumnSerializationTest, DeserializeNumberOfColumnsError) {
   ASSERT_TRUE(std::strstr(s.getState(), "number"));
 }
 
-TEST(WideColumnSerializationTest, DeserializeColumnsError) {
+TEST_F(WideColumnSerializationTest, DeserializeV2Error) {
   std::string buf;
 
-  PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+  PutVarint32(&buf, WideColumnSerialization::kVersion1);
 
   constexpr uint32_t num_columns = 2;
   PutVarint32(&buf, num_columns);
@@ -277,10 +299,10 @@ TEST(WideColumnSerializationTest, DeserializeColumnsError) {
   }
 }
 
-TEST(WideColumnSerializationTest, DeserializeColumnsOutOfOrder) {
+TEST_F(WideColumnSerializationTest, DeserializeV2OutOfOrder) {
   std::string buf;
 
-  PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+  PutVarint32(&buf, WideColumnSerialization::kVersion1);
 
   constexpr uint32_t num_columns = 2;
   PutVarint32(&buf, num_columns);
@@ -302,6 +324,521 @@ TEST(WideColumnSerializationTest, DeserializeColumnsOutOfOrder) {
   ASSERT_TRUE(std::strstr(s.getState(), "order"));
 }
 
+TEST_F(WideColumnSerializationTest, DeserializeV2RejectsRecursiveType) {
+  // Manually construct a V2 entity where one column has type
+  // kTypeWideColumnEntity, which would create recursive nesting.
+  // Deserialization must reject this.
+  std::string buf;
+
+  PutVarint32(&buf, WideColumnSerialization::kVersion2);
+
+  constexpr uint32_t num_columns = 2;
+  PutVarint32(&buf, num_columns);
+
+  // Section 2: COLUMN TYPES -- first column inline, second recursive
+  buf.push_back(static_cast<char>(kTypeValue));
+  buf.push_back(static_cast<char>(kTypeWideColumnEntity));
+
+  // Section 3: SKIP INFO
+  PutVarint32(&buf, 2);  // name_sizes_bytes (varint(1) + varint(1))
+  PutVarint32(&buf, 2);  // value_sizes_bytes (varint(3) + varint(5))
+  PutVarint32(&buf, 2);  // names_bytes ("a" + "b")
+
+  // Section 4: NAME SIZES
+  PutVarint32(&buf, 1);  // "a"
+  PutVarint32(&buf, 1);  // "b"
+
+  // Section 5: VALUE SIZES
+  PutVarint32(&buf, 3);
+  PutVarint32(&buf, 5);
+
+  // Section 6: NAMES
+  buf.append("ab");
+
+  // Section 7: VALUES (8 bytes of placeholder data)
+  buf.append(8, 'x');
+
+  // DeserializeV2 should reject with Corruption
+  {
+    Slice input(buf);
+    std::vector<WideColumn> columns;
+    std::vector<std::pair<size_t, BlobIndex>> blob_columns;
+    const Status s =
+        WideColumnSerialization::DeserializeV2(input, columns, blob_columns);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "Unsupported wide column ValueType"));
+  }
+
+  // Deserialize (V1-only API) should also reject
+  {
+    Slice input(buf);
+    WideColumns columns;
+    const Status s = WideColumnSerialization::Deserialize(input, columns);
+    ASSERT_TRUE(s.IsCorruption());
+  }
+}
+
+// Helper: create a BlobIndex from EncodeBlob parameters.
+static BlobIndex MakeBlobIndex(uint64_t file_number, uint64_t offset,
+                               uint64_t size,
+                               CompressionType compression = kNoCompression) {
+  std::string encoded;
+  BlobIndex::EncodeBlob(&encoded, file_number, offset, size, compression);
+  BlobIndex bi;
+  Slice s(encoded);
+  assert(bi.DecodeFrom(s).ok());
+  return bi;
+}
+
+// Helper: V2 serialize → DeserializeV2 round-trip, returning
+// deserialized columns and blob column info.
+static void V2SerializeAndDeserialize(
+    const std::vector<std::pair<std::string, std::string>>& columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns_in,
+    std::vector<WideColumn>* deserialized,
+    std::vector<std::pair<size_t, BlobIndex>>* blob_columns_out,
+    std::string* serialized_out) {
+  ASSERT_OK(WideColumnSerialization::SerializeV2(columns, blob_columns_in,
+                                                 *serialized_out));
+
+  Slice input(*serialized_out);
+  ASSERT_OK(WideColumnSerialization::DeserializeV2(input, *deserialized,
+                                                   *blob_columns_out));
+  ASSERT_EQ(deserialized->size(), columns.size());
+  for (size_t i = 0; i < columns.size(); ++i) {
+    ASSERT_EQ((*deserialized)[i].name(), columns[i].first);
+  }
+}
+
+// Helper: build WideColumns from string pairs.
+static WideColumns ToWideColumns(
+    const std::vector<std::pair<std::string, std::string>>& columns) {
+  WideColumns wc;
+  wc.reserve(columns.size());
+  for (const auto& col : columns) {
+    wc.emplace_back(Slice(col.first), Slice(col.second));
+  }
+  return wc;
+}
+
+// Helper: deserialize and verify column names match expected.first
+// and column values match expected_values[i].
+static void VerifyDeserialize(
+    const std::string& serialized,
+    const std::vector<std::pair<std::string, std::string>>& expected,
+    const std::vector<std::string>& expected_values) {
+  Slice input(serialized);
+  WideColumns deserialized;
+  ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized));
+  ASSERT_EQ(deserialized.size(), expected.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(deserialized[i].name(), expected[i].first);
+    ASSERT_EQ(deserialized[i].value(), expected_values[i]);
+  }
+}
+
+// Convenience overload: values come from expected[i].second.
+static void VerifyDeserialize(
+    const std::string& serialized,
+    const std::vector<std::pair<std::string, std::string>>& expected) {
+  std::vector<std::string> values;
+  values.reserve(expected.size());
+  for (const auto& col : expected) {
+    values.push_back(col.second);
+  }
+  VerifyDeserialize(serialized, expected, values);
+}
+
+// Helper: create a random non-inlined BlobIndex using the given RNG.
+// Only creates Blob or BlobTTL types (not InlinedTTL), because InlinedTTL
+// stores a Slice pointing into the encoded string, which would become a
+// dangling reference after this function returns.
+static BlobIndex MakeRandomBlobIndex(Random& rng) {
+  std::string bi_str;
+  if (rng.Uniform(2) == 0) {
+    BlobIndex::EncodeBlob(&bi_str, rng.Uniform(1000), rng.Uniform(10000),
+                          rng.Uniform(5000), kNoCompression);
+  } else {
+    BlobIndex::EncodeBlobTTL(&bi_str, rng.Uniform(1000000), rng.Uniform(1000),
+                             rng.Uniform(10000), rng.Uniform(5000),
+                             kSnappyCompression);
+  }
+  BlobIndex bi;
+  Slice s(bi_str);
+  assert(bi.DecodeFrom(s).ok());
+  return bi;
+}
+
+// Helper: V2 serialize with no blobs then GetValueOfDefaultColumn.
+static void VerifyGetDefaultColumn(
+    const std::vector<std::pair<std::string, std::string>>& columns,
+    const Slice& expected_value) {
+  std::vector<std::pair<size_t, BlobIndex>> no_blobs;
+  std::string serialized;
+  ASSERT_OK(
+      WideColumnSerialization::SerializeV2(columns, no_blobs, serialized));
+
+  Slice input(serialized);
+  Slice value;
+  ASSERT_OK(WideColumnSerialization::GetValueOfDefaultColumn(input, value));
+  ASSERT_EQ(value, expected_value);
+}
+
+TEST_F(WideColumnSerializationTest, SerializeResolvedEntity) {
+  // Test resolve with mixed, all-blob, and no-blob configurations
+  struct TestCase {
+    std::vector<std::pair<std::string, std::string>> columns;
+    std::vector<std::pair<size_t, BlobIndex>> blob_cols;
+    std::vector<std::string> resolved_values;
+    std::vector<std::string> expected_values;
+  };
+
+  std::vector<TestCase> cases = {
+      // Mixed inline and blob
+      {.columns = {{"a", "inline_a"}, {"b", "ph"}, {"c", "inline_c"}},
+       .blob_cols = {{1, MakeBlobIndex(50, 500, 100)}},
+       .resolved_values = {"resolved_b"},
+       .expected_values = {"inline_a", "resolved_b", "inline_c"}},
+      // All blob columns
+      {.columns = {{"x", "ph1"}, {"y", "ph2"}, {"z", "ph3"}},
+       .blob_cols = {{0, MakeBlobIndex(10, 100, 50)},
+                     {1, MakeBlobIndex(20, 200, 60)},
+                     {2, MakeBlobIndex(30, 300, 70)}},
+       .resolved_values = {"val_x", "val_y", "val_z"},
+       .expected_values = {"val_x", "val_y", "val_z"}},
+      // No blob columns
+      {.columns = {{"alpha", "val_alpha"}, {"beta", "val_beta"}},
+       .blob_cols = {},
+       .resolved_values = {},
+       .expected_values = {"val_alpha", "val_beta"}},
+  };
+
+  for (const auto& tc : cases) {
+    std::string serialized;
+    std::vector<WideColumn> deserialized;
+    std::vector<std::pair<size_t, BlobIndex>> blob_out;
+    V2SerializeAndDeserialize(tc.columns, tc.blob_cols, &deserialized,
+                              &blob_out, &serialized);
+
+    std::string resolved_output;
+    ASSERT_OK(WideColumnSerializationTest::SerializeResolvedEntity(
+        deserialized, blob_out, tc.resolved_values, resolved_output));
+
+    uint32_t v = 0;
+    ASSERT_OK(GetVersion(Slice(resolved_output), v));
+    ASSERT_EQ(v, WideColumnSerialization::kVersion1);
+
+    VerifyDeserialize(resolved_output, tc.columns, tc.expected_values);
+  }
+}
+
+TEST_F(WideColumnSerializationTest, V2GetValueOfDefaultColumn) {
+  // V2 with default column present
+  VerifyGetDefaultColumn({{"", "default_value"}, {"col1", "value1"}},
+                         "default_value");
+  // V2 without default column
+  VerifyGetDefaultColumn({{"col1", "value1"}, {"col2", "value2"}}, Slice());
+  // V2 with zero columns
+  VerifyGetDefaultColumn({}, Slice());
+
+  // V1 fallback
+  {
+    WideColumns columns{{"", "v1_default"}, {"col1", "v1"}};
+    std::string serialized;
+    ASSERT_OK(WideColumnSerialization::Serialize(columns, serialized));
+
+    Slice input(serialized);
+    Slice value;
+    ASSERT_OK(WideColumnSerialization::GetValueOfDefaultColumn(input, value));
+    ASSERT_EQ(value, "v1_default");
+  }
+}
+
+TEST_F(WideColumnSerializationTest, V2BlobColumnRejectsDeserialize) {
+  std::vector<std::pair<std::string, std::string>> columns = {
+      {"a", "inline"}, {"b", "placeholder"}};
+  std::vector<std::pair<size_t, BlobIndex>> blob_columns = {
+      {1, MakeBlobIndex(1, 2, 3)}};
+
+  std::string serialized;
+  ASSERT_OK(
+      WideColumnSerialization::SerializeV2(columns, blob_columns, serialized));
+
+  Slice input(serialized);
+  WideColumns deserialized;
+  ASSERT_TRUE(WideColumnSerialization::Deserialize(input, deserialized)
+                  .IsNotSupported());
+}
+
+TEST_F(WideColumnSerializationTest, V2GetValueOfDefaultColumnBlobRef) {
+  // When default column (index 0) is a blob reference,
+  // GetValueOfDefaultColumn should return NotSupported.
+  std::vector<std::pair<std::string, std::string>> columns = {
+      {"", "placeholder"}, {"col1", "value1"}};
+  std::vector<std::pair<size_t, BlobIndex>> blob_columns = {
+      {0, MakeBlobIndex(10, 100, 500)}};
+
+  std::string serialized;
+  ASSERT_OK(
+      WideColumnSerialization::SerializeV2(columns, blob_columns, serialized));
+
+  Slice input(serialized);
+  Slice value;
+  ASSERT_TRUE(WideColumnSerialization::GetValueOfDefaultColumn(input, value)
+                  .IsNotSupported());
+}
+
+TEST_F(WideColumnSerializationTest, SerializeV2Errors) {
+  // Blob column index out of range
+  {
+    std::vector<std::pair<std::string, std::string>> columns = {{"a", "val"}};
+    std::vector<std::pair<size_t, BlobIndex>> blob_columns = {
+        {5, MakeBlobIndex(1, 2, 3)}};  // index 5 but only 1 column
+
+    std::string output;
+    ASSERT_TRUE(
+        WideColumnSerialization::SerializeV2(columns, blob_columns, output)
+            .IsInvalidArgument());
+  }
+
+  // Columns out of order (V2)
+  {
+    std::vector<std::pair<std::string, std::string>> columns = {{"b", "val_b"},
+                                                                {"a", "val_a"}};
+    std::vector<std::pair<size_t, BlobIndex>> no_blobs;
+
+    std::string output;
+    ASSERT_TRUE(WideColumnSerialization::SerializeV2(columns, no_blobs, output)
+                    .IsCorruption());
+  }
+
+  // Duplicate column names (V2)
+  {
+    std::vector<std::pair<std::string, std::string>> columns = {{"a", "val1"},
+                                                                {"a", "val2"}};
+    std::vector<std::pair<size_t, BlobIndex>> no_blobs;
+
+    std::string output;
+    ASSERT_TRUE(WideColumnSerialization::SerializeV2(columns, no_blobs, output)
+                    .IsCorruption());
+  }
+}
+
+TEST_F(WideColumnSerializationTest, BlobIndexEncodeToRoundTrip) {
+  // Test EncodeTo produces identical output to static Encode methods
+  // for all three blob index types.
+  auto verify_encode_to = [](const std::string& encoded_static) {
+    BlobIndex bi;
+    Slice s(encoded_static);
+    ASSERT_OK(bi.DecodeFrom(s));
+    std::string encoded_instance;
+    bi.EncodeTo(&encoded_instance);
+    ASSERT_EQ(encoded_static, encoded_instance);
+  };
+
+  std::string blob_str;
+  std::string blob_ttl_str;
+  std::string inlined_str;
+  BlobIndex::EncodeBlob(&blob_str, 42, 1024, 2048, kSnappyCompression);
+  BlobIndex::EncodeBlobTTL(&blob_ttl_str, 9999, 10, 200, 3000,
+                           kZlibCompression);
+  BlobIndex::EncodeInlinedTTL(&inlined_str, 12345, "inline_data");
+
+  verify_encode_to(blob_str);
+  verify_encode_to(blob_ttl_str);
+  verify_encode_to(inlined_str);
+}
+
+TEST_F(WideColumnSerializationTest, V2LayoutStructureVerification) {
+  // Verify the V2 binary layout structure by manually parsing sections
+  std::vector<std::pair<std::string, std::string>> columns = {
+      {"aa", "val_aa"}, {"bbb", "val_bbb"}};
+  std::vector<std::pair<size_t, BlobIndex>> empty_blob_columns;
+
+  std::string serialized;
+  ASSERT_OK(WideColumnSerialization::SerializeV2(columns, empty_blob_columns,
+                                                 serialized));
+
+  Slice data(serialized);
+
+  // Section 1: HEADER
+  uint32_t version = 0;
+  ASSERT_TRUE(GetVarint32(&data, &version));
+  ASSERT_EQ(version, WideColumnSerialization::kVersion2);
+
+  uint32_t num_columns = 0;
+  ASSERT_TRUE(GetVarint32(&data, &num_columns));
+  ASSERT_EQ(num_columns, 2u);
+
+  // Section 2: SKIP INFO (3 varints)
+  uint32_t name_sizes_bytes = 0;
+  uint32_t value_sizes_bytes = 0;
+  uint32_t names_bytes = 0;
+  ASSERT_TRUE(GetVarint32(&data, &name_sizes_bytes));
+  ASSERT_TRUE(GetVarint32(&data, &value_sizes_bytes));
+  ASSERT_TRUE(GetVarint32(&data, &names_bytes));
+  // name sizes: varint(2) + varint(3) = 1 + 1 = 2 bytes
+  ASSERT_EQ(name_sizes_bytes, 2u);
+  // value sizes: varint(6) + varint(7) = 1 + 1 = 2 bytes
+  ASSERT_EQ(value_sizes_bytes, 2u);
+  // names: "aa" + "bbb" = 2 + 3 = 5 bytes
+  ASSERT_EQ(names_bytes, 5u);
+
+  // Section 3: COLUMN TYPES (2 bytes, both inline)
+  ASSERT_GE(data.size(), 2u);
+  ASSERT_EQ(static_cast<uint8_t>(data[0]), static_cast<uint8_t>(kTypeValue));
+  ASSERT_EQ(static_cast<uint8_t>(data[1]), static_cast<uint8_t>(kTypeValue));
+  data.remove_prefix(2);
+
+  // Section 4: NAME SIZES
+  uint32_t ns0 = 0;
+  uint32_t ns1 = 0;
+  ASSERT_TRUE(GetVarint32(&data, &ns0));
+  ASSERT_TRUE(GetVarint32(&data, &ns1));
+  ASSERT_EQ(ns0, 2u);
+  ASSERT_EQ(ns1, 3u);
+
+  // Section 5: VALUE SIZES
+  uint32_t vs0 = 0;
+  uint32_t vs1 = 0;
+  ASSERT_TRUE(GetVarint32(&data, &vs0));
+  ASSERT_TRUE(GetVarint32(&data, &vs1));
+  ASSERT_EQ(vs0, 6u);  // "val_aa" = 6
+  ASSERT_EQ(vs1, 7u);  // "val_bbb" = 7
+
+  // Section 6: COLUMN NAMES
+  ASSERT_GE(data.size(), 5u);
+  ASSERT_EQ(Slice(data.data(), 2), "aa");
+  ASSERT_EQ(Slice(data.data() + 2, 3), "bbb");
+  data.remove_prefix(5);
+
+  // Section 7: COLUMN VALUES
+  ASSERT_GE(data.size(), 13u);
+  ASSERT_EQ(Slice(data.data(), 6), "val_aa");
+  ASSERT_EQ(Slice(data.data() + 6, 7), "val_bbb");
+}
+
+// Randomized correctness test: serialize and deserialize with random column
+// counts, name sizes, value sizes, and randomly chosen blob columns.
+// Validates the full round-trip for both V1 (Serialize) and V2
+// (SerializeV2) formats.
+TEST_F(WideColumnSerializationTest, RandomizedSerializeDeserializeRoundTrip) {
+  uint32_t seed = static_cast<uint32_t>(
+      std::chrono::system_clock::now().time_since_epoch().count());
+  Random rng(seed);
+  SCOPED_TRACE("seed=" + std::to_string(seed));
+
+  constexpr int kNumIterations = 100;
+
+  for (int iter = 0; iter < kNumIterations; ++iter) {
+    int num_cols = rng.Uniform(17);     // 0..16
+    int name_sz = 1 + rng.Uniform(64);  // 1..64
+    int val_sz = rng.Uniform(1025);     // 0..1024
+
+    // Generate sorted column names and random values
+    std::vector<std::pair<std::string, std::string>> columns;
+    columns.reserve(num_cols);
+    for (int c = 0; c < num_cols; ++c) {
+      // Build a sorted, unique name of exactly name_sz bytes.
+      // Use hex-encoded index as prefix to guarantee sort order,
+      // then pad with random characters.
+      char idx_str[16];
+      snprintf(idx_str, sizeof(idx_str), "%04x", c);
+      std::string name(idx_str);
+      if (static_cast<int>(name.size()) < name_sz) {
+        name.append(name_sz - name.size(),
+                    static_cast<char>('a' + rng.Uniform(26)));
+      }
+      // Ensure exactly name_sz bytes. For name_sz < 4, use just the
+      // low-order hex digits to maintain sort order.
+      if (static_cast<int>(name.size()) > name_sz) {
+        name = name.substr(name.size() - name_sz);
+      }
+
+      // Random value content
+      std::string value(val_sz, '\0');
+      for (int j = 0; j < val_sz; ++j) {
+        value[j] = static_cast<char>(rng.Uniform(256));
+      }
+      columns.emplace_back(std::move(name), std::move(value));
+    }
+
+    // Randomly select some columns as blob columns
+    std::vector<std::pair<size_t, BlobIndex>> blob_columns;
+    for (int c = 0; c < num_cols; ++c) {
+      if (rng.Uniform(3) == 0) {  // ~33% chance of being a blob column
+        blob_columns.emplace_back(c, MakeRandomBlobIndex(rng));
+      }
+    }
+
+    // V2 serialize → DeserializeV2 round-trip
+    std::string serialized;
+    std::vector<WideColumn> deserialized;
+    std::vector<std::pair<size_t, BlobIndex>> blob_out;
+    V2SerializeAndDeserialize(columns, blob_columns, &deserialized, &blob_out,
+                              &serialized);
+
+    // Verify version and HasBlobColumns
+    uint32_t v = 0;
+    ASSERT_OK(GetVersion(Slice(serialized), v));
+    ASSERT_EQ(v, WideColumnSerialization::kVersion2);
+
+    bool hb = false;
+    ASSERT_OK(WideColumnSerialization::HasBlobColumns(Slice(serialized), hb));
+    ASSERT_EQ(hb, !blob_columns.empty());
+
+    // Verify blob column round-trip
+    ASSERT_EQ(blob_out.size(), blob_columns.size());
+    for (size_t b = 0; b < blob_columns.size(); ++b) {
+      ASSERT_EQ(blob_out[b].first, blob_columns[b].first);
+      const BlobIndex& orig = blob_columns[b].second;
+      const BlobIndex& decoded = blob_out[b].second;
+      ASSERT_EQ(decoded.IsInlined(), orig.IsInlined());
+      ASSERT_EQ(decoded.HasTTL(), orig.HasTTL());
+      if (!decoded.IsInlined()) {
+        ASSERT_EQ(decoded.file_number(), orig.file_number());
+        ASSERT_EQ(decoded.offset(), orig.offset());
+        ASSERT_EQ(decoded.size(), orig.size());
+      }
+    }
+
+    // Verify inline column values
+    size_t blob_idx = 0;
+    for (int c = 0; c < num_cols; ++c) {
+      if (blob_idx < blob_columns.size() &&
+          blob_columns[blob_idx].first == static_cast<size_t>(c)) {
+        ++blob_idx;
+      } else {
+        ASSERT_EQ(deserialized[c].value(), columns[c].second);
+      }
+    }
+
+    // If no blob columns, also verify Deserialize() and both overloads
+    if (blob_columns.empty()) {
+      VerifyDeserialize(serialized, columns);
+
+      // WideColumns overload should produce identical output
+      std::string serialized2;
+      WideColumns wc = ToWideColumns(columns);
+      ASSERT_OK(
+          WideColumnSerialization::SerializeV2(wc, blob_columns, serialized2));
+      ASSERT_EQ(serialized, serialized2);
+    }
+
+    // V1 Serialize round-trip
+    {
+      WideColumns wc = ToWideColumns(columns);
+      std::string serialized_v1;
+      ASSERT_OK(WideColumnSerialization::Serialize(wc, serialized_v1));
+
+      ASSERT_OK(GetVersion(Slice(serialized_v1), v));
+      ASSERT_EQ(v, WideColumnSerialization::kVersion1);
+
+      VerifyDeserialize(serialized_v1, columns);
+    }
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/write_batch.cc b/db/write_batch.cc
index 15034e5c3fcc..c2f7a7eddf51 100644
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@@ -551,9 +551,6 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb,
 
     if (LIKELY(!s.IsTryAgain())) {
       last_was_try_again = false;
-      tag = 0;
-      column_family = 0;  // default
-
       s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
                                    &blob, &xid, &write_unix_time);
       if (!s.ok()) {
@@ -815,6 +812,12 @@ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(
         s = Status::InvalidArgument("Default cf timestamp size mismatch");
       }
     }
+    auto* cfd =
+        static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+    if (cfd && cfd->ioptions().disallow_memtable_writes) {
+      s = Status::InvalidArgument(
+          "This column family has disallow_memtable_writes=true");
+    }
   } else if (b->default_cf_ts_sz_ > 0) {
     ts_sz = b->default_cf_ts_sz_;
   }
@@ -836,6 +839,12 @@ Status CheckColumnFamilyTimestampSize(ColumnFamilyHandle* column_family,
   if (cf_ts_sz != ts.size()) {
     return Status::InvalidArgument("timestamp size mismatch");
   }
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  if (cfd && cfd->ioptions().disallow_memtable_writes) {
+    return Status::InvalidArgument(
+        "This column family has disallow_memtable_writes=true");
+  }
   return Status::OK();
 }
 }  // anonymous namespace
@@ -1885,7 +1894,6 @@ Status WriteBatch::VerifyChecksum() const {
     // ReadRecordFromWriteBatch
     key.clear();
     value.clear();
-    column_family = 0;
     s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
                                  &blob, &xid, /*write_unix_time=*/nullptr);
     if (!s.ok()) {
@@ -2185,6 +2193,13 @@ class MemTableInserter : public WriteBatch::Handler {
       }
       return false;
     }
+    auto* current = cf_mems_->current();
+    if (current && current->ioptions().disallow_memtable_writes) {
+      *s = Status::InvalidArgument(
+          "This column family has disallow_memtable_writes=true");
+      return false;
+    }
+
     if (recovering_log_number_ != 0 &&
         recovering_log_number_ < cf_mems_->GetLogNumber()) {
       // This is true only in recovery environment (recovering_log_number_ is
@@ -3195,11 +3210,11 @@ Status WriteBatchInternal::InsertInto(
     ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
     TrimHistoryScheduler* trim_history_scheduler,
     bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db,
-    bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) {
+    bool seq_per_batch, bool batch_per_txn) {
   MemTableInserter inserter(
       sequence, memtables, flush_scheduler, trim_history_scheduler,
       ignore_missing_column_families, recovery_log_number, db,
-      concurrent_memtable_writes, nullptr /* prot_info */,
+      /*concurrent_memtable_writes=*/false, nullptr /* prot_info */,
       nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn);
   for (auto w : write_group) {
     if (w->CallbackFailed()) {
diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h
index 3cf3f4689a8c..f7b36a4133cf 100644
--- a/db/write_batch_internal.h
+++ b/db/write_batch_internal.h
@@ -185,18 +185,19 @@ class WriteBatchInternal {
   // If flush_scheduler is non-null, it will be invoked if the memtable
   // should be flushed.
   //
-  // Under concurrent use, the caller is responsible for making sure that
-  // the memtables object itself is thread-local.
+  // This overload is for non-concurrent insertion only.
   static Status InsertInto(
       WriteThread::WriteGroup& write_group, SequenceNumber sequence,
       ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
       TrimHistoryScheduler* trim_history_scheduler,
       bool ignore_missing_column_families = false, uint64_t log_number = 0,
-      DB* db = nullptr, bool concurrent_memtable_writes = false,
-      bool seq_per_batch = false, bool batch_per_txn = true);
+      DB* db = nullptr, bool seq_per_batch = false, bool batch_per_txn = true);
 
   // Convenience form of InsertInto when you have only one batch
   // next_seq returns the seq after last sequence number used in MemTable insert
+  //
+  // Under concurrent use, the caller is responsible for making sure that
+  // the memtables object itself is thread-local.
   static Status InsertInto(
       const WriteBatch* batch, ColumnFamilyMemTables* memtables,
       FlushScheduler* flush_scheduler,
diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc
index 53094eca4b9b..4fd1d8bcdc65 100644
--- a/db/write_callback_test.cc
+++ b/db/write_callback_test.cc
@@ -419,7 +419,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) {
   WriteOptions write_options;
   ReadOptions read_options;
   string value;
-  DB* db;
+  std::unique_ptr<DB> db;
   DBImpl* db_impl;
 
   ASSERT_OK(DestroyDB(dbname, options));
@@ -428,7 +428,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) {
   Status s = DB::Open(options, dbname, &db);
   ASSERT_OK(s);
 
-  db_impl = dynamic_cast<DBImpl*>(db);
+  db_impl = dynamic_cast<DBImpl*>(db.get());
   ASSERT_TRUE(db_impl);
 
   WriteBatch wb;
@@ -481,7 +481,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) {
   ASSERT_TRUE(user_write_cb.write_enqueued_.load());
   ASSERT_TRUE(user_write_cb.wal_write_done_.load());
 
-  delete db;
+  db.reset();
   ASSERT_OK(DestroyDB(dbname, options));
 }
 
diff --git a/db/write_thread.h b/db/write_thread.h
index 42256970f413..6c2dc5dcd02a 100644
--- a/db/write_thread.h
+++ b/db/write_thread.h
@@ -132,7 +132,7 @@ class WriteThread {
     size_t protection_bytes_per_key;
     PreReleaseCallback* pre_release_callback;
     PostMemTableCallback* post_memtable_callback;
-    uint64_t log_used;  // log number that this batch was inserted into
+    uint64_t wal_used;  // log number that this batch was inserted into
     uint64_t log_ref;   // log number that memtable insert should reference
     WriteCallback* callback;
     UserWriteCallback* user_write_cb;
@@ -161,7 +161,7 @@ class WriteThread {
           protection_bytes_per_key(0),
           pre_release_callback(nullptr),
           post_memtable_callback(nullptr),
-          log_used(0),
+          wal_used(0),
           log_ref(0),
           callback(nullptr),
           user_write_cb(nullptr),
@@ -179,7 +179,7 @@ class WriteThread {
            PostMemTableCallback* _post_memtable_callback = nullptr,
            bool _ingest_wbwi = false)
         : batch(_batch),
-          // TODO: store a copy of WriteOptions instead of its seperated data
+          // TODO: store a copy of WriteOptions instead of its separated data
           // members
           sync(write_options.sync),
           no_slowdown(write_options.no_slowdown),
@@ -190,7 +190,7 @@ class WriteThread {
           protection_bytes_per_key(_batch->GetProtectionBytesPerKey()),
           pre_release_callback(_pre_release_callback),
           post_memtable_callback(_post_memtable_callback),
-          log_used(0),
+          wal_used(0),
           log_ref(_log_ref),
           callback(_callback),
           user_write_cb(_user_write_cb),
diff --git a/db_stress_tool/CMakeLists.txt b/db_stress_tool/CMakeLists.txt
index be34778ddd44..90200f342bf4 100644
--- a/db_stress_tool/CMakeLists.txt
+++ b/db_stress_tool/CMakeLists.txt
@@ -2,13 +2,14 @@ add_executable(db_stress${ARTIFACT_SUFFIX}
   batched_ops_stress.cc
   cf_consistency_stress.cc
   db_stress.cc
+  db_stress_compaction_service.cc
+  db_stress_compression_manager.cc
   db_stress_common.cc
   db_stress_driver.cc
   db_stress_filters.cc
   db_stress_gflags.cc
   db_stress_listener.cc
   db_stress_shared_state.cc
-  db_stress_stat.cc
   db_stress_test_base.cc
   db_stress_wide_merge_operator.cc
   db_stress_tool.cc
diff --git a/db_stress_tool/cf_consistency_stress.cc b/db_stress_tool/cf_consistency_stress.cc
index 1df4fc7cb7fc..d18c47281a69 100644
--- a/db_stress_tool/cf_consistency_stress.cc
+++ b/db_stress_tool/cf_consistency_stress.cc
@@ -1047,7 +1047,7 @@ class CfConsistencyStressTest : public StressTest {
     assert(thread);
     Status status;
 
-    DB* db_ptr = secondary_db_ ? secondary_db_ : db_;
+    DB* db_ptr = secondary_db_ ? secondary_db_.get() : db_;
     const auto& cfhs = secondary_db_ ? secondary_cfhs_ : column_families_;
 
     // Take a snapshot to preserve the state of primary db.
diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc
index 968a6c16c0f8..c26401352234 100644
--- a/db_stress_tool/db_stress_common.cc
+++ b/db_stress_tool/db_stress_common.cc
@@ -13,6 +13,7 @@
 
 #include <cmath>
 
+#include "file/file_util.h"
 #include "rocksdb/secondary_cache.h"
 #include "util/file_checksum_helper.h"
 #include "util/xxhash.h"
@@ -228,6 +229,280 @@ void CompressedCacheSetCapacityThread(void* v) {
   }
 }
 
+#ifndef NDEBUG
+static void SetupFaultInjectionForRemoteCompaction(SharedState* shared) {
+  if (!fault_fs_guard) {
+    return;
+  }
+
+  fault_fs_guard->SetThreadLocalErrorContext(
+      FaultInjectionIOType::kRead, shared->GetSeed(), FLAGS_read_fault_one_in,
+      FLAGS_inject_error_severity == 1 /* retryable */,
+      FLAGS_inject_error_severity == 2 /* has_data_loss*/);
+  fault_fs_guard->EnableThreadLocalErrorInjection(FaultInjectionIOType::kRead);
+
+  fault_fs_guard->SetThreadLocalErrorContext(
+      FaultInjectionIOType::kWrite, shared->GetSeed(), FLAGS_write_fault_one_in,
+      FLAGS_inject_error_severity == 1 /* retryable */,
+      FLAGS_inject_error_severity == 2 /* has_data_loss*/);
+  fault_fs_guard->EnableThreadLocalErrorInjection(FaultInjectionIOType::kWrite);
+
+  fault_fs_guard->SetThreadLocalErrorContext(
+      FaultInjectionIOType::kMetadataRead, shared->GetSeed(),
+      FLAGS_metadata_read_fault_one_in,
+      FLAGS_inject_error_severity == 1 /* retryable */,
+      FLAGS_inject_error_severity == 2 /* has_data_loss*/);
+  fault_fs_guard->EnableThreadLocalErrorInjection(
+      FaultInjectionIOType::kMetadataRead);
+
+  fault_fs_guard->SetThreadLocalErrorContext(
+      FaultInjectionIOType::kMetadataWrite, shared->GetSeed(),
+      FLAGS_metadata_write_fault_one_in,
+      FLAGS_inject_error_severity == 1 /* retryable */,
+      FLAGS_inject_error_severity == 2 /* has_data_loss*/);
+  fault_fs_guard->EnableThreadLocalErrorInjection(
+      FaultInjectionIOType::kMetadataWrite);
+}
+#endif  // NDEBUG
+
+static CompactionServiceOptionsOverride CreateOverrideOptions(
+    const Options& options, const CompactionServiceJobInfo& job_info) {
+  CompactionServiceOptionsOverride override_options{
+      .env = db_stress_env,
+      .file_checksum_gen_factory = options.file_checksum_gen_factory,
+      .merge_operator = options.merge_operator,
+      .compaction_filter = options.compaction_filter,
+      .compaction_filter_factory = options.compaction_filter_factory,
+      .prefix_extractor = options.prefix_extractor,
+      .sst_partitioner_factory = options.sst_partitioner_factory,
+      .listeners = options.listeners,
+      .statistics = options.statistics,
+      .table_properties_collector_factories =
+          options.table_properties_collector_factories};
+
+  // TODO(jaykorean) - create a new compaction filter / merge operator and
+  // others for remote compactions
+  //
+  // Create a new Table Factory
+  ConfigOptions config_options;
+  config_options.ignore_unknown_options = false;
+  config_options.ignore_unsupported_options = false;
+
+  Status s = TableFactory::CreateFromString(config_options,
+                                            options.table_factory->Name(),
+                                            &override_options.table_factory);
+
+  if (s.ok()) {
+    std::string options_str;
+    s = options.table_factory->GetOptionString(config_options, &options_str);
+    if (s.ok()) {
+      s = override_options.table_factory->ConfigureFromString(config_options,
+                                                              options_str);
+    }
+  }
+
+  if (!s.ok()) {
+    fprintf(stdout,
+            "Failed to set up TableFactory for remote compaction - (%s): %s\n",
+            job_info.db_name.c_str(), s.ToString().c_str());
+  }
+
+  return override_options;
+}
+
+static Status CleanupOutputDirectory(const std::string& output_directory) {
+#ifndef NDEBUG
+  // Temporarily disable fault injection to ensure deletion always succeeds
+  if (fault_fs_guard) {
+    fault_fs_guard->DisableAllThreadLocalErrorInjection();
+  }
+#endif  // NDEBUG
+
+  Status s = DestroyDir(db_stress_env, output_directory);
+  if (!s.ok()) {
+    fprintf(stderr,
+            "Failed to destroy output directory %s when allow_resumption is "
+            "false: %s\n",
+            output_directory.c_str(), s.ToString().c_str());
+  }
+
+  if (s.ok()) {
+    s = db_stress_env->CreateDir(output_directory);
+    if (!s.ok()) {
+      fprintf(stderr,
+              "Failed to recreate output directory %s when allow_resumption is "
+              "false: %s\n",
+              output_directory.c_str(), s.ToString().c_str());
+    }
+  }
+
+#ifndef NDEBUG
+  // Re-enable fault injection after deletion
+  if (fault_fs_guard) {
+    fault_fs_guard->EnableAllThreadLocalErrorInjection();
+  }
+#endif  // NDEBUG
+
+  return s;
+}
+
+// Set up cancellation mechanism for testing resumable remote compactions.
+// Spawns a detached thread to trigger cancellation after a delay (50ms
+// initially, or 2/3 of the previous successful compaction time for adaptive
+// timing). First-time jobs are always canceled; retries have a 10% chance
+// to test consecutive cancellation scenarios.
+static std::shared_ptr<std::atomic<bool>> SetupCancellation(
+    OpenAndCompactOptions& open_compact_options, bool was_canceled,
+    Random& rand, uint64_t successful_compaction_end_to_end_micros) {
+  auto canceled = std::make_shared<std::atomic<bool>>(false);
+  open_compact_options.canceled = canceled.get();
+
+  bool should_cancel = !was_canceled || rand.OneIn(10);
+
+  if (should_cancel) {
+    std::thread interruption_thread(
+        [canceled, successful_compaction_end_to_end_micros]() {
+          uint64_t sleep_micros =
+              successful_compaction_end_to_end_micros == 0
+                  ? 50000
+                  : successful_compaction_end_to_end_micros * 2 / 3;
+          std::this_thread::sleep_for(std::chrono::microseconds(sleep_micros));
+          canceled->store(true);
+        });
+    interruption_thread.detach();
+  }
+
+  return canceled;
+}
+
+// Process the result of OpenAndCompact operation
+static void ProcessCompactionResult(
+    const Status& s, const std::string& job_id,
+    const CompactionServiceJobInfo& job_info,
+    const std::string& serialized_input, const std::string& output_directory,
+    const std::string& serialized_output, SharedState* shared,
+    uint64_t& successful_compaction_end_to_end_micros, uint64_t start_micros,
+    Env* env) {
+  if (s.IsManualCompactionPaused() && FLAGS_allow_resumption_one_in > 0) {
+    // Re-enqueue for retry
+    shared->EnqueueRemoteCompaction(job_id, job_info, serialized_input,
+                                    output_directory, true /* was_cancelled */);
+    return;
+  }
+
+  if (!s.ok()) {
+    if (!StressTest::IsErrorInjectedAndRetryable(s)) {
+      // Print in stdout instead of stderr to avoid stress test failure,
+      // because OpenAndCompact() failure doesn't necessarily mean
+      // primary db instance failure.
+      fprintf(stdout, "Failed to run OpenAndCompact(%s): %s\n",
+              job_info.db_name.c_str(), s.ToString().c_str());
+    }
+  } else {
+    // Track successful completion time
+    successful_compaction_end_to_end_micros = env->NowMicros() - start_micros;
+  }
+
+  // Add the output regardless of status, so that primary DB doesn't rely
+  // on the timeout to finish waiting. The actual failure from the
+  // deserialization can fail the compaction properly
+  shared->AddRemoteCompactionResult(job_id, s, serialized_output);
+}
+
+static void ProcessRemoteCompactionJob(
+    const std::string& job_id, const CompactionServiceJobInfo& job_info,
+    const std::string& serialized_input, const std::string& output_directory,
+    bool was_canceled, SharedState* shared, StressTest* stress_test,
+    Random& rand, uint64_t& successful_compaction_end_to_end_micros) {
+  auto options = stress_test->GetOptions(job_info.cf_id);
+  assert(options.env != nullptr);
+
+  auto override_options = CreateOverrideOptions(options, job_info);
+
+  OpenAndCompactOptions open_compact_options;
+  if (FLAGS_allow_resumption_one_in > 0) {
+    open_compact_options.allow_resumption =
+        rand.OneIn(FLAGS_allow_resumption_one_in);
+  } else {
+    open_compact_options.allow_resumption = false;
+  }
+
+  if (!open_compact_options.allow_resumption) {
+    CleanupOutputDirectory(output_directory);
+  }
+
+  std::shared_ptr<std::atomic<bool>> canceled = nullptr;
+  if (FLAGS_allow_resumption_one_in > 0) {
+    canceled = SetupCancellation(open_compact_options, was_canceled, rand,
+                                 successful_compaction_end_to_end_micros);
+  }
+
+  std::string serialized_output;
+  uint64_t start_micros = options.env->NowMicros();
+
+  Status s = DB::OpenAndCompact(open_compact_options, job_info.db_name,
+                                output_directory, serialized_input,
+                                &serialized_output, override_options);
+
+  ProcessCompactionResult(s, job_id, job_info, serialized_input,
+                          output_directory, serialized_output, shared,
+                          successful_compaction_end_to_end_micros, start_micros,
+                          options.env);
+}
+
+void RemoteCompactionWorkerThread(void* v) {
+  assert(FLAGS_remote_compaction_worker_threads > 0);
+  assert(FLAGS_remote_compaction_worker_interval > 0);
+
+  auto* thread = static_cast<ThreadState*>(v);
+  SharedState* shared = thread->shared;
+  StressTest* stress_test = shared->GetStressTest();
+  assert(stress_test != nullptr);
+
+#ifndef NDEBUG
+  SetupFaultInjectionForRemoteCompaction(shared);
+#endif  // NDEBUG
+
+  // Tracks the duration (in microseconds) of the most recent successfully
+  // completed compaction from start to finish. This value is used in
+  // SetupCancellation() to adaptively set up cancellation point for a
+  // compaction
+  uint64_t successful_compaction_end_to_end_micros = 0;
+  Random rand(static_cast<uint32_t>(FLAGS_seed));
+
+  // Main worker loop
+  while (true) {
+    // Check if we should stop
+    {
+      MutexLock l(shared->GetMutex());
+      if (shared->ShouldStopBgThread()) {
+        shared->IncBgThreadsFinished();
+        if (shared->BgThreadsFinished()) {
+          shared->GetCondVar()->SignalAll();
+        }
+        return;
+      }
+    }
+
+    std::string job_id;
+    CompactionServiceJobInfo job_info;
+    std::string serialized_input;
+    std::string output_directory;
+    bool was_canceled;
+
+    if (shared->DequeueRemoteCompaction(&job_id, &job_info, &serialized_input,
+                                        &output_directory, &was_canceled)) {
+      ProcessRemoteCompactionJob(
+          job_id, job_info, serialized_input, output_directory, was_canceled,
+          shared, stress_test, rand, successful_compaction_end_to_end_micros);
+    }
+
+    db_stress_env->SleepForMicroseconds(
+        thread->rand.Next() % FLAGS_remote_compaction_worker_interval * 1000 +
+        1);
+  }
+}
+
 void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz) {
   if (!FLAGS_verbose) {
     return;
@@ -602,5 +877,24 @@ Status DestroyUnverifiedSubdir(const std::string& dirname) {
   return s;
 }
 
+Status DbStressDestroyDb(const std::string& db_path) {
+  Status s;
+  Options options;
+  // NOTE: using db_stress_listener_env in order to see obsolete MANIFEST files
+  options.env = db_stress_listener_env;
+  // Remove DB files in a principled way to avoid issues
+  if (FLAGS_use_blob_db) {
+    s = blob_db::DestroyBlobDB(db_path, options, blob_db::BlobDBOptions());
+  } else {
+    s = DestroyDB(db_path, options);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  // Remove everything else recursively, only reporting success if able to
+  // delete everything
+  return DestroyDir(db_stress_listener_env, db_path);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 0871a87f9e70..fff3720f150d 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -100,13 +100,14 @@ DECLARE_bool(enable_pipelined_write);
 DECLARE_bool(verify_before_write);
 DECLARE_bool(histogram);
 DECLARE_bool(destroy_db_initially);
+DECLARE_bool(destroy_db_and_exit);
+DECLARE_string(delete_dir_and_exit);
 DECLARE_bool(verbose);
 DECLARE_bool(progress_reports);
 DECLARE_uint64(db_write_buffer_size);
 DECLARE_int32(write_buffer_size);
 DECLARE_int32(max_write_buffer_number);
 DECLARE_int32(min_write_buffer_number_to_merge);
-DECLARE_int32(max_write_buffer_number_to_maintain);
 DECLARE_int64(max_write_buffer_size_to_maintain);
 DECLARE_bool(use_write_buffer_manager);
 DECLARE_double(memtable_prefix_bloom_size_ratio);
@@ -160,6 +161,8 @@ DECLARE_uint64(periodic_compaction_seconds);
 DECLARE_string(daily_offpeak_time_utc);
 DECLARE_uint64(compaction_ttl);
 DECLARE_bool(fifo_allow_compaction);
+DECLARE_uint64(fifo_compaction_max_data_files_size_mb);
+DECLARE_bool(fifo_compaction_use_kv_ratio_compaction);
 DECLARE_bool(allow_concurrent_memtable_write);
 DECLARE_double(experimental_mempurge_threshold);
 DECLARE_bool(enable_write_thread_adaptive_yield);
@@ -174,6 +177,7 @@ DECLARE_uint32(sqfc_version);
 DECLARE_bool(use_sqfc_for_range_queries);
 DECLARE_int32(index_type);
 DECLARE_int32(data_block_index_type);
+DECLARE_int32(index_block_search_type);
 DECLARE_string(db);
 DECLARE_string(secondaries_base);
 DECLARE_bool(test_secondary);
@@ -218,6 +222,7 @@ DECLARE_int32(reset_stats_one_in);
 DECLARE_int32(pause_background_one_in);
 DECLARE_int32(disable_file_deletions_one_in);
 DECLARE_int32(disable_manual_compaction_one_in);
+DECLARE_int32(abort_and_resume_compactions_one_in);
 DECLARE_int32(compact_range_width);
 DECLARE_int32(acquire_snapshot_one_in);
 DECLARE_bool(compare_full_db_state_snapshot);
@@ -249,6 +254,7 @@ DECLARE_string(fs_uri);
 DECLARE_uint64(ops_per_thread);
 DECLARE_uint64(log2_keys_per_lock);
 DECLARE_uint64(max_manifest_file_size);
+DECLARE_int32(max_manifest_space_amp_pct);
 DECLARE_bool(in_place_update);
 DECLARE_string(memtablerep);
 DECLARE_int32(prefix_size);
@@ -276,6 +282,7 @@ DECLARE_string(last_level_temperature);
 DECLARE_string(default_write_temperature);
 DECLARE_string(default_temperature);
 DECLARE_bool(paranoid_memory_checks);
+DECLARE_bool(memtable_veirfy_per_key_checksum_on_seek);
 
 // Options for transaction dbs.
 // Use TransactionDB (a.k.a. Pessimistic Transaction DB)
@@ -285,6 +292,7 @@ DECLARE_bool(use_txn);
 // Options for TransactionDB (a.k.a. Pessimistic Transaction DB)
 DECLARE_uint64(txn_write_policy);
 DECLARE_bool(unordered_write);
+DECLARE_bool(use_per_key_point_lock_mgr);
 
 // Options for OptimisticTransactionDB
 DECLARE_bool(use_optimistic_txn);
@@ -294,11 +302,8 @@ DECLARE_uint32(occ_lock_bucket_count);
 
 // Options for StackableDB-based BlobDB
 DECLARE_bool(use_blob_db);
-DECLARE_uint64(blob_db_min_blob_size);
-DECLARE_uint64(blob_db_bytes_per_sync);
 DECLARE_uint64(blob_db_file_size);
 DECLARE_bool(blob_db_enable_gc);
-DECLARE_double(blob_db_gc_cutoff);
 
 // Options for integrated BlobDB
 DECLARE_bool(allow_setting_blob_options_dynamically);
@@ -321,7 +326,6 @@ DECLARE_int32(approximate_size_one_in);
 DECLARE_bool(best_efforts_recovery);
 DECLARE_bool(skip_verifydb);
 DECLARE_bool(paranoid_file_checks);
-DECLARE_bool(fail_if_options_file_error);
 DECLARE_uint64(batch_protection_bytes_per_key);
 DECLARE_uint32(memtable_protection_bytes_per_key);
 DECLARE_uint32(block_protection_bytes_per_key);
@@ -397,9 +401,9 @@ DECLARE_bool(enable_index_compression);
 DECLARE_uint32(index_shortening);
 DECLARE_uint32(metadata_charge_policy);
 DECLARE_bool(use_adaptive_mutex_lru);
-DECLARE_uint32(compress_format_version);
 DECLARE_uint64(manifest_preallocation_size);
 DECLARE_bool(enable_checksum_handoff);
+DECLARE_string(compression_manager);
 DECLARE_uint64(max_total_wal_size);
 DECLARE_double(high_pri_pool_ratio);
 DECLARE_double(low_pri_pool_ratio);
@@ -409,6 +413,8 @@ DECLARE_uint64(max_sequential_skip_in_iterations);
 DECLARE_bool(enable_sst_partitioner_factory);
 DECLARE_bool(enable_do_not_compress_roles);
 DECLARE_bool(block_align);
+DECLARE_uint64(super_block_alignment_size);
+DECLARE_uint64(super_block_alignment_space_overhead_ratio);
 DECLARE_uint32(lowest_used_cache_tier);
 DECLARE_bool(enable_custom_split_merge);
 DECLARE_uint32(adm_policy);
@@ -420,10 +426,27 @@ DECLARE_uint32(uncache_aggressiveness);
 DECLARE_int32(test_ingest_standalone_range_deletion_one_in);
 DECLARE_bool(allow_unprepared_value);
 DECLARE_string(file_temperature_age_thresholds);
+DECLARE_bool(allow_trivial_copy_when_change_temperature);
 DECLARE_uint32(commit_bypass_memtable_one_in);
 DECLARE_bool(track_and_verify_wals);
-DECLARE_bool(enable_remote_compaction);
+DECLARE_int32(remote_compaction_worker_threads);
+DECLARE_int32(remote_compaction_worker_interval);
+DECLARE_bool(remote_compaction_failure_fall_back_to_local);
+DECLARE_int32(allow_resumption_one_in);
 DECLARE_bool(auto_refresh_iterator_with_snapshot);
+DECLARE_uint32(memtable_op_scan_flush_trigger);
+DECLARE_uint32(memtable_avg_op_scan_flush_trigger);
+DECLARE_uint32(ingest_wbwi_one_in);
+DECLARE_bool(universal_reduce_file_locking);
+DECLARE_bool(use_multiscan);
+DECLARE_bool(multiscan_use_async_io);
+
+// Compaction deletion trigger declarations for stress testing
+DECLARE_bool(enable_compaction_on_deletion_trigger);
+DECLARE_uint64(compaction_on_deletion_min_file_size);
+DECLARE_int32(compaction_on_deletion_trigger_count);
+DECLARE_int32(compaction_on_deletion_window_size);
+DECLARE_double(compaction_on_deletion_ratio);
 
 constexpr long KB = 1024;
 constexpr int kRandomValueMaxFactor = 3;
@@ -751,6 +774,8 @@ void PoolSizeChangeThread(void* v);
 
 void DbVerificationThread(void* v);
 
+void RemoteCompactionWorkerThread(void* v);
+
 void CompressedCacheSetCapacityThread(void* v);
 
 void TimestampedSnapshotsThread(void* v);
@@ -797,5 +822,10 @@ Status SaveFilesInDirectory(const std::string& src_dirname,
                             const std::string& dst_dirname);
 Status DestroyUnverifiedSubdir(const std::string& dirname);
 Status InitUnverifiedSubdir(const std::string& dirname);
+
+// Destroy the DB at the given path under the env configured for db_stress.
+// Handles both regular DB and BlobDB, and cleans and removes the entire dir.
+Status DbStressDestroyDb(const std::string& db_path);
+
 }  // namespace ROCKSDB_NAMESPACE
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_compaction_service.cc b/db_stress_tool/db_stress_compaction_service.cc
new file mode 100644
index 000000000000..b64fe56095e6
--- /dev/null
+++ b/db_stress_tool/db_stress_compaction_service.cc
@@ -0,0 +1,61 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifdef GFLAGS
+
+#include "db_stress_tool/db_stress_compaction_service.h"
+
+#include <string>
+
+#include "db_stress_tool/db_stress_test_base.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+CompactionServiceJobStatus DbStressCompactionService::Wait(
+    const std::string& scheduled_job_id, std::string* result) {
+  while (true) {
+    if (aborted_.load()) {
+      return CompactionServiceJobStatus::kAborted;
+    }
+    const auto& maybeResultStatus =
+        shared_->GetRemoteCompactionResult(scheduled_job_id, result);
+    if (maybeResultStatus.has_value()) {
+      auto s = maybeResultStatus.value();
+      if (s.ok()) {
+        assert(result);
+        assert(!result->empty());
+        return CompactionServiceJobStatus::kSuccess;
+      } else {
+        // Remote Compaction failed
+        if (failure_should_fall_back_to_local_) {
+          return CompactionServiceJobStatus::kUseLocal;
+        }
+        if (StressTest::IsErrorInjectedAndRetryable(s)) {
+          return CompactionServiceJobStatus::kUseLocal;
+        }
+        if (result && result->empty()) {
+          // If result is empty, set the compaction status in the result so
+          // that it can be bubbled up to main thread
+          CompactionServiceResult compaction_result;
+          compaction_result.status = s;
+          if (compaction_result.Write(result).ok()) {
+            assert(result);
+            assert(!result->empty());
+          }
+        }
+        return CompactionServiceJobStatus::kFailure;
+      }
+    } else {
+      // Remote Compaction is still running
+      Env::Default()->SleepForMicroseconds(kWaitIntervalInMicros);
+    }
+  }
+  return CompactionServiceJobStatus::kFailure;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_compaction_service.h b/db_stress_tool/db_stress_compaction_service.h
index f1fc04ea4467..a3566cef52a2 100644
--- a/db_stress_tool/db_stress_compaction_service.h
+++ b/db_stress_tool/db_stress_compaction_service.h
@@ -3,37 +3,91 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 
+#ifdef GFLAGS
 #pragma once
 
+#include "db/compaction/compaction_job.h"
+#include "db_stress_shared_state.h"
 #include "rocksdb/options.h"
+#include "utilities/fault_injection_fs.h"
 
 namespace ROCKSDB_NAMESPACE {
 
 // Service to simulate Remote Compaction in Stress Test
 class DbStressCompactionService : public CompactionService {
  public:
-  explicit DbStressCompactionService() {}
+  explicit DbStressCompactionService(SharedState* shared,
+                                     bool failure_should_fall_back_to_local)
+      : shared_(shared),
+        aborted_(false),
+        failure_should_fall_back_to_local_(failure_should_fall_back_to_local) {}
 
   static const char* kClassName() { return "DbStressCompactionService"; }
 
   const char* Name() const override { return kClassName(); }
 
+  static constexpr uint64_t kWaitIntervalInMicros = 10 * 1000;  // 10ms
+
+  static constexpr const char* kTempOutputDirectoryPrefix = "tmp_output_";
+
   CompactionServiceScheduleResponse Schedule(
-      const CompactionServiceJobInfo& /*info*/,
-      const std::string& /*compaction_service_input*/) override {
+      const CompactionServiceJobInfo& info,
+      const std::string& compaction_service_input) override {
+    std::string job_id = info.db_id + "_" + info.db_session_id + "_" +
+                         std::to_string(info.job_id);
+
+    if (aborted_.load()) {
+      return CompactionServiceScheduleResponse(
+          job_id, CompactionServiceJobStatus::kUseLocal);
+    }
+    std::string output_directory = info.db_name + "/" +
+                                   kTempOutputDirectoryPrefix +
+                                   Env::Default()->GenerateUniqueId();
+
+    shared_->EnqueueRemoteCompaction(
+        job_id, info, compaction_service_input, output_directory,
+        false /* was_cancelled */);  // Not canceled initially
     CompactionServiceScheduleResponse response(
-        "Implement Me", CompactionServiceJobStatus::kUseLocal);
+        job_id, CompactionServiceJobStatus::kSuccess);
     return response;
   }
 
-  CompactionServiceJobStatus Wait(const std::string& /*scheduled_job_id*/,
-                                  std::string* /*result*/) override {
-    // TODO - Implement
-    return CompactionServiceJobStatus::kUseLocal;
+  CompactionServiceJobStatus Wait(const std::string& scheduled_job_id,
+                                  std::string* result) override;
+
+  void OnInstallation(const std::string& scheduled_job_id,
+                      CompactionServiceJobStatus /*status*/) override {
+    // Clean up tmp directory
+    std::string serialized;
+    CompactionServiceResult result;
+    if (shared_->GetRemoteCompactionResult(scheduled_job_id, &serialized)
+            .has_value()) {
+      if (CompactionServiceResult::Read(serialized, &result).ok()) {
+        std::vector<std::string> filenames;
+        Status s = Env::Default()->GetChildren(result.output_path, &filenames);
+        for (size_t i = 0; s.ok() && i < filenames.size(); ++i) {
+          s = Env::Default()->DeleteFile(result.output_path + "/" +
+                                         filenames[i]);
+          if (!s.ok()) {
+            // TODO - Handle clean up failure?
+            break;
+          }
+        }
+        if (s.ok()) {
+          Env::Default()->DeleteDir(result.output_path).PermitUncheckedError();
+        }
+      }
+      shared_->RemoveRemoteCompactionResult(scheduled_job_id);
+    }
   }
 
-  // TODO - Implement
-  void CancelAwaitingJobs() override {}
-};
+  void CancelAwaitingJobs() override { aborted_.store(true); }
 
+ private:
+  SharedState* shared_;
+  std::atomic_bool aborted_{false};
+  bool failure_should_fall_back_to_local_;
+};
 }  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_compression_manager.cc b/db_stress_tool/db_stress_compression_manager.cc
new file mode 100644
index 000000000000..9746c490333f
--- /dev/null
+++ b/db_stress_tool/db_stress_compression_manager.cc
@@ -0,0 +1,28 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include "db_stress_compression_manager.h"
+
+#include "rocksdb/utilities/object_registry.h"
+
+namespace ROCKSDB_NAMESPACE {
+void DbStressCustomCompressionManager::Register() {
+  // We must register any compression managers with a custom
+  // CompatibilityName() so that if it was used in a past invocation but not
+  // the current invocation, we can still read the SST files requiring it.
+  static std::once_flag loaded;
+  std::call_once(loaded, [&]() {
+    TEST_AllowUnsupportedFormatVersion() = true;
+    auto& library = *ObjectLibrary::Default();
+    library.AddFactory<CompressionManager>(
+        DbStressCustomCompressionManager().CompatibilityName(),
+        [](const std::string& /*uri*/,
+           std::unique_ptr<CompressionManager>* guard,
+           std::string* /*errmsg*/) {
+          *guard = std::make_unique<DbStressCustomCompressionManager>();
+          return guard->get();
+        });
+  });
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/db_stress_tool/db_stress_compression_manager.h b/db_stress_tool/db_stress_compression_manager.h
new file mode 100644
index 000000000000..8438a6583c7d
--- /dev/null
+++ b/db_stress_tool/db_stress_compression_manager.h
@@ -0,0 +1,67 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DbStressCustomCompressionManager : public CompressionManager {
+ public:
+  const char* Name() const override {
+    return "DbStressCustomCompressionManager";
+  }
+  const char* CompatibilityName() const override { return "DbStressCustom1"; }
+
+  bool SupportsCompressionType(CompressionType type) const override {
+    return default_->SupportsCompressionType(type) ||
+           type == kCustomCompressionAA || type == kCustomCompressionAB ||
+           type == kCustomCompressionAC;
+  }
+
+  std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
+                                            CompressionType type) override {
+    // db_stress never specifies a custom type, so we randomly use them anyway
+    // when this compression manager is used.
+    std::array<CompressionType, 4> choices = {
+        type, kCustomCompressionAA, kCustomCompressionAB, kCustomCompressionAC};
+    type = choices[Random::GetTLSInstance()->Uniform(4)];
+    switch (static_cast<unsigned char>(type)) {
+      case kCustomCompressionAA:
+        return std::make_unique<
+            test::CompressorCustomAlg<kCustomCompressionAA>>();
+      case kCustomCompressionAB:
+        return std::make_unique<
+            test::CompressorCustomAlg<kCustomCompressionAB>>();
+      case kCustomCompressionAC:
+        return std::make_unique<
+            test::CompressorCustomAlg<kCustomCompressionAC>>();
+      // Also support built-in compression algorithms
+      default:
+        return GetBuiltinV2CompressionManager()->GetCompressor(opts, type);
+    }
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressor() override {
+    return std::make_shared<test::DecompressorCustomAlg>();
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressorForTypes(
+      const CompressionType* types_begin,
+      const CompressionType* types_end) override {
+    auto decomp = std::make_shared<test::DecompressorCustomAlg>();
+    decomp->SetAllowedTypes(types_begin, types_end);
+    return decomp;
+  }
+
+  static void Register();
+
+ protected:
+  std::shared_ptr<CompressionManager> default_ =
+      GetBuiltinV2CompressionManager();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc
index d5fb3e643652..aa93de97ec4a 100644
--- a/db_stress_tool/db_stress_driver.cc
+++ b/db_stress_tool/db_stress_driver.cc
@@ -102,6 +102,14 @@ bool RunStressTestImpl(SharedState* shared) {
     shared->IncBgThreads();
   }
 
+  uint32_t remote_compaction_worker_thread_count =
+      FLAGS_remote_compaction_worker_threads;
+  if (remote_compaction_worker_thread_count > 0) {
+    for (uint32_t i = 0; i < remote_compaction_worker_thread_count; i++) {
+      shared->IncBgThreads();
+    }
+  }
+
   std::vector<ThreadState*> threads(n);
   for (uint32_t i = 0; i < n; i++) {
     threads[i] = new ThreadState(i, shared);
@@ -126,6 +134,17 @@ bool RunStressTestImpl(SharedState* shared) {
                                &compressed_cache_set_capacity_thread);
   }
 
+  std::vector<ThreadState*> remote_compaction_worker_threads;
+  if (remote_compaction_worker_thread_count > 0) {
+    remote_compaction_worker_threads.reserve(
+        remote_compaction_worker_thread_count);
+    for (uint32_t i = 0; i < remote_compaction_worker_thread_count; i++) {
+      ThreadState* ts = new ThreadState(i, shared);
+      remote_compaction_worker_threads.push_back(ts);
+      db_stress_env->StartThread(RemoteCompactionWorkerThread, ts);
+    }
+  }
+
   // Each thread goes through the following states:
   // initializing -> wait for others to init -> read/populate/depopulate
   // wait for others to operate -> verify -> done
@@ -218,6 +237,7 @@ bool RunStressTestImpl(SharedState* shared) {
     delete threads[i];
     threads[i] = nullptr;
   }
+
   now = clock->NowMicros();
   if (!FLAGS_skip_verifydb && !FLAGS_test_batches_snapshots &&
       !shared->HasVerificationFailedYet()) {
@@ -232,7 +252,8 @@ bool RunStressTestImpl(SharedState* shared) {
   if (FLAGS_compaction_thread_pool_adjust_interval > 0 ||
       FLAGS_continuous_verification_interval > 0 ||
       FLAGS_compressed_secondary_cache_size > 0 ||
-      FLAGS_compressed_secondary_cache_ratio > 0.0) {
+      FLAGS_compressed_secondary_cache_ratio > 0.0 ||
+      remote_compaction_worker_thread_count > 0) {
     MutexLock l(shared->GetMutex());
     shared->SetShouldStopBgThread();
     while (!shared->BgThreadsFinished()) {
@@ -240,6 +261,15 @@ bool RunStressTestImpl(SharedState* shared) {
     }
   }
 
+  assert(remote_compaction_worker_threads.size() ==
+         remote_compaction_worker_thread_count);
+  if (remote_compaction_worker_thread_count > 0) {
+    for (ThreadState* thread_state : remote_compaction_worker_threads) {
+      delete thread_state;
+    }
+    remote_compaction_worker_threads.clear();
+  }
+
   if (shared->HasVerificationFailedYet()) {
     fprintf(stderr, "Verification failed :(\n");
     return false;
diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h
index 5ea9e8b6ef1c..4186bc41f653 100644
--- a/db_stress_tool/db_stress_env_wrapper.h
+++ b/db_stress_tool/db_stress_env_wrapper.h
@@ -9,8 +9,11 @@
 
 #ifdef GFLAGS
 #pragma once
+
 #include "db_stress_tool/db_stress_common.h"
+#include "file/filename.h"
 #include "monitoring/thread_status_util.h"
+#include "rocksdb/file_checksum.h"
 
 namespace ROCKSDB_NAMESPACE {
 namespace {
@@ -173,6 +176,35 @@ class DbStressFSWrapper : public FileSystemWrapper {
                                const FileOptions& file_opts,
                                std::unique_ptr<FSRandomAccessFile>* r,
                                IODebugContext* dbg) override {
+    // verify that file checksums are propagated through FileOptions
+    // for SST file opens.
+
+    std::string basename = f.substr(f.rfind('/') + 1);
+    uint64_t file_number;
+    FileType file_type;
+    if (ParseFileName(basename, &file_number, &file_type) &&
+        file_type == kTableFile) {
+      // file_checksum_func_name must always be populated to be sure each call
+      // site within RocksDB is intentional about populating the fields with the
+      // best available information:
+      //  - kNoFileChecksumFuncName: no checksum context available
+      //    (e.g., SstFileDumper, SstFileReader, checksum generation),
+      //    always paired with empty checksum
+      //  - kUnknownFileChecksumFuncName: file created without a
+      //    checksum factory (from MANIFEST), always paired with
+      //    empty checksum
+      //  - a real name (e.g., "FileChecksumCrc32c"): checksum exists
+      assert(!file_opts.file_checksum_func_name.empty());
+      if (file_opts.file_checksum_func_name == kUnknownFileChecksumFuncName ||
+          file_opts.file_checksum_func_name == kNoFileChecksumFuncName) {
+        // No checksum available — checksum value must be empty
+        assert(file_opts.file_checksum.empty());
+      } else {
+        // A real checksum function — checksum value must be present
+        assert(!file_opts.file_checksum.empty());
+      }
+    }
+
     std::unique_ptr<FSRandomAccessFile> file;
     IOStatus s = target()->NewRandomAccessFile(f, file_opts, &file, dbg);
     if (s.ok()) {
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 94028f07b40c..19b4c602e7c3 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -135,6 +135,14 @@ DEFINE_bool(histogram, false, "Print histogram of operation timings");
 DEFINE_bool(destroy_db_initially, true,
             "Destroys the database dir before start if this is true");
 
+DEFINE_bool(destroy_db_and_exit, false,
+            "Destroys the database dir and exits. Useful for cleanup without "
+            "running stress test. Other options are mostly ignored.");
+
+DEFINE_string(delete_dir_and_exit, "",
+              "Recursively deletes the specified directory and exits. "
+              "Useful for cleaning up TEST_TMPDIR after crash tests.");
+
 DEFINE_bool(verbose, false, "Verbose");
 
 DEFINE_bool(progress_reports, true,
@@ -168,20 +176,6 @@ DEFINE_int32(min_write_buffer_number_to_merge,
              "writing less data to storage if there are duplicate records in"
              " each of these individual write buffers.");
 
-DEFINE_int32(max_write_buffer_number_to_maintain,
-             ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
-             "The total maximum number of write buffers to maintain in memory "
-             "including copies of buffers that have already been flushed. "
-             "Unlike max_write_buffer_number, this parameter does not affect "
-             "flushing. This controls the minimum amount of write history "
-             "that will be available in memory for conflict checking when "
-             "Transactions are used. If this value is too low, some "
-             "transactions may fail at commit time due to not being able to "
-             "determine whether there were any write conflicts. Setting this "
-             "value to 0 will cause write buffers to be freed immediately "
-             "after they are flushed.  If this value is set to -1, "
-             "'max_write_buffer_number' will be used.");
-
 DEFINE_int64(max_write_buffer_size_to_maintain,
              ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
              "The total maximum size of write buffers to maintain in memory "
@@ -421,6 +415,17 @@ DEFINE_bool(fifo_allow_compaction, false,
             "If true, set `Options::compaction_options_fifo.allow_compaction = "
             "true`. It only take effect when FIFO compaction is used.");
 
+DEFINE_uint64(fifo_compaction_max_data_files_size_mb, 0,
+              "If non-zero, set "
+              "`Options::compaction_options_fifo.max_data_files_size` to this "
+              "value (in MB). Only takes effect with FIFO compaction.");
+
+DEFINE_bool(fifo_compaction_use_kv_ratio_compaction, false,
+            "If true, set "
+            "`Options::compaction_options_fifo.use_kv_ratio_compaction = "
+            "true`. Requires fifo_allow_compaction and "
+            "fifo_compaction_max_data_files_size_mb > 0.");
+
 DEFINE_bool(allow_concurrent_memtable_write, false,
             "Allow multi-writers to update mem tables in parallel.");
 
@@ -435,17 +440,6 @@ DEFINE_bool(enable_write_thread_adaptive_yield,
 // Options for StackableDB-based BlobDB
 DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Use BlobDB.");
 
-DEFINE_uint64(
-    blob_db_min_blob_size,
-    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
-    "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
-    "smaller than this will be inlined with the key in the LSM tree.");
-
-DEFINE_uint64(
-    blob_db_bytes_per_sync,
-    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
-    "[Stacked BlobDB] Sync blob files once per every N bytes written.");
-
 DEFINE_uint64(blob_db_file_size,
               ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
               "[Stacked BlobDB] Target size of each blob file.");
@@ -455,11 +449,6 @@ DEFINE_bool(
     ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
     "[Stacked BlobDB] Enable BlobDB garbage collection.");
 
-DEFINE_double(
-    blob_db_gc_cutoff,
-    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
-    "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
-
 // Options for integrated BlobDB
 DEFINE_bool(allow_setting_blob_options_dynamically, false,
             "[Integrated BlobDB] Allow setting blob options dynamically.");
@@ -481,7 +470,9 @@ DEFINE_uint64(blob_file_size,
 DEFINE_string(blob_compression_type, "none",
               "[Integrated BlobDB] The compression algorithm to use for large "
               "values stored in blob files.");
-
+DEFINE_string(compression_manager, "mixed",
+              "Ability to change compression manager specified in "
+              "simple_mixed_manager.h (mixed -> roundRobin)");
 DEFINE_bool(enable_blob_garbage_collection,
             ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                 .enable_blob_garbage_collection,
@@ -550,6 +541,9 @@ DEFINE_string(file_temperature_age_thresholds, "",
               "See CompactionOptionsFIFO::file_temperature_age_thresholds. "
               "empty == unset");
 
+DEFINE_bool(allow_trivial_copy_when_change_temperature, true,
+            "Allow kChangeTemperature to do trivial copy");
+
 static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
     RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
 
@@ -613,6 +607,12 @@ DEFINE_int32(
         ROCKSDB_NAMESPACE::BlockBasedTableOptions().data_block_index_type),
     "Index type for data blocks (see `enum DataBlockIndexType` in table.h)");
 
+DEFINE_int32(index_block_search_type,
+             static_cast<int32_t>(ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+                                      .index_block_search_type),
+             "Search algorithm for index blocks (see `enum BlockSearchType` in "
+             "table.h)");
+
 DEFINE_string(db, "", "Use the db with the following name.");
 
 DEFINE_string(secondaries_base, "",
@@ -733,6 +733,10 @@ DEFINE_uint64(txn_write_policy, 0,
               "TxnDBWritePolicy::WRITE_COMMITTED. Note that this should not be "
               "changed across crashes.");
 
+DEFINE_bool(use_per_key_point_lock_mgr, true,
+            "Use PointLockManager(false) or PerKeyPointLockManager(true) in "
+            "TransactionDB.");
+
 DEFINE_bool(use_optimistic_txn, false, "Use OptimisticTransactionDB.");
 DEFINE_uint64(occ_validation_policy, 1,
               "Optimistic Concurrency Control Validation Policy for "
@@ -813,6 +817,10 @@ DEFINE_int32(
     "If non-zero, then DisableManualCompaction()+Enable will be called "
     "once for every N ops on average.  0 disables.");
 
+DEFINE_int32(abort_and_resume_compactions_one_in, 0,
+             "If non-zero, then AbortAllCompactions()+Resume will be called "
+             "once for every N ops on average. 0 disables.");
+
 DEFINE_int32(compact_range_width, 10000,
              "The width of the ranges passed to CompactRange().");
 
@@ -853,8 +861,28 @@ DEFINE_bool(track_and_verify_wals,
             ROCKSDB_NAMESPACE::Options().track_and_verify_wals,
             "See Options::track_and_verify_wals");
 
-DEFINE_bool(enable_remote_compaction, false,
-            "Enable (simulated) Remote Compaction");
+DEFINE_int32(
+    remote_compaction_worker_threads, 2,
+    "Remote Compaction Worker Thread count. If 0, remote compaction is "
+    "disabled");
+
+DEFINE_int32(remote_compaction_worker_interval, 10,
+             "Remote Compaction Worker Thread dequeue tasks every N "
+             "milliseconds. (Default: 10ms)");
+
+DEFINE_bool(remote_compaction_failure_fall_back_to_local, true,
+            "If true, remote compaction failures will be ignored and "
+            "compactions will fall back to local and retried");
+
+DEFINE_int32(allow_resumption_one_in, 0,
+             "If non-zero, enable resumable compaction with 1/N probability "
+             "for each OpenAndCompact call.Requires "
+             "remote_compaction_worker_threads > 0");
+
+DEFINE_uint32(ingest_wbwi_one_in, 0,
+              "If set, will call"
+              "IngestWriteBatchWithIndex() instead of regular write operations "
+              "once every N writes.");
 
 static bool ValidateInt32Percent(const char* flagname, int32_t value) {
   if (value < 0 || value > 100) {
@@ -963,7 +991,11 @@ DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
 static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((__unused__)) =
     RegisterFlagValidator(&FLAGS_log2_keys_per_lock, &ValidateUint32Range);
 
-DEFINE_uint64(max_manifest_file_size, 16384, "Maximum size of a MANIFEST file");
+DEFINE_uint64(max_manifest_file_size, 16384,
+              "Maximum size of a MANIFEST file (without auto-tuning)");
+
+DEFINE_int32(max_manifest_space_amp_pct, 500,
+             "Max manifest space amp percentage for auto-tuning");
 
 DEFINE_bool(in_place_update, false, "On true, does inplace update in memtable");
 
@@ -1098,10 +1130,6 @@ DEFINE_bool(paranoid_file_checks, true,
             "After writing every SST file, reopen it and read all the keys "
             "and validate checksums");
 
-DEFINE_bool(fail_if_options_file_error, false,
-            "Fail operations that fail to detect or properly persist options "
-            "file.");
-
 DEFINE_uint64(batch_protection_bytes_per_key, 0,
               "If nonzero, enables integrity protection in `WriteBatch` at the "
               "specified number of bytes per key. Currently the only supported "
@@ -1379,12 +1407,6 @@ DEFINE_bool(use_adaptive_mutex_lru,
             ROCKSDB_NAMESPACE::LRUCacheOptions().use_adaptive_mutex,
             "LRUCacheOptions.use_adaptive_mutex");
 
-DEFINE_uint32(
-    compress_format_version,
-    static_cast<uint32_t>(ROCKSDB_NAMESPACE::CompressedSecondaryCacheOptions()
-                              .compress_format_version),
-    "CompressedSecondaryCacheOptions.compress_format_version");
-
 DEFINE_uint64(manifest_preallocation_size,
               ROCKSDB_NAMESPACE::Options().manifest_preallocation_size,
               "Options.manifest_preallocation_size");
@@ -1426,6 +1448,17 @@ DEFINE_bool(block_align,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
             "BlockBasedTableOptions.block_align");
 
+DEFINE_uint64(
+    super_block_alignment_size,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().super_block_alignment_size,
+    "BlockBasedTableOptions.super_block_alignment_size");
+
+DEFINE_uint64(
+    super_block_alignment_space_overhead_ratio,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+        .super_block_alignment_space_overhead_ratio,
+    "BlockBasedTableOptions.super_block_alignment_space_overhead_ratio");
+
 DEFINE_uint32(
     lowest_used_cache_tier,
     static_cast<uint32_t>(ROCKSDB_NAMESPACE::Options().lowest_used_cache_tier),
@@ -1478,13 +1511,63 @@ DEFINE_bool(paranoid_memory_checks,
             ROCKSDB_NAMESPACE::Options().paranoid_memory_checks,
             "Sets CF option paranoid_memory_checks.");
 
+DEFINE_bool(
+    memtable_veirfy_per_key_checksum_on_seek,
+    ROCKSDB_NAMESPACE::Options().memtable_veirfy_per_key_checksum_on_seek,
+    "Sets CF option memtable_veirfy_per_key_checksum_on_seek.");
+
 DEFINE_uint32(commit_bypass_memtable_one_in, 0,
               "If greater than zero, transaction option will set "
               "commit_bypass_memtable to per every N transactions on average.");
 
+// Compaction on deletion trigger flags
+DEFINE_bool(enable_compaction_on_deletion_trigger, false,
+            "Enable CompactOnDeletionCollectorFactory for stress testing "
+            "deletion-triggered compaction scenarios.");
+
+DEFINE_uint64(compaction_on_deletion_min_file_size, 32 * 1024,
+              "Minimum file size (in bytes) for deletion-triggered compaction. "
+              "Files smaller than this will not trigger compaction even if "
+              "deletion ratio is exceeded. Default: 32KB");
+
+DEFINE_int32(compaction_on_deletion_trigger_count, 50,
+             "Number of deletions that triggers compaction when deletion "
+             "ratio is exceeded. Default: 50");
+
+DEFINE_int32(compaction_on_deletion_window_size, 100,
+             "Size of the sliding window for tracking deletions. "
+             "Default: 100");
+
+DEFINE_double(compaction_on_deletion_ratio, 0.5,
+              "Deletion ratio threshold for triggering compaction. "
+              "Default: 0.5 (50%)");
+
 DEFINE_bool(
     auto_refresh_iterator_with_snapshot,
     ROCKSDB_NAMESPACE::ReadOptions().auto_refresh_iterator_with_snapshot,
     "ReadOptions.auto_refresh_iterator_with_snapshot");
 
+DEFINE_uint32(
+    memtable_op_scan_flush_trigger,
+    ROCKSDB_NAMESPACE::ColumnFamilyOptions().memtable_op_scan_flush_trigger,
+    "Sets CF option memtable_op_scan_flush_trigger.");
+
+DEFINE_uint32(
+    memtable_avg_op_scan_flush_trigger,
+    ROCKSDB_NAMESPACE::ColumnFamilyOptions().memtable_avg_op_scan_flush_trigger,
+    "Sets CF option memtable_avg_op_scan_flush_trigger.");
+
+DEFINE_bool(
+    universal_reduce_file_locking,
+    ROCKSDB_NAMESPACE::ColumnFamilyOptions()
+        .compaction_options_universal.reduce_file_locking,
+    "Sets "
+    "ColumnFamilyOptions().compaciton_options_universal.reduce_file_locking.");
+
+DEFINE_bool(use_multiscan, false,
+            "If set, use the batched MultiScan API for scans.");
+
+DEFINE_bool(multiscan_use_async_io, false,
+            "If set, enable async_io for MultiScan operations.");
+
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_listener.h b/db_stress_tool/db_stress_listener.h
index 35c70b5a1036..fd28d5b4ced0 100644
--- a/db_stress_tool/db_stress_listener.h
+++ b/db_stress_tool/db_stress_listener.h
@@ -9,6 +9,7 @@
 #include <mutex>
 #include <unordered_set>
 
+#include "db_stress_tool/db_stress_compaction_service.h"
 #include "db_stress_tool/db_stress_shared_state.h"
 #include "file/filename.h"
 #include "file/writable_file_writer.h"
@@ -21,7 +22,6 @@
 #include "util/gflags_compat.h"
 #include "util/random.h"
 #include "utilities/fault_injection_fs.h"
-
 DECLARE_int32(compact_files_one_in);
 
 extern std::shared_ptr<ROCKSDB_NAMESPACE::FaultInjectionTestFS> fault_fs_guard;
@@ -265,7 +265,7 @@ class DbStressListener : public EventListener {
       fault_fs_guard->DisableAllThreadLocalErrorInjection();
       // TODO(hx235): only exempt the flush thread during error recovery instead
       // of all the flush threads from error injection
-      fault_fs_guard->SetIOActivtiesExcludedFromFaultInjection(
+      fault_fs_guard->SetIOActivitiesExcludedFromFaultInjection(
           {Env::IOActivity::kFlush});
     }
   }
@@ -275,7 +275,7 @@ class DbStressListener : public EventListener {
     RandomSleep();
     if (FLAGS_error_recovery_with_no_fault_injection && fault_fs_guard) {
       fault_fs_guard->EnableAllThreadLocalErrorInjection();
-      fault_fs_guard->SetIOActivtiesExcludedFromFaultInjection({});
+      fault_fs_guard->SetIOActivitiesExcludedFromFaultInjection({});
     }
   }
 
@@ -310,6 +310,11 @@ class DbStressListener : public EventListener {
         }
       }
     }
+    // We can't do exact matching since remote workers use dynamic temp paths
+    if (file_dir.find(DbStressCompactionService::kTempOutputDirectoryPrefix) !=
+        std::string::npos) {
+      return;
+    }
     assert(false);
 #else
     (void)file_dir;
diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h
index 5d9fb34ac10c..b4546cd3bad2 100644
--- a/db_stress_tool/db_stress_shared_state.h
+++ b/db_stress_tool/db_stress_shared_state.h
@@ -51,6 +51,24 @@ DECLARE_bool(enable_compaction_filter);
 namespace ROCKSDB_NAMESPACE {
 class StressTest;
 
+struct RemoteCompactionQueueItem {
+  std::string job_id;
+  CompactionServiceJobInfo job_info;
+  std::string serialized_input;
+  std::string output_directory;
+  bool canceled;
+
+  RemoteCompactionQueueItem(const std::string& id,
+                            const CompactionServiceJobInfo& info,
+                            const std::string& input,
+                            const std::string& output_dir, bool was_canceled)
+      : job_id(id),
+        job_info(info),
+        serialized_input(input),
+        output_directory(output_dir),
+        canceled(was_canceled) {}
+};
+
 // State shared by all concurrent executions of the same benchmark.
 class SharedState {
  public:
@@ -137,7 +155,7 @@ class SharedState {
     for (int i = 0; i < FLAGS_column_families; ++i) {
       key_locks_[i].reset(new port::Mutex[num_locks]);
     }
-    if (FLAGS_read_fault_one_in) {
+    if (FLAGS_read_fault_one_in || FLAGS_metadata_read_fault_one_in) {
 #ifdef NDEBUG
       // Unsupported in release mode because it relies on
       // `IGNORE_STATUS_IF_ERROR` to distinguish faults not expected to lead to
@@ -276,6 +294,64 @@ class SharedState {
     return expected_state_manager_->GetPersistedSeqno();
   }
 
+  void EnqueueRemoteCompaction(const std::string& job_id,
+                               const CompactionServiceJobInfo& job_info,
+                               const std::string& serialized_input,
+                               const std::string& output_directory,
+                               bool canceled) {
+    MutexLock l(&remote_compaction_queue_mu_);
+    remote_compaction_queue_.emplace(job_id, job_info, serialized_input,
+                                     output_directory, canceled);
+  }
+
+  bool DequeueRemoteCompaction(std::string* job_id,
+                               CompactionServiceJobInfo* job_info,
+                               std::string* serialized_input,
+                               std::string* output_directory, bool* canceled) {
+    assert(job_id);
+    assert(job_info);
+    assert(serialized_input);
+    assert(output_directory);
+    assert(canceled);
+    MutexLock l(&remote_compaction_queue_mu_);
+    if (!remote_compaction_queue_.empty()) {
+      const RemoteCompactionQueueItem& item = remote_compaction_queue_.front();
+      *job_id = item.job_id;
+      *job_info = item.job_info;
+      *serialized_input = item.serialized_input;
+      *output_directory = item.output_directory;
+      *canceled = item.canceled;
+      remote_compaction_queue_.pop();
+      return true;
+    }
+    return false;
+  }
+
+  void AddRemoteCompactionResult(const std::string& job_id,
+                                 const Status& status,
+                                 const std::string& result) {
+    MutexLock l(&remote_compaction_result_map_mu_);
+    remote_compaction_result_map_.emplace(
+        job_id, std::pair<Status, std::string>{status, result});
+  }
+
+  std::optional<Status> GetRemoteCompactionResult(const std::string& job_id,
+                                                  std::string* result) {
+    MutexLock l(&remote_compaction_result_map_mu_);
+    if (remote_compaction_result_map_.find(job_id) !=
+        remote_compaction_result_map_.end()) {
+      const auto& pair = remote_compaction_result_map_.at(job_id);
+      *result = pair.second;
+      return pair.first;
+    }
+    return std::nullopt;
+  }
+
+  void RemoveRemoteCompactionResult(const std::string& job_id) {
+    MutexLock l(&remote_compaction_result_map_mu_);
+    remote_compaction_result_map_.erase(job_id);
+  }
+
   // Prepare a Put that will be started but not finish yet
   // This is useful for crash-recovery testing when the process may crash
   // before updating the corresponding expected value
@@ -430,6 +506,15 @@ class SharedState {
   std::atomic<bool> verification_failure_;
   std::atomic<bool> should_stop_test_;
 
+  // Queue for the remote compaction.
+  port::Mutex remote_compaction_queue_mu_;
+  std::queue<RemoteCompactionQueueItem> remote_compaction_queue_;
+  // Result Map for the remote compaciton. Key is the scheduled_job_id and value
+  // is serialized compaction_service_result
+  port::Mutex remote_compaction_result_map_mu_;
+  std::unordered_map<std::string, std::pair<Status, std::string>>
+      remote_compaction_result_map_;
+
   // Keys that should not be overwritten
   const std::unordered_set<int64_t> no_overwrite_ids_;
 
diff --git a/db_stress_tool/db_stress_stat.cc b/db_stress_tool/db_stress_stat.cc
deleted file mode 100644
index 6a7883a52ac7..000000000000
--- a/db_stress_tool/db_stress_stat.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-
-#ifdef GFLAGS
-
-#include "db_stress_tool/db_stress_stat.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
-std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats_secondaries;
-
-}  // namespace ROCKSDB_NAMESPACE
-
-#endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_stat.h b/db_stress_tool/db_stress_stat.h
index 5b38c6e2bb5d..e4a8a8fb5999 100644
--- a/db_stress_tool/db_stress_stat.h
+++ b/db_stress_tool/db_stress_stat.h
@@ -22,10 +22,6 @@ DECLARE_bool(progress_reports);
 
 namespace ROCKSDB_NAMESPACE {
 
-// Database statistics
-extern std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
-extern std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats_secondaries;
-
 class Stats {
  private:
   uint64_t start_;
diff --git a/db_stress_tool/db_stress_table_properties_collector.h b/db_stress_tool/db_stress_table_properties_collector.h
index 4723f6fc5d2f..b3f76e446436 100644
--- a/db_stress_tool/db_stress_table_properties_collector.h
+++ b/db_stress_tool/db_stress_table_properties_collector.h
@@ -26,25 +26,50 @@ class DbStressTablePropertiesCollector : public TablePropertiesCollector {
   Status AddUserKey(const Slice& /* key */, const Slice& /* value */,
                     EntryType /*type*/, SequenceNumber /*seq*/,
                     uint64_t /*file_size*/) override {
+    ++keys_added;
+    ++all_calls;
     return Status::OK();
   }
 
-  Status Finish(UserCollectedProperties* /* properties */) override {
+  void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                uint64_t /* block_compressed_bytes_fast */,
+                uint64_t /* block_compressed_bytes_slow */) override {
+    ++blocks_added;
+    ++all_calls;
+  }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    ++all_calls;
+    (*properties)["db_stress_collector_property"] =
+        std::to_string(keys_added) + ";" + std::to_string(blocks_added) + ";" +
+        std::to_string(all_calls);
     return Status::OK();
   }
 
   UserCollectedProperties GetReadableProperties() const override {
-    return UserCollectedProperties{};
+    UserCollectedProperties props;
+    const_cast<DbStressTablePropertiesCollector*>(this)->Finish(&props);
+    return props;
   }
 
   const char* Name() const override {
     return "DbStressTablePropertiesCollector";
   }
 
-  bool NeedCompact() const override { return need_compact_; }
+  bool NeedCompact() const override {
+    ++all_calls;
+    return need_compact_;
+  }
 
  private:
   const bool need_compact_;
+  // These are tracked to detect race conditions that would arise from RocksDB
+  // invoking TablePropertiesCollector functions in an unsynchronized way, as
+  // TablePropertiesCollectors are allowed (encouraged) not to be thread safe.
+  size_t keys_added = 0;
+  size_t blocks_added = 0;
+  // Including race between BlockAdd and AddUserKey (etc.)
+  mutable size_t all_calls = 0;
 };
 
 // A `DbStressTablePropertiesCollectorFactory` creates
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 8403ee3e9c4b..a57199e2d226 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -11,6 +11,7 @@
 #include <ios>
 #include <thread>
 
+#include "db_stress_tool/db_stress_compression_manager.h"
 #include "db_stress_tool/db_stress_listener.h"
 #include "rocksdb/io_status.h"
 #include "rocksdb/options.h"
@@ -24,16 +25,19 @@
 #include "db_stress_tool/db_stress_filters.h"
 #include "db_stress_tool/db_stress_table_properties_collector.h"
 #include "db_stress_tool/db_stress_wide_merge_operator.h"
+#include "file/file_util.h"
 #include "options/options_parser.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/secondary_cache.h"
 #include "rocksdb/sst_file_manager.h"
+#include "rocksdb/table_properties.h"
 #include "rocksdb/types.h"
 #include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/write_batch_with_index.h"
 #include "test_util/testutil.h"
 #include "util/cast_util.h"
+#include "util/simple_mixed_compressor.h"
 #include "utilities/backup/backup_engine_impl.h"
 #include "utilities/fault_injection_fs.h"
 #include "utilities/fault_injection_secondary_cache.h"
@@ -70,25 +74,9 @@ StressTest::StressTest()
       new_column_family_name_(1),
       num_times_reopened_(0),
       db_preload_finished_(false),
-      secondary_db_(nullptr),
       is_db_stopped_(false) {
   if (FLAGS_destroy_db_initially) {
-    std::vector<std::string> files;
-    db_stress_env->GetChildren(FLAGS_db, &files);
-    for (unsigned int i = 0; i < files.size(); i++) {
-      if (Slice(files[i]).starts_with("heap-")) {
-        db_stress_env->DeleteFile(FLAGS_db + "/" + files[i]);
-      }
-    }
-
-    Options options;
-    options.env = db_stress_env;
-    // Remove files without preserving manfiest files
-    const Status s = !FLAGS_use_blob_db
-                         ? DestroyDB(FLAGS_db, options)
-                         : blob_db::DestroyBlobDB(FLAGS_db, options,
-                                                  blob_db::BlobDBOptions());
-
+    const Status s = DbStressDestroyDb(FLAGS_db);
     if (!s.ok()) {
       fprintf(stderr, "Cannot destroy original db: %s\n", s.ToString().c_str());
       exit(1);
@@ -109,11 +97,10 @@ void StressTest::CleanUp() {
   if (db_) {
     db_->Close();
   }
-  delete db_;
+  db_owner_.reset();
   db_ = nullptr;
 
-  delete secondary_db_;
-  secondary_db_ = nullptr;
+  secondary_db_.reset();
 }
 
 void StressTest::CleanUpColumnFamilies() {
@@ -163,7 +150,6 @@ std::shared_ptr<Cache> StressTest::NewCache(size_t capacity,
     }
     CompressedSecondaryCacheOptions opts;
     opts.capacity = FLAGS_compressed_secondary_cache_size;
-    opts.compress_format_version = FLAGS_compress_format_version;
     if (FLAGS_enable_do_not_compress_roles) {
       opts.do_not_compress_roles = {CacheEntryRoleSet::All()};
     }
@@ -191,10 +177,10 @@ std::shared_ptr<Cache> StressTest::NewCache(size_t capacity,
     exit(1);
   } else if (EndsWith(cache_type, "hyper_clock_cache")) {
     size_t estimated_entry_charge;
-    if (cache_type == "fixed_hyper_clock_cache" ||
-        cache_type == "hyper_clock_cache") {
+    if (cache_type == "fixed_hyper_clock_cache") {
       estimated_entry_charge = FLAGS_block_size;
-    } else if (cache_type == "auto_hyper_clock_cache") {
+    } else if (cache_type == "auto_hyper_clock_cache" ||
+               cache_type == "hyper_clock_cache") {
       estimated_entry_charge = 0;
     } else {
       fprintf(stderr, "Cache type not supported.");
@@ -346,7 +332,6 @@ bool StressTest::BuildOptionsTable() {
            "1",
            "2",
        }},
-      {"max_sequential_skip_in_iterations", {"4", "8", "12"}},
       {"block_based_table_factory",
        {
            keepRibbonFilterPolicyOnly ? "{filter_policy=ribbonfilter:2.35}"
@@ -359,6 +344,13 @@ bool StressTest::BuildOptionsTable() {
                std::to_string(FLAGS_block_size + (FLAGS_seed & 0xFFFU)) + "}",
        }},
   };
+  if (FLAGS_use_multiscan == 0) {
+    // TODO: this can fail MultiScan when consecutive data blocks share the
+    // same user at boundary. MultiScan uses user key to locate the block to
+    // reach which can move the scan earlier than its current block.
+    options_tbl.emplace("max_sequential_skip_in_iterations",
+                        std::vector<std::string>{"4", "8", "12"});
+  }
   if (FLAGS_compaction_style == kCompactionStyleUniversal &&
       FLAGS_universal_max_read_amp > 0) {
     // level0_file_num_compaction_trigger needs to be at most max_read_amp
@@ -425,8 +417,16 @@ bool StressTest::BuildOptionsTable() {
     options_tbl.emplace(
         "file_temperature_age_thresholds",
         std::vector<std::string>{
+            "{{temperature=kWarm;age=10}:{temperature=kCool;age=30}:{"
+            "temperature=kCold;age=100}:{"
+            "temperature=kIce;age=300}}",
             "{{temperature=kWarm;age=30}:{temperature=kCold;age=300}}",
             "{{temperature=kCold;age=100}}", "{}"});
+    options_tbl.emplace(
+        "allow_trivial_copy_when_change_temperature",
+        std::vector<std::string>{
+            FLAGS_allow_trivial_copy_when_change_temperature ? "true"
+                                                             : "false"});
   }
 
   // NOTE: allow -1 to mean starting disabled but dynamically changing
@@ -646,12 +646,20 @@ std::string StressTest::DebugString(const Slice& value,
 }
 
 void StressTest::PrintStatistics() {
-  if (dbstats) {
-    fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
+  // Print statistics from the DB instance instead of global dbstats
+  if (db_) {
+    auto stats = db_->GetOptions().statistics;
+    if (stats) {
+      fprintf(stdout, "STATISTICS:\n%s\n", stats->ToString().c_str());
+    }
   }
-  if (dbstats_secondaries) {
-    fprintf(stdout, "Secondary instances STATISTICS:\n%s\n",
-            dbstats_secondaries->ToString().c_str());
+  // Print statistics from secondary DB instance if it exists
+  if (secondary_db_) {
+    auto stats = secondary_db_->GetOptions().statistics;
+    if (stats) {
+      fprintf(stdout, "Secondary instance STATISTICS:\n%s\n",
+              stats->ToString().c_str());
+    }
   }
 }
 
@@ -743,12 +751,11 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,
   }
   if (s.ok()) {
     CleanUpColumnFamilies();
-    delete db_;
+    db_owner_.reset();
     db_ = nullptr;
     txn_db_ = nullptr;
     optimistic_txn_db_ = nullptr;
-    delete secondary_db_;
-    secondary_db_ = nullptr;
+    secondary_db_.reset();
 
     db_preload_finished_.store(true);
     auto now = clock_->NowMicros();
@@ -786,6 +793,12 @@ Status StressTest::SetOptions(ThreadState* thread) {
   return db_->SetOptions(cfh, opts);
 }
 
+Options StressTest::GetOptions(int cf_id) {
+  auto cfh = column_families_[cf_id];
+  assert(cfh);
+  return db_->GetOptions(cfh);
+}
+
 void StressTest::ProcessRecoveredPreparedTxns(SharedState* shared) {
   assert(txn_db_);
   std::vector<Transaction*> recovered_prepared_trans;
@@ -838,11 +851,21 @@ Status StressTest::NewTxn(WriteOptions& write_opts, ThreadState* thread,
         FLAGS_use_only_the_last_commit_time_batch_for_recovery;
     txn_options.lock_timeout = 600000;  // 10 min
     txn_options.deadlock_detect = true;
-    if (FLAGS_commit_bypass_memtable_one_in > 0) {
+    if (FLAGS_commit_bypass_memtable_one_in > 0 &&
+        thread->rand.OneIn(FLAGS_commit_bypass_memtable_one_in)) {
       assert(FLAGS_txn_write_policy == 0);
       assert(FLAGS_user_timestamp_size == 0);
-      txn_options.commit_bypass_memtable =
-          thread->rand.OneIn(FLAGS_commit_bypass_memtable_one_in);
+      if (thread->rand.OneIn(2)) {
+        txn_options.commit_bypass_memtable = true;
+      }
+      if (thread->rand.OneIn(2)) {
+        txn_options.large_txn_commit_optimize_threshold = 1;
+      }
+      if (thread->rand.OneIn(2) ||
+          (!txn_options.commit_bypass_memtable &&
+           txn_options.large_txn_commit_optimize_threshold != 1)) {
+        txn_options.large_txn_commit_optimize_byte_threshold = 1;
+      }
       if (commit_bypass_memtable) {
         *commit_bypass_memtable = txn_options.commit_bypass_memtable;
       }
@@ -859,6 +882,10 @@ Status StressTest::CommitTxn(Transaction& txn, ThreadState* thread) {
     return Status::InvalidArgument("CommitTxn when FLAGS_use_txn is not set");
   }
   Status s = Status::OK();
+  // We don't issue write to transaction's underlying WriteBatch in stress test
+  assert(txn.GetWriteBatch()->GetWriteBatch()->Count());
+  assert(txn.GetWriteBatch()->GetWBWIOpCount() ==
+         txn.GetWriteBatch()->GetWriteBatch()->Count());
   if (FLAGS_use_optimistic_txn) {
     assert(optimistic_txn_db_);
     s = txn.Commit();
@@ -1240,6 +1267,11 @@ void StressTest::OperateDb(ThreadState* thread) {
         ProcessStatus(shared, "TestDisableManualCompaction", status);
       }
 
+      if (thread->rand.OneInOpt(FLAGS_abort_and_resume_compactions_one_in)) {
+        Status status = TestAbortAndResumeCompactions(thread);
+        ProcessStatus(shared, "TestAbortAndResumeCompactions", status);
+      }
+
       if (thread->rand.OneInOpt(FLAGS_verify_checksum_one_in)) {
         ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking);
         ThreadStatusUtil::SetThreadOperation(
@@ -1437,9 +1469,23 @@ void StressTest::OperateDb(ThreadState* thread) {
       } else if (prob_op < iterate_bound) {
         assert(delrange_bound <= prob_op);
         // OPERATION iterate
-        if (!FLAGS_skip_verifydb &&
-            thread->rand.OneInOpt(
-                FLAGS_verify_iterator_with_expected_state_one_in)) {
+        if (FLAGS_use_multiscan) {
+          int num_seeks = static_cast<int>(
+              std::min(static_cast<uint64_t>(thread->rand.Uniform(64)),
+                       static_cast<uint64_t>(FLAGS_ops_per_thread - i - 1)));
+          // Generate 2x num_seeks random keys, as each scan has a start key
+          // and an upper bound
+          rand_keys = GenerateNKeys(thread, num_seeks * 2, i);
+          i += num_seeks - 1;
+          ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking);
+          ThreadStatusUtil::SetThreadOperation(
+              ThreadStatus::OperationType::OP_DBITERATOR);
+          Status s;
+          s = TestMultiScan(thread, read_opts, rand_column_families, rand_keys);
+          ThreadStatusUtil::ResetThreadStatus();
+        } else if (!FLAGS_skip_verifydb &&
+                   thread->rand.OneInOpt(
+                       FLAGS_verify_iterator_with_expected_state_one_in)) {
           ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking);
           ThreadStatusUtil::SetThreadOperation(
               ThreadStatus::OperationType::OP_DBITERATOR);
@@ -1617,6 +1663,184 @@ Status StressTest::TestIterateAttributeGroups(
       verify_func);
 }
 
+Status StressTest::TestMultiScan(ThreadState* thread,
+                                 const ReadOptions& read_opts,
+                                 const std::vector<int>& rand_column_families,
+                                 const std::vector<int64_t>& rand_keys) {
+  size_t num_scans = rand_keys.size() / 2;
+  assert(!rand_column_families.empty());
+  assert(!rand_keys.empty());
+
+  ThreadStatus::OperationType cur_op_type =
+      ThreadStatusUtil::GetThreadOperation();
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN);
+  ManagedSnapshot snapshot_guard(db_);
+  ThreadStatusUtil::SetThreadOperation(cur_op_type);
+
+  ReadOptions ro = read_opts;
+  ro.snapshot = snapshot_guard.snapshot();
+
+  std::string read_ts_str;
+  Slice read_ts_slice;
+  MaybeUseOlderTimestampForRangeScan(thread, read_ts_str, read_ts_slice, ro);
+
+  std::vector<std::string> start_key_strs;
+  std::vector<std::string> end_key_strs;
+  // TODO support reverse BytewiseComparator in the stress test
+  MultiScanArgs scan_opts(options_.comparator);
+  scan_opts.use_async_io =
+      FLAGS_multiscan_use_async_io &&
+      CheckFSFeatureSupport(options_.env->GetFileSystem().get(),
+                            FSSupportedOps::kAsyncIO);
+  start_key_strs.reserve(num_scans);
+  end_key_strs.reserve(num_scans);
+
+  // Will be initialized before Seek() below.
+  Slice ub;
+  ro.iterate_upper_bound = &ub;
+  for (size_t i = 0; i < num_scans * 2; i += 2) {
+    assert(rand_keys[i] <= rand_keys[i + 1]);
+    start_key_strs.emplace_back(Key(rand_keys[i]));
+    end_key_strs.emplace_back(Key(rand_keys[i + 1]));
+    scan_opts.insert(Slice(start_key_strs.back()), Slice(end_key_strs.back()));
+  }
+
+  std::string op_logs;
+  ro.pin_data = thread->rand.OneIn(2);
+  ro.background_purge_on_iterator_cleanup = thread->rand.OneIn(2);
+
+  assert(options_.prefix_extractor.get() == nullptr);
+
+  std::unique_ptr<Iterator> iter;
+  iter.reset(db_->NewIterator(ro, column_families_[rand_column_families[0]]));
+  iter->Prepare(scan_opts);
+
+  constexpr size_t kOpLogsLimit = 50000;
+
+  auto verify_func = [](Iterator* iterator) {
+    if (!VerifyWideColumns(iterator->value(), iterator->columns())) {
+      fprintf(stderr,
+              "Value and columns inconsistent for iterator: value: %s, "
+              "columns: %s\n",
+              iterator->value().ToString(/* hex */ true).c_str(),
+              WideColumnsToHex(iterator->columns()).c_str());
+      return false;
+    }
+    return true;
+  };
+
+  for (const ScanOptions& scan_opt : scan_opts.GetScanRanges()) {
+    if (op_logs.size() > kOpLogsLimit) {
+      // Shouldn't take too much memory for the history log. Clear it.
+      op_logs = "(cleared...)\n";
+    }
+
+    // Set up an iterator, perform the same operations without bounds and with
+    // total order seek, and compare the results. This is to identify bugs
+    // related to bounds, prefix extractor, or reseeking. Sometimes we are
+    // comparing iterators with the same set-up, and it doesn't hurt to check
+    // them to be equal.
+    //
+    // This `ReadOptions` is for validation purposes. Ignore
+    // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+    ReadOptions cmp_ro;
+    cmp_ro.timestamp = ro.timestamp;
+    cmp_ro.iter_start_ts = ro.iter_start_ts;
+    cmp_ro.snapshot = snapshot_guard.snapshot();
+    cmp_ro.auto_refresh_iterator_with_snapshot =
+        ro.auto_refresh_iterator_with_snapshot;
+    cmp_ro.total_order_seek = true;
+
+    ColumnFamilyHandle* const cmp_cfh =
+        GetControlCfh(thread, rand_column_families[0]);
+    assert(cmp_cfh);
+
+    std::unique_ptr<Iterator> cmp_iter(db_->NewIterator(cmp_ro, cmp_cfh));
+
+    bool diverged = false;
+
+    assert(scan_opt.range.start);
+    assert(scan_opt.range.limit);
+    Slice key = scan_opt.range.start.value();
+    ub = scan_opt.range.limit.value();
+
+    LastIterateOp last_op;
+    iter->Seek(key);
+    cmp_iter->Seek(key);
+    last_op = kLastOpSeek;
+    op_logs += "S " + key.ToString(true) + " ";
+
+    if (iter->Valid() && ro.allow_unprepared_value) {
+      op_logs += "*";
+
+      if (!iter->PrepareValue()) {
+        assert(!iter->Valid());
+        assert(!iter->status().ok());
+      }
+    }
+
+    if (!iter->status().ok() && IsErrorInjectedAndRetryable(iter->status())) {
+      return iter->status();
+    } else if (!cmp_iter->status().ok() &&
+               IsErrorInjectedAndRetryable(cmp_iter->status())) {
+      return cmp_iter->status();
+    }
+
+    VerifyIterator(thread, cmp_cfh, ro, iter.get(), cmp_iter.get(), last_op,
+                   key, op_logs, verify_func, &diverged);
+
+    while (iter->Valid()) {
+      iter->Next();
+      if (!diverged) {
+        assert(cmp_iter->Valid());
+        cmp_iter->Next();
+      }
+      op_logs += "N";
+
+      if (iter->Valid() && ro.allow_unprepared_value) {
+        op_logs += "*";
+
+        if (!iter->PrepareValue()) {
+          assert(!iter->Valid());
+          assert(!iter->status().ok());
+        }
+      }
+
+      if (!iter->status().ok() && IsErrorInjectedAndRetryable(iter->status())) {
+        return iter->status();
+      } else if (!cmp_iter->status().ok() &&
+                 IsErrorInjectedAndRetryable(cmp_iter->status())) {
+        return cmp_iter->status();
+      }
+
+      VerifyIterator(thread, cmp_cfh, ro, iter.get(), cmp_iter.get(), last_op,
+                     key, op_logs, verify_func, &diverged);
+
+      if (diverged) {
+        if (thread->shared->HasVerificationFailedYet()) {
+          const std::vector<ScanOptions>& scanoptions =
+              scan_opts.GetScanRanges();
+          for (const auto& t : scanoptions) {
+            fprintf(stdout, "Multiscan options: %s to %s \n",
+                    t.range.start.value().ToString(true).c_str(),
+                    t.range.limit.value().ToString(true).c_str());
+          }
+        }
+        break;
+      }
+    }
+
+    thread->stats.AddIterations(1);
+
+    op_logs += "; ";
+    if (diverged) {
+      break;
+    }
+  }
+
+  return Status::OK();
+}
+
 template <typename IterType, typename NewIterFunc, typename VerifyFunc>
 Status StressTest::TestIterateImpl(ThreadState* thread,
                                    const ReadOptions& read_opts,
@@ -2279,7 +2503,7 @@ Status StressTest::TestBackupRestore(
       from = "BackupEngine::PurgeOldBackups";
     }
   }
-  DB* restored_db = nullptr;
+  std::unique_ptr<DB> restored_db;
   std::vector<ColumnFamilyHandle*> restored_cf_handles;
 
   // Not yet implemented: opening restored BlobDB or TransactionDB
@@ -2367,8 +2591,7 @@ Status StressTest::TestBackupRestore(
     for (auto* cf_handle : restored_cf_handles) {
       restored_db->DestroyColumnFamilyHandle(cf_handle);
     }
-    delete restored_db;
-    restored_db = nullptr;
+    restored_db.reset();
   }
   if (s.ok() && inplace_not_restore) {
     // Purge late if inplace open read-only
@@ -2603,7 +2826,7 @@ Status StressTest::TestCheckpoint(ThreadState* thread,
   delete checkpoint;
   checkpoint = nullptr;
   std::vector<ColumnFamilyHandle*> cf_handles;
-  DB* checkpoint_db = nullptr;
+  std::unique_ptr<DB> checkpoint_db;
   if (s.ok()) {
     Options options(options_);
     options.best_efforts_recovery = false;
@@ -2667,8 +2890,7 @@ Status StressTest::TestCheckpoint(ThreadState* thread,
       delete cfh;
     }
     cf_handles.clear();
-    delete checkpoint_db;
-    checkpoint_db = nullptr;
+    checkpoint_db.reset();
   }
 
   //  Temporarily disable error injection for clean-up
@@ -2824,8 +3046,9 @@ void StressTest::TestCompactFiles(ThreadState* thread,
         // TOOD (hx235): allow an exact list of tolerable failures under stress
         // test
         bool non_ok_status_allowed =
-            s.IsManualCompactionPaused() || IsErrorInjectedAndRetryable(s) ||
-            s.IsAborted() || s.IsInvalidArgument() || s.IsNotSupported();
+            s.IsManualCompactionPaused() || s.IsCompactionAborted() ||
+            IsErrorInjectedAndRetryable(s) || s.IsAborted() ||
+            s.IsInvalidArgument() || s.IsNotSupported();
         if (!non_ok_status_allowed) {
           fprintf(stderr,
                   "Unable to perform CompactFiles(): %s under specified "
@@ -2918,6 +3141,20 @@ Status StressTest::TestDisableManualCompaction(ThreadState* thread) {
   return Status::OK();
 }
 
+Status StressTest::TestAbortAndResumeCompactions(ThreadState* thread) {
+  // Abort all running compactions and prevent new ones from starting
+  db_->AbortAllCompactions();
+  // Sleep to allow other threads to attempt operations while aborted
+  // Uses same sleep pattern as TestPauseBackground and
+  // TestDisableManualCompaction
+  int pwr2_micros =
+      std::min(thread->rand.Uniform(25), thread->rand.Uniform(25));
+  clock_->SleepForMicroseconds(1 << pwr2_micros);
+  // Resume compactions
+  db_->ResumeAllCompactions();
+  return Status::OK();
+}
+
 void StressTest::TestAcquireSnapshot(ThreadState* thread,
                                      int rand_column_family,
                                      const std::string& keystr, uint64_t i) {
@@ -3093,7 +3330,7 @@ void StressTest::TestCompactRange(ThreadState* thread, int64_t rand_key,
   if (!status.ok()) {
     // TOOD (hx235): allow an exact list of tolerable failures under stress test
     bool non_ok_status_allowed =
-        status.IsManualCompactionPaused() ||
+        status.IsManualCompactionPaused() || status.IsCompactionAborted() ||
         IsErrorInjectedAndRetryable(status) || status.IsAborted() ||
         status.IsInvalidArgument() || status.IsNotSupported();
     if (!non_ok_status_allowed) {
@@ -3296,8 +3533,9 @@ void StressTest::PrintEnv() const {
   fprintf(stdout, "Verification only         : %s\n",
           FLAGS_verification_only ? "true" : "false");
 
-  const char* memtablerep = "";
+  const char* memtablerep;
   switch (FLAGS_rep_factory) {
+    default:
     case kSkipList:
       memtablerep = "skip_list";
       break;
@@ -3376,8 +3614,6 @@ void StressTest::PrintEnv() const {
           FLAGS_sync_fault_injection);
   fprintf(stdout, "Best efforts recovery     : %d\n",
           static_cast<int>(FLAGS_best_efforts_recovery));
-  fprintf(stdout, "Fail if OPTIONS file error: %d\n",
-          static_cast<int>(FLAGS_fail_if_options_file_error));
   fprintf(stdout, "User timestamp size bytes : %d\n",
           static_cast<int>(FLAGS_user_timestamp_size));
   fprintf(stdout, "Persist user defined timestamps : %d\n",
@@ -3398,7 +3634,28 @@ void StressTest::Open(SharedState* shared, bool reopen) {
     InitializeOptionsFromFlags(cache_, filter_policy_, options_);
   }
   InitializeOptionsGeneral(cache_, filter_policy_, sqfc_factory_, options_);
-
+  DbStressCustomCompressionManager::Register();
+
+  if (!strcasecmp(FLAGS_compression_manager.c_str(), "custom")) {
+    options_.compression_manager =
+        std::make_shared<DbStressCustomCompressionManager>();
+  } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
+    options_.compression_manager =
+        std::make_shared<RoundRobinManager>(GetBuiltinV2CompressionManager());
+  } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "randommixed")) {
+    options_.compression_manager =
+        std::make_shared<RandomMixedCompressionManager>(
+            GetBuiltinV2CompressionManager());
+  } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) {
+    options_.compression_manager =
+        CreateAutoSkipCompressionManager(GetBuiltinV2CompressionManager());
+  } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
+    // Nothing to do using default compression manager
+  } else {
+    fprintf(stderr, "Unknown compression manager: %s\n",
+            FLAGS_compression_manager.c_str());
+    exit(1);
+  }
   if (FLAGS_prefix_size == 0 && FLAGS_rep_factory == kHashSkipList) {
     fprintf(stderr,
             "prefeix_size cannot be zero if memtablerep == prefix_hash\n");
@@ -3410,6 +3667,40 @@ void StressTest::Open(SharedState* shared, bool reopen) {
             "memtablerep != prefix_hash\n");
   }
 
+  // Remote Compaction
+  if (FLAGS_remote_compaction_worker_threads > 0) {
+    // TODO(jaykorean) Remove this after fix - remote worker shouldn't recover
+    // from WAL
+    if (!FLAGS_disable_wal) {
+      fprintf(stderr,
+              "WAL is not compatible with Remote Compaction in Stress Test\n");
+      exit(1);
+    }
+    if ((options_.enable_blob_files ||
+         options_.enable_blob_garbage_collection ||
+         FLAGS_allow_setting_blob_options_dynamically)) {
+      fprintf(stderr,
+              "Integrated BlobDB is currently incompatible with Remote "
+              "Compaction\n");
+      exit(1);
+    }
+    // Each DB open/reopen gets a fresh compaction service instance with a clean
+    // aborted_ state
+    auto compaction_service = std::make_shared<DbStressCompactionService>(
+        shared, FLAGS_remote_compaction_failure_fall_back_to_local);
+
+    options_.compaction_service = compaction_service;
+  }
+
+  if (FLAGS_allow_resumption_one_in > 0) {
+    if (FLAGS_remote_compaction_worker_threads == 0) {
+      fprintf(stderr,
+              "allow_resumption or randomize_allow_resumption requires "
+              "remote_compaction_worker_threads > 0\n");
+      exit(1);
+    }
+  }
+
   if ((options_.enable_blob_files || options_.enable_blob_garbage_collection ||
        FLAGS_allow_setting_blob_options_dynamically) &&
       FLAGS_best_efforts_recovery) {
@@ -3567,26 +3858,28 @@ void StressTest::Open(SharedState* shared, bool reopen) {
         // StackableDB-based BlobDB
         if (FLAGS_use_blob_db) {
           blob_db::BlobDBOptions blob_db_options;
-          blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
-          blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
           blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
           blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
-          blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
 
           blob_db::BlobDB* blob_db = nullptr;
           s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db,
                                     cf_descriptors, &column_families_,
                                     &blob_db);
           if (s.ok()) {
+            db_owner_.reset(blob_db);
             db_ = blob_db;
           }
         } else {
           if (db_preload_finished_.load() && FLAGS_read_only) {
             s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db,
-                                    cf_descriptors, &column_families_, &db_);
+                                    cf_descriptors, &column_families_,
+                                    &db_owner_);
           } else {
             s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors,
-                         &column_families_, &db_);
+                         &column_families_, &db_owner_);
+          }
+          if (s.ok()) {
+            db_ = db_owner_.get();
           }
         }
 
@@ -3602,10 +3895,9 @@ void StressTest::Open(SharedState* shared, bool reopen) {
             s = db_->GetRootDB()->WaitForCompact(WaitForCompactOptions());
             if (!s.ok()) {
               CleanUpColumnFamilies();
-              delete db_;
+              db_owner_.reset();
               db_ = nullptr;
-              delete secondary_db_;
-              secondary_db_ = nullptr;
+              secondary_db_.reset();
             }
           }
           if (!s.ok()) {
@@ -3662,6 +3954,7 @@ void StressTest::Open(SharedState* shared, bool reopen) {
         }
         assert(s.ok());
         {
+          db_owner_.reset(optimistic_txn_db_);
           db_ = optimistic_txn_db_;
           db_aptr_.store(optimistic_txn_db_, std::memory_order_release);
         }
@@ -3683,6 +3976,8 @@ void StressTest::Open(SharedState* shared, bool reopen) {
             static_cast<size_t>(FLAGS_wp_snapshot_cache_bits);
         txn_db_options.wp_commit_cache_bits =
             static_cast<size_t>(FLAGS_wp_commit_cache_bits);
+        txn_db_options.use_per_key_point_lock_mgr =
+            FLAGS_use_per_key_point_lock_mgr;
         PrepareTxnDbOptions(shared, txn_db_options);
         s = TransactionDB::Open(options_, txn_db_options, FLAGS_db,
                                 cf_descriptors, &column_families_, &txn_db_);
@@ -3695,6 +3990,7 @@ void StressTest::Open(SharedState* shared, bool reopen) {
 
         // Do not swap the order of the following.
         {
+          db_owner_.reset(txn_db_);
           db_ = txn_db_;
           db_aptr_.store(txn_db_, std::memory_order_release);
         }
@@ -3707,6 +4003,13 @@ void StressTest::Open(SharedState* shared, bool reopen) {
     assert(s.ok());
     assert(column_families_.size() ==
            static_cast<size_t>(FLAGS_column_families));
+    // Clear statistics reference from options_ to intentionally shorten the
+    // statistics object lifetime to be same as the db object (which is the
+    // common case in practice) and detect if RocksDB access the statistics
+    // beyond its lifetime.
+    if (FLAGS_statistics) {
+      options_.statistics.reset();
+    }
 
     // Secondary instance does not support write-prepared/write-unprepared
     // transactions, thus just disable secondary instance if we use
@@ -3726,6 +4029,7 @@ void StressTest::Open(SharedState* shared, bool reopen) {
   } else {
     DBWithTTL* db_with_ttl;
     s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl);
+    db_owner_.reset(db_with_ttl);
     db_ = db_with_ttl;
   }
 
@@ -3805,12 +4109,11 @@ void StressTest::Reopen(ThreadState* thread) {
   }
   assert((txn_db_ == nullptr && optimistic_txn_db_ == nullptr) ||
          (db_ == txn_db_ || db_ == optimistic_txn_db_));
-  delete db_;
+  db_owner_.reset();
   db_ = nullptr;
   txn_db_ = nullptr;
   optimistic_txn_db_ = nullptr;
-  delete secondary_db_;
-  secondary_db_ = nullptr;
+  secondary_db_.reset();
 
   num_times_reopened_++;
   auto now = clock_->NowMicros();
@@ -4023,6 +4326,9 @@ void InitializeOptionsFromFlags(
   block_based_options.data_block_index_type =
       static_cast<BlockBasedTableOptions::DataBlockIndexType>(
           FLAGS_data_block_index_type);
+  block_based_options.index_block_search_type =
+      static_cast<BlockBasedTableOptions::BlockSearchType>(
+          FLAGS_index_block_search_type);
   block_based_options.prepopulate_block_cache =
       static_cast<BlockBasedTableOptions::PrepopulateBlockCache>(
           FLAGS_prepopulate_block_cache);
@@ -4041,14 +4347,16 @@ void InitializeOptionsFromFlags(
       static_cast<BlockBasedTableOptions::IndexShorteningMode>(
           FLAGS_index_shortening);
   block_based_options.block_align = FLAGS_block_align;
+  block_based_options.super_block_alignment_size =
+      fLU64::FLAGS_super_block_alignment_size;
+  block_based_options.super_block_alignment_space_overhead_ratio =
+      fLU64::FLAGS_super_block_alignment_space_overhead_ratio;
   options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
   options.db_write_buffer_size = FLAGS_db_write_buffer_size;
   options.write_buffer_size = FLAGS_write_buffer_size;
   options.max_write_buffer_number = FLAGS_max_write_buffer_number;
   options.min_write_buffer_number_to_merge =
       FLAGS_min_write_buffer_number_to_merge;
-  options.max_write_buffer_number_to_maintain =
-      FLAGS_max_write_buffer_number_to_maintain;
   options.max_write_buffer_size_to_maintain =
       FLAGS_max_write_buffer_size_to_maintain;
   options.memtable_prefix_bloom_size_ratio =
@@ -4071,6 +4379,17 @@ void InitializeOptionsFromFlags(
       ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleFIFO) {
     options.compaction_options_fifo.allow_compaction =
         FLAGS_fifo_allow_compaction;
+    if (FLAGS_fifo_compaction_max_data_files_size_mb > 0) {
+      options.compaction_options_fifo.max_data_files_size =
+          FLAGS_fifo_compaction_max_data_files_size_mb * 1024 * 1024;
+      // max_table_files_size is ignored when max_data_files_size is non-zero,
+      // but validation requires max_data_files_size >= max_table_files_size.
+      options.compaction_options_fifo.max_table_files_size =
+          std::min(options.compaction_options_fifo.max_table_files_size,
+                   options.compaction_options_fifo.max_data_files_size);
+    }
+    options.compaction_options_fifo.use_kv_ratio_compaction =
+        FLAGS_fifo_compaction_use_kv_ratio_compaction;
   }
   options.compaction_pri =
       static_cast<ROCKSDB_NAMESPACE::CompactionPri>(FLAGS_compaction_pri);
@@ -4083,7 +4402,9 @@ void InitializeOptionsFromFlags(
     }
   }
   options.max_open_files = FLAGS_open_files;
-  options.statistics = dbstats;
+  if (FLAGS_statistics) {
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  }
   options.env = db_stress_env;
   options.use_fsync = FLAGS_use_fsync;
   options.compaction_readahead_size = FLAGS_compaction_readahead_size;
@@ -4125,6 +4446,7 @@ void InitializeOptionsFromFlags(
     options.compression_opts.checksum = true;
   }
   options.max_manifest_file_size = FLAGS_max_manifest_file_size;
+  options.max_manifest_space_amp_pct = FLAGS_max_manifest_space_amp_pct;
   options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
   options.allow_concurrent_memtable_write =
       FLAGS_allow_concurrent_memtable_write;
@@ -4165,6 +4487,8 @@ void InitializeOptionsFromFlags(
       FLAGS_memtable_protection_bytes_per_key;
   options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
   options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
+  options.memtable_veirfy_per_key_checksum_on_seek =
+      FLAGS_memtable_veirfy_per_key_checksum_on_seek;
 
   // Integrated BlobDB
   options.enable_blob_files = FLAGS_enable_blob_files;
@@ -4220,10 +4544,14 @@ void InitializeOptionsFromFlags(
       StringToTemperature(FLAGS_default_temperature.c_str());
 
   if (!FLAGS_file_temperature_age_thresholds.empty()) {
+    const std::string allowTrivialCopyBoolStr =
+        FLAGS_allow_trivial_copy_when_change_temperature ? "true" : "false";
     Status s = GetColumnFamilyOptionsFromString(
         {}, options,
         "compaction_options_fifo={file_temperature_age_thresholds=" +
-            FLAGS_file_temperature_age_thresholds + "}",
+            FLAGS_file_temperature_age_thresholds +
+            ";allow_trivial_copy_when_change_temperature=" +
+            allowTrivialCopyBoolStr + "}",
         &options);
     if (!s.ok()) {
       fprintf(stderr, "While setting file_temperature_age_thresholds: %s\n",
@@ -4257,7 +4585,6 @@ void InitializeOptionsFromFlags(
 
   options.best_efforts_recovery = FLAGS_best_efforts_recovery;
   options.paranoid_file_checks = FLAGS_paranoid_file_checks;
-  options.fail_if_options_file_error = FLAGS_fail_if_options_file_error;
 
   if (FLAGS_user_timestamp_size > 0) {
     CheckAndSetOptionsForUserTimestamp(options);
@@ -4317,10 +4644,9 @@ void InitializeOptionsFromFlags(
   options.inplace_update_support = FLAGS_inplace_update_support;
   options.uncache_aggressiveness = FLAGS_uncache_aggressiveness;
 
-  // Remote Compaction
-  if (FLAGS_enable_remote_compaction) {
-    options.compaction_service = std::make_shared<DbStressCompactionService>();
-  }
+  options.memtable_op_scan_flush_trigger = FLAGS_memtable_op_scan_flush_trigger;
+  options.compaction_options_universal.reduce_file_locking =
+      FLAGS_universal_reduce_file_locking;
 }
 
 void InitializeOptionsGeneral(
@@ -4331,8 +4657,8 @@ void InitializeOptionsGeneral(
   options.create_missing_column_families = true;
   options.create_if_missing = true;
 
-  if (!options.statistics) {
-    options.statistics = dbstats;
+  if (FLAGS_statistics) {
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
   }
 
   if (options.env == Options().env) {
@@ -4405,6 +4731,16 @@ void InitializeOptionsGeneral(
   if (sqfc_factory && !sqfc_factory->GetConfigs().IsEmptyNotFound()) {
     options.table_properties_collector_factories.emplace_back(sqfc_factory);
   }
+
+  // Add CompactOnDeletionCollectorFactory if enabled
+  if (FLAGS_enable_compaction_on_deletion_trigger) {
+    options.table_properties_collector_factories.emplace_back(
+        ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory(
+            FLAGS_compaction_on_deletion_window_size,
+            FLAGS_compaction_on_deletion_trigger_count,
+            FLAGS_compaction_on_deletion_ratio,
+            FLAGS_compaction_on_deletion_min_file_size));
+  }
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h
index bba5c1665f13..d97aadf9e60e 100644
--- a/db_stress_tool/db_stress_test_base.h
+++ b/db_stress_tool/db_stress_test_base.h
@@ -14,6 +14,7 @@
 #include "db_stress_tool/db_stress_common.h"
 #include "db_stress_tool/db_stress_shared_state.h"
 #include "rocksdb/experimental.h"
+#include "utilities/fault_injection_fs.h"
 
 namespace ROCKSDB_NAMESPACE {
 class SystemClock;
@@ -25,6 +26,13 @@ using experimental::SstQueryFilterConfigsManager;
 
 class StressTest {
  public:
+  static bool IsErrorInjectedAndRetryable(const Status& error_s) {
+    assert(!error_s.ok());
+    return error_s.getState() &&
+           FaultInjectionTestFS::IsInjectedError(error_s) &&
+           !status_to_io_status(Status(error_s)).GetDataLoss();
+  }
+
   StressTest();
 
   virtual ~StressTest() {}
@@ -53,6 +61,7 @@ class StressTest {
     Status s = db_->EnableAutoCompaction(column_families_);
     return s;
   }
+  Options GetOptions(int cf_id);
   void CleanUp();
 
  protected:
@@ -274,6 +283,10 @@ class StressTest {
     return Status::NotSupported();
   }
 
+  Status TestMultiScan(ThreadState* thread, const ReadOptions& read_opts,
+                       const std::vector<int>& rand_column_families,
+                       const std::vector<int64_t>& rand_keys);
+
   // Enum used by VerifyIterator() to identify the mode to validate.
   enum LastIterateOp {
     kLastOpSeek,
@@ -319,6 +332,8 @@ class StressTest {
 
   Status TestDisableManualCompaction(ThreadState* thread);
 
+  Status TestAbortAndResumeCompactions(ThreadState* thread);
+
   void TestAcquireSnapshot(ThreadState* thread, int rand_column_family,
                            const std::string& keystr, uint64_t i);
 
@@ -345,13 +360,6 @@ class StressTest {
     return Status::NotSupported("TestCustomOperations() must be overridden");
   }
 
-  bool IsErrorInjectedAndRetryable(const Status& error_s) const {
-    assert(!error_s.ok());
-    return error_s.getState() &&
-           FaultInjectionTestFS::IsInjectedError(error_s) &&
-           !status_to_io_status(Status(error_s)).GetDataLoss();
-  }
-
   void ProcessStatus(SharedState* shared, std::string msg, const Status& s,
                      bool ignore_injected_error = true) const;
 
@@ -396,6 +404,7 @@ class StressTest {
   std::shared_ptr<Cache> cache_;
   std::shared_ptr<Cache> compressed_cache_;
   std::shared_ptr<const FilterPolicy> filter_policy_;
+  std::unique_ptr<DB> db_owner_;
   DB* db_;
   TransactionDB* txn_db_;
   OptimisticTransactionDB* optimistic_txn_db_;
@@ -414,7 +423,7 @@ class StressTest {
   std::atomic<bool> db_preload_finished_;
   std::shared_ptr<SstQueryFilterConfigsManager::Factory> sqfc_factory_;
 
-  DB* secondary_db_;
+  std::unique_ptr<DB> secondary_db_;
   std::vector<ColumnFamilyHandle*> secondary_cfhs_;
   bool is_db_stopped_;
 };
diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc
index ca43b699c8f9..15b52b827b14 100644
--- a/db_stress_tool/db_stress_tool.cc
+++ b/db_stress_tool/db_stress_tool.cc
@@ -53,12 +53,6 @@ int db_stress_tool(int argc, char** argv) {
     SetupSyncPointsToMockDirectIO();
   }
 #endif
-  if (FLAGS_statistics) {
-    dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
-    if (FLAGS_test_secondary) {
-      dbstats_secondaries = ROCKSDB_NAMESPACE::CreateDBStatistics();
-    }
-  }
   compression_type_e = StringToCompressionType(FLAGS_compression_type.c_str());
   bottommost_compression_type_e =
       StringToCompressionType(FLAGS_bottommost_compression_type.c_str());
@@ -100,10 +94,39 @@ int db_stress_tool(int argc, char** argv) {
     raw_env = fault_env_guard.get();
   }
 
-  env_wrapper_guard = std::make_shared<CompositeEnvWrapper>(
-      raw_env, std::make_shared<DbStressFSWrapper>(raw_env->GetFileSystem()));
+  auto db_stress_fs =
+      std::make_shared<DbStressFSWrapper>(raw_env->GetFileSystem());
+  env_wrapper_guard =
+      std::make_shared<CompositeEnvWrapper>(raw_env, db_stress_fs);
   db_stress_env = env_wrapper_guard.get();
 
+  // Handle --destroy_db_and_exit early, before other option validation
+  if (FLAGS_destroy_db_and_exit) {
+    s = DbStressDestroyDb(FLAGS_db);
+    if (s.ok()) {
+      fprintf(stdout, "Successfully destroyed db at %s\n", FLAGS_db.c_str());
+      return 0;
+    } else {
+      fprintf(stderr, "Failed to destroy db at %s: %s\n", FLAGS_db.c_str(),
+              s.ToString().c_str());
+      return 1;
+    }
+  }
+
+  // Handle --delete_dir_and_exit early, before other option validation
+  if (!FLAGS_delete_dir_and_exit.empty()) {
+    s = DestroyDir(raw_env, FLAGS_delete_dir_and_exit);
+    if (s.ok()) {
+      fprintf(stdout, "Successfully deleted directory %s\n",
+              FLAGS_delete_dir_and_exit.c_str());
+      return 0;
+    } else {
+      fprintf(stderr, "Failed to delete directory %s: %s\n",
+              FLAGS_delete_dir_and_exit.c_str(), s.ToString().c_str());
+      return 1;
+    }
+  }
+
   FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
 
   // The number of background threads should be at least as much the
diff --git a/db_stress_tool/expected_value.h b/db_stress_tool/expected_value.h
index 428c389cb66e..7aed38240f09 100644
--- a/db_stress_tool/expected_value.h
+++ b/db_stress_tool/expected_value.h
@@ -253,20 +253,20 @@ class PendingExpectedValue {
 class ExpectedValueHelper {
  public:
   // Return whether the key associated with `pre_read_expected_value` and
-  // `post_read_expected_value` is expected not to exist from begining till the
+  // `post_read_expected_value` is expected not to exist from beginning till the
   // end of the read
   //
   // The negation of `MustHaveNotExisted()` is "may have not existed".
-  // To assert some key must have existsed, please use `MustHaveExisted()`
+  // To assert some key must have existed, please use `MustHaveExisted()`
   static bool MustHaveNotExisted(ExpectedValue pre_read_expected_value,
                                  ExpectedValue post_read_expected_value);
 
   // Return whether the key associated with `pre_read_expected_value` and
-  // `post_read_expected_value` is expected to exist from begining till the end
+  // `post_read_expected_value` is expected to exist from beginning till the end
   // of the read.
   //
   // The negation of `MustHaveExisted()` is "may have existed".
-  // To assert some key must have not existsed, please use
+  // To assert some key must have not existed, please use
   // `MustHaveNotExisted()`
   static bool MustHaveExisted(ExpectedValue pre_read_expected_value,
                               ExpectedValue post_read_expected_value);
diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc
index 616035a1b4fe..c9d3250a119a 100644
--- a/db_stress_tool/no_batched_ops_stress.cc
+++ b/db_stress_tool/no_batched_ops_stress.cc
@@ -233,6 +233,14 @@ class NonBatchedOpsStressTest : public StressTest {
           }
 
           Status s = secondary_db_->TryCatchUpWithPrimary();
+#ifndef NDEBUG
+          uint64_t manifest_num =
+              static_cast_with_check<DBImpl>(secondary_db_.get())
+                  ->TEST_Current_Manifest_FileNo();
+#else
+          uint64_t manifest_num = 0;
+#endif
+
           if (!s.ok()) {
             VerificationAbort(shared,
                               "Secondary failed to catch up to the primary");
@@ -267,9 +275,11 @@ class NonBatchedOpsStressTest : public StressTest {
             assert(!pre_read_expected_values.empty() &&
                    static_cast<size_t>(i - start) <
                        pre_read_expected_values.size());
-            VerifyValueRange(static_cast<int>(cf), i, options, shared, from_db,
-                             /* msg_prefix */ "Secondary get verification", s,
-                             pre_read_expected_values[i - start]);
+            VerifyValueRange(
+                static_cast<int>(cf), i, options, shared, from_db,
+                /* msg_prefix */ "Secondary get verification, manifest: " +
+                    std::to_string(manifest_num),
+                s, pre_read_expected_values[i - start]);
           }
         }
       } else if (method == VerificationMethod::kGetEntity) {
@@ -1600,12 +1610,6 @@ class NonBatchedOpsStressTest : public StressTest {
     Slice ub_slice;
     ReadOptions ro_copy = read_opts;
 
-    // There is a narrow window in iterator auto refresh run where injected read
-    // errors are simply untraceable, ex. failure to delete file as a part of
-    // superversion cleanup callback invoked by the DBIter destructor.
-    bool ignore_injected_read_error_in_iter =
-        ro_copy.auto_refresh_iterator_with_snapshot;
-
     // Randomly test with `iterate_upper_bound` and `prefix_same_as_start`
     //
     // Get the next prefix first and then see if we want to set it to be the
@@ -1698,8 +1702,7 @@ class NonBatchedOpsStressTest : public StressTest {
               FaultInjectionIOType::kRead),
           fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
               FaultInjectionIOType::kMetadataRead));
-      if (!ignore_injected_read_error_in_iter &&
-          !SharedState::ignore_read_error && injected_error_count > 0 &&
+      if (!SharedState::ignore_read_error && injected_error_count > 0 &&
           s.ok()) {
         // Grab mutex so multiple thread don't try to print the
         // stack trace at the same time
@@ -1852,7 +1855,17 @@ class NonBatchedOpsStressTest : public StressTest {
       } else if (FLAGS_use_merge) {
         if (!FLAGS_use_txn) {
           if (FLAGS_user_timestamp_size == 0) {
-            s = db_->Merge(write_opts, cfh, k, v);
+            if (FLAGS_ingest_wbwi_one_in &&
+                thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) {
+              auto wbwi = std::make_shared<WriteBatchWithIndex>(
+                  options_.comparator, 0, /*overwrite_key=*/true);
+              s = wbwi->Merge(cfh, k, v);
+              if (s.ok()) {
+                s = db_->IngestWriteBatchWithIndex(write_opts, wbwi);
+              }
+            } else {
+              s = db_->Merge(write_opts, cfh, k, v);
+            }
           } else {
             s = db_->Merge(write_opts, cfh, k, write_ts, v);
           }
@@ -1864,7 +1877,17 @@ class NonBatchedOpsStressTest : public StressTest {
       } else {
         if (!FLAGS_use_txn) {
           if (FLAGS_user_timestamp_size == 0) {
-            s = db_->Put(write_opts, cfh, k, v);
+            if (FLAGS_ingest_wbwi_one_in &&
+                thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) {
+              auto wbwi = std::make_shared<WriteBatchWithIndex>(
+                  options_.comparator, 0, /*overwrite_key=*/true);
+              s = wbwi->Put(cfh, k, v);
+              if (s.ok()) {
+                s = db_->IngestWriteBatchWithIndex(write_opts, wbwi);
+              }
+            } else {
+              s = db_->Put(write_opts, cfh, k, v);
+            }
           } else {
             s = db_->Put(write_opts, cfh, k, write_ts, v);
           }
@@ -1882,6 +1905,17 @@ class NonBatchedOpsStressTest : public StressTest {
     } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
              initial_wal_write_may_succeed);
 
+    if ((s.IsDeadlock() || s.IsTimedOut()) &&
+        (FLAGS_use_multiget || FLAGS_use_multi_get_entity)) {
+      // Deadlock or timeout is ok, when multi get is tested. Because multi get
+      // tests execute MaybeAddKeyToTxnForRYW function which writes to the
+      // same key space but does not acquire stress test level mutex. So it is
+      // possible RocksDB returns deadlock or timeout. Return OK() for these
+      // cases
+      pending_expected_value.Rollback();
+      return Status::OK();
+    }
+
     if (!s.ok()) {
       pending_expected_value.Rollback();
       if (IsErrorInjectedAndRetryable(s)) {
@@ -1956,7 +1990,17 @@ class NonBatchedOpsStressTest : public StressTest {
         }
         if (!FLAGS_use_txn) {
           if (FLAGS_user_timestamp_size == 0) {
-            s = db_->Delete(write_opts, cfh, key);
+            if (FLAGS_ingest_wbwi_one_in &&
+                thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) {
+              auto wbwi = std::make_shared<WriteBatchWithIndex>(
+                  options_.comparator, 0, /*overwrite_key=*/true);
+              s = wbwi->Delete(cfh, key);
+              if (s.ok()) {
+                s = db_->IngestWriteBatchWithIndex(write_opts, wbwi);
+              }
+            } else {
+              s = db_->Delete(write_opts, cfh, key);
+            }
           } else {
             s = db_->Delete(write_opts, cfh, key, write_ts);
           }
@@ -2013,7 +2057,17 @@ class NonBatchedOpsStressTest : public StressTest {
         }
         if (!FLAGS_use_txn) {
           if (FLAGS_user_timestamp_size == 0) {
-            s = db_->SingleDelete(write_opts, cfh, key);
+            if (FLAGS_ingest_wbwi_one_in &&
+                thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) {
+              auto wbwi = std::make_shared<WriteBatchWithIndex>(
+                  options_.comparator, 0, /*overwrite_key=*/true);
+              s = wbwi->SingleDelete(cfh, key);
+              if (s.ok()) {
+                s = db_->IngestWriteBatchWithIndex(write_opts, wbwi);
+              }
+            } else {
+              s = db_->SingleDelete(write_opts, cfh, key);
+            }
           } else {
             s = db_->SingleDelete(write_opts, cfh, key, write_ts);
           }
@@ -3114,13 +3168,15 @@ class NonBatchedOpsStressTest : public StressTest {
 
       Status s;
 
+      ExpectedValue new_expected_value;
+
       switch (op) {
         case Op::PutOrPutEntity:
         case Op::Merge: {
           ExpectedValue put_value;
           put_value.SyncPut(static_cast<uint32_t>(thread->rand.Uniform(
               static_cast<int>(ExpectedValue::GetValueBaseMask()))));
-          ryw_expected_values[k] = put_value;
+          new_expected_value = put_value;
 
           const uint32_t value_base = put_value.GetValueBase();
 
@@ -3144,7 +3200,7 @@ class NonBatchedOpsStressTest : public StressTest {
         case Op::Delete: {
           ExpectedValue delete_value;
           delete_value.SyncDelete();
-          ryw_expected_values[k] = delete_value;
+          new_expected_value = delete_value;
 
           s = txn->Delete(cfh, k);
           break;
@@ -3153,6 +3209,20 @@ class NonBatchedOpsStressTest : public StressTest {
           assert(false);
       }
 
+      // It is possible that multiple thread concurrently try to write to the
+      // same key, which could cause lock timeout or deadlock in the
+      // transactiondb layer, before transaction is rolled back.
+      // E.g.
+      // Timestamp 1: Transaction A: lock key M for write
+      // Timestamp 2: Transaction B: lock key N for write
+      // Timestamp 3: Transaction B: try to lock key M for write -> wait
+      // Timestamp 4: Transaction A: try to lock key N for write -> deadlock
+      if (s.IsTimedOut() || s.IsDeadlock()) {
+        return;
+      }
+
+      ryw_expected_values[k] = new_expected_value;
+
       if (!s.ok()) {
         fprintf(stderr,
                 "Transaction write error in read-your-own-write test: %s\n",
diff --git a/docs/_data/authors.yml b/docs/_data/authors.yml
index 256f4c07ff65..0bc79ad80de6 100644
--- a/docs/_data/authors.yml
+++ b/docs/_data/authors.yml
@@ -1,3 +1,5 @@
+# Note: standardize on github user names here. fbid is optional and was used
+# to use author's profile picture from Facebook
 icanadi:
   full_name: Igor Canadi
   fbid: 706165749
@@ -26,7 +28,7 @@ lgalanis:
   full_name: Leonidas Galanis
   fbid: 8649950
 
-sdong:
+siying:
   full_name: Siying Dong
   fbid: 9805119
 
@@ -83,3 +85,19 @@ zjay:
 hx235:
   full_name: Hui Xiao
   fbid: 100037058588280
+
+pdillinger:
+  full_name: Peter Dillinger
+  fbid: 513108
+
+alanpaxton:
+  full_name: Alan Paxton
+
+akankshamahajan15:
+  full_name: Akanksha Mahajan
+
+anand1976:
+  full_name: Anand Ananthabhotla
+
+poojam23:
+  full_name: Pooja Malik
diff --git a/docs/_posts/2014-05-14-lock.markdown b/docs/_posts/2014-05-14-lock.markdown
index 12009cc88c11..66bf05dc4736 100644
--- a/docs/_posts/2014-05-14-lock.markdown
+++ b/docs/_posts/2014-05-14-lock.markdown
@@ -1,7 +1,7 @@
 ---
 title: Reducing Lock Contention in RocksDB
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/521/lock/
diff --git a/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown b/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown
index 6a641f23353c..ed03b0273233 100644
--- a/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown
+++ b/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown
@@ -1,7 +1,7 @@
 ---
 title: PlainTable — A New File Format
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/599/plaintable-a-new-file-format/
diff --git a/docs/_posts/2015-02-27-write-batch-with-index.markdown b/docs/_posts/2015-02-27-write-batch-with-index.markdown
index 7f9f77653655..770ee0581651 100644
--- a/docs/_posts/2015-02-27-write-batch-with-index.markdown
+++ b/docs/_posts/2015-02-27-write-batch-with-index.markdown
@@ -1,7 +1,7 @@
 ---
 title: 'WriteBatchWithIndex: Utility for Implementing Read-Your-Own-Writes'
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/1901/write-batch-with-index/
diff --git a/docs/_posts/2015-07-23-dynamic-level.markdown b/docs/_posts/2015-07-23-dynamic-level.markdown
index 0ff3a0542f82..1bc41b2fb3a4 100644
--- a/docs/_posts/2015-07-23-dynamic-level.markdown
+++ b/docs/_posts/2015-07-23-dynamic-level.markdown
@@ -1,7 +1,7 @@
 ---
 title: Dynamic Level Size for Level-Based Compaction
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/2207/dynamic-level/
diff --git a/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown b/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown
index b21b04fe3869..7e5eb03582d6 100644
--- a/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown
+++ b/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown
@@ -1,7 +1,7 @@
 ---
 title: Analysis File Read Latency by Level
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/2537/analysis-file-read-latency-by-level/
diff --git a/docs/_posts/2016-01-29-compaction_pri.markdown b/docs/_posts/2016-01-29-compaction_pri.markdown
index ba9ee627c91d..955e0849c95f 100644
--- a/docs/_posts/2016-01-29-compaction_pri.markdown
+++ b/docs/_posts/2016-01-29-compaction_pri.markdown
@@ -1,7 +1,7 @@
 ---
 title: Option of Compaction Priority
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/2921/compaction_pri/
diff --git a/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown b/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown
index 409015cc8c8c..927121bac173 100644
--- a/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown
+++ b/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown
@@ -1,7 +1,7 @@
 ---
 title: RocksDB 4.2 Release!
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/3017/rocksdb-4-2-release/
diff --git a/docs/_posts/2016-02-25-rocksdb-ama.markdown b/docs/_posts/2016-02-25-rocksdb-ama.markdown
index 2ba04f39a18e..31792552fc29 100644
--- a/docs/_posts/2016-02-25-rocksdb-ama.markdown
+++ b/docs/_posts/2016-02-25-rocksdb-ama.markdown
@@ -1,7 +1,7 @@
 ---
 title: RocksDB AMA
 layout: post
-author: yhchiang
+author: yhciang
 category: blog
 redirect_from:
   - /blog/3065/rocksdb-ama/
diff --git a/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown b/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown
index 247768d307b4..b29a9bd3649f 100644
--- a/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown
+++ b/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown
@@ -1,7 +1,7 @@
 ---
 title: RocksDB 4.5.1 Released!
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/3179/rocksdb-4-5-1-released/
diff --git a/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown b/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown
index 87c20eb47d43..11760cc82560 100644
--- a/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown
+++ b/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown
@@ -1,7 +1,7 @@
 ---
 title: RocksDB 4.11.2 Released!
 layout: post
-author: sdong
+author: siying
 category: blog
 ---
 We abandoned release candidates 4.10.x and directly go to 4.11.2 from 4.9, to make sure the latest release is stable. In 4.11.2, we fixed several data corruption related bugs introduced in 4.9.0.
diff --git a/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown b/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown
index c6ce27d64db4..87fe0c050e0b 100644
--- a/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown
+++ b/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown
@@ -1,7 +1,7 @@
 ---
 title: RocksDB 5.2.1 Released!
 layout: post
-author: sdong
+author: siying
 category: blog
 ---
 
diff --git a/docs/_posts/2021-04-12-universal-improvements.markdown b/docs/_posts/2021-04-12-universal-improvements.markdown
index fa4e9d463b23..f6bf64b2da8e 100644
--- a/docs/_posts/2021-04-12-universal-improvements.markdown
+++ b/docs/_posts/2021-04-12-universal-improvements.markdown
@@ -1,7 +1,7 @@
 ---
 title: (Call For Contribution) Make Universal Compaction More Incremental
 layout: post
-author: sdong
+author: siying
 category: blog
 ---
 
diff --git a/docs/_posts/2021-05-26-online-validation.markdown b/docs/_posts/2021-05-26-online-validation.markdown
index 33e9dfc151ac..9314630b0705 100644
--- a/docs/_posts/2021-05-26-online-validation.markdown
+++ b/docs/_posts/2021-05-26-online-validation.markdown
@@ -1,7 +1,7 @@
 ---
 title: Online Validation
 layout: post
-author: sdong
+author: siying
 category: blog
 ---
 To prevent or mitigate data corrution in RocksDB when some software or hardware issues happens, we keep adding online consistency checks and improving existing ones.
diff --git a/docs/_posts/2025-09-24-unified-memory-tracking.markdown b/docs/_posts/2025-09-24-unified-memory-tracking.markdown
new file mode 100644
index 000000000000..dba0ca488eb8
--- /dev/null
+++ b/docs/_posts/2025-09-24-unified-memory-tracking.markdown
@@ -0,0 +1,59 @@
+---
+title: Unified Memory Tracking
+layout: post
+author: hx235
+category: blog
+---
+
+## Context / Problem
+Modern RocksDB deployments often run in environments with strict memory constraints—cloud VMs, containers, or hosts with hundreds of DB instances. Unpredictable memory usage can lead to out-of-memory (OOM) errors, degraded performance, or even service outages.
+Historically, while the block cache was the main source of memory usage, other components—such as memtables, table readers, file metadata, and temporary buffers—could consume significant memory outside the block cache’s control. This made it difficult for users to set a single memory limit and guarantee resource usage stays within expectations.
+
+## Goal
+The goal of recent memory tracking work in RocksDB is to enable users to cap the total memory usage of RocksDB instances under a single, configurable limit—the block cache capacity. This is achieved by:
+- **Tracking and charging** all major memory consumers (memtables, table readers, file metadata, compression buffers, filter construction) to the block cache.
+- **Evicting** data blocks or other memory when the total tracked usage exceeds the configured limit.
+- **Providing a fixed memory footprint** for RocksDB, making it easier to run in resource-constrained environments and avoid OOMs.
+
+## Memtable Memory Charging
+A major source of memory usage in RocksDB is the memtable. To ensure memtable memory is tracked and capped under a single limit, RocksDB provides the WriteBufferManager (WBM). When WBM is configured with a block cache, memtable memory usage is charged to the block cache. This helps prevent OOM errors and simplifies resource management.
+
+```cpp
+std::shared_ptr<Cache> cache = HyperClockCacheOptions(capacity).MakeSharedCache();;
+DBOptions db_options;
+db_options.write_buffer_manager = std::make_shared<WriteBufferManager>(.., cache);
+```
+
+## Other Memory Charging
+Beyond memtables, RocksDB allows users to control memory charging for other internal roles using the cache_usage_options API. This provides fine-grained control over how memory is tracked for components like table readers, file metadata, compression dictionary buffers (`CompressionOptions::max_dict_buffer_bytes:`) and filter construction.
+
+```cpp
+struct CacheEntryRoleOptions {
+  enum class Decision {
+    kEnabled,
+    kDisabled,
+    kFallback,
+  };
+  Decision charged = Decision::kFallback;
+};
+struct CacheUsageOptions {
+  CacheEntryRoleOptions options;
+  std::map<CacheEntryRole, CacheEntryRoleOptions> options_overrides;
+};
+
+...
+BlockBasedTableOptions table_options;
+table_options.cache_usage_options.options.charged = CacheEntryRoleOptions::Decision::kFallback;
+table_options.cache_usage_options.options_overrides[CacheEntryRole::kTableBuilder] = {
+  .charged = CacheEntryRoleOptions::Decision::kEnabled,
+};
+```
+
+Default (`Decision::kFallback`) behavior for each memory type:
+- `CacheEntryRole::kCompressionDictionaryBuildingBuffer`: `kEnabled`
+- `CacheEntryRole::kFilterConstruction`: `kDisabled`
+- `CacheEntryRole::kBlockBasedTableReader`: `kDisabled`
+- `CacheEntryRole::kFileMetadata`: `kDisabled`
+
+## Monitoring and Observability
+RocksDB provides built-in statistics to help users monitor memory usage and cache behavior. The `DB::Properties::kBlockCacheEntryStats` exposes detailed statistics about block cache entries, including breakdowns by each `CacheEntryRole`. These statistics are essential for understanding memory consumption and tuning cache configuration.
diff --git a/docs/_posts/2025-09-25-io-tagging.markdown b/docs/_posts/2025-09-25-io-tagging.markdown
new file mode 100644
index 000000000000..14651d03f0e9
--- /dev/null
+++ b/docs/_posts/2025-09-25-io-tagging.markdown
@@ -0,0 +1,74 @@
+---
+title: IO Activity Tagging
+layout: post
+author: hx235
+category: blog
+---
+
+## Context
+
+RocksDB performs a variety of IO operations—user reads, background compactions, flushes, database opens, and verification tasks. Treating all these operations the same makes it difficult for file system implementers to optimize performance, prioritize latency-sensitive IOs, and diagnose bottlenecks. To solve that, RocksDB internally tags every IO operation with its activity type using the `IOActivity` enum. This automatic tagging provides precise context for each IO, enabling file systems to make smarter, context-aware decisions for scheduling, caching, and resource management.
+
+## How Internal IO Tagging Works
+RocksDB automatically assigns an `IOActivity` tag to each IO operation. This tag is propagated through the storage stack and included in the IO options passed to the file system.
+
+```cpp
+enum class IOActivity : uint8_t {
+    kFlush = 0,                        // IO for flush operations (background write)
+    kCompaction = 1,                   // IO for compaction (background read/write)
+    kDBOpen = 2,                       // IO during database open (read/write)
+    kGet = 3,                          // User Get() read
+    kMultiGet = 4,                     // User MultiGet() read
+    kDBIterator = 5,                   // User iterator read
+    kVerifyDBChecksum = 6,             // Verification: DB checksum
+    kVerifyFileChecksums = 7,          // Verification: file checksums
+    kGetEntity = 8,                    // Entity Get (e.g., wide-column)
+    kMultiGetEntity = 9,               // Entity MultiGet
+    kGetFileChecksumsFromCurrentManifest = 10, // Manifest checksum reads
+    // 0x80–0xFE: Reserved for custom/internal use
+    kUnknown = 0xFF                    // Unknown/unspecified activity
+};
+```
+
+## Access IO Tag in File System
+Custom file systems can access the IOActivity tag via the IO options structure provided by RocksDB. This allows them to optimize behavior based on the specific IO activity.
+
+```cpp
+Status CustomFileSystem::Append(uint64_t offset, const Slice& data, const IOOptions& io_opts, ...) {
+    switch (io_opts.io_activity) {
+        case Env::IOActivity::kGet:
+            // Prioritize or cache user reads
+            break;
+        case Env::IOActivity::kCompaction:
+            // Throttle or deprioritize background compaction IO
+            break;
+        case Env::IOActivity::kDBOpen:
+            // Track or optimize DB open IO
+            break;
+        // ... handle other activities ...
+        default:
+            // Default handling
+            break;
+    }
+}
+```
+## IO Activity Statistics in RocksDB
+RocksDB provides detailed histograms for IO activities, allowing you to analyze both the aggregate time spent (in microseconds) and the count of IOs for each activity type.
+```cpp
+// Read Histograms
+FILE_READ_FLUSH_MICROS
+FILE_READ_COMPACTION_MICROS
+FILE_READ_DB_OPEN_MICROS
+FILE_READ_GET_MICROS
+FILE_READ_MULTIGET_MICROS
+FILE_READ_DB_ITERATOR_MICROS
+FILE_READ_VERIFY_DB_CHECKSUM_MICROS
+FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS
+
+// Write Histograms
+FILE_WRITE_FLUSH_MICROS
+FILE_WRITE_COMPACTION_MICROS
+FILE_WRITE_DB_OPEN_MICROS
+```
+
+Thanks to Maciej Szeszko and Andrew Chang from the RocksDB team for their contributions in expanding and maintaining the IOActivity enum.
diff --git a/docs/_posts/2025-10-08-parallel-compression-revamp.markdown b/docs/_posts/2025-10-08-parallel-compression-revamp.markdown
new file mode 100644
index 000000000000..42386e5c941a
--- /dev/null
+++ b/docs/_posts/2025-10-08-parallel-compression-revamp.markdown
@@ -0,0 +1,89 @@
+---
+title: "Parallel Compression Revamp: Dramatically Reduced CPU Overhead"
+layout: post
+author: pdillinger
+category: blog
+---
+
+The upcoming RocksDB 10.7 release includes a major revamp of parallel compression that **dramatically reduces the feature's CPU overhead by up to 65%** while maintaining or improving throughput for compression-heavy workloads. We expect this to broaden the set of workloads that could benefit from parallel compression, especially for **bulk SST generation and remote compaction use cases** that are less sensitive to CPU responsiveness.
+
+## Background
+
+Parallel compression in RocksDB (`CompressionOptions::parallel_threads > 1`) allows multiple threads to compress different blocks simultaneously during SST file generation, which can significantly improve compaction throughput for workloads where compression is a bottleneck. However, the original implementation had substantial CPU overhead that often outweighed the benefits, limiting its practical adoption.
+
+## What's New: A Complete Reimplementation
+
+The parallel compression framework has been completely rewritten from the ground up in [pull request #13910](https://github.com/facebook/rocksdb/pull/13910) to address the core inefficiencies:
+
+### Ring Buffer Architecture
+Instead of separate compression and write queues with complex thread coordination, the new implementation uses a ring buffer of blocks-in-progress that enables efficient work distribution across threads. This bounds working memory while enabling high throughput with minimal cross-thread synchronization.
+
+![Ring Buffer Architecture](/static/images/parallel-compression/ring-buffer-architecture.svg)
+
+### Work-Stealing Design
+Previously, the calling thread could only generate uncompressed blocks, dedicated compression threads could only compress, and a writer thread could only write the SST file to storage. Now, all threads can participate in compression work in a quasi-work-stealing manner, dramatically reducing the need for threads to block waiting for work. While only one thread (the calling thread or "emit thread") can generate uncompressed SST blocks in the new implementation, feeding compression work to other threads and itself, all other threads are compatible with writing compressed blocks to storage.
+
+### Auto-Scaling Thread Management
+The ring buffer enables another key feature: auto-scaling of active threads based on ring buffer utilization. The framework intelligently wakes up idle worker threads only when there's sufficient work to justify the overhead, achieving near-maximum throughput while minimizing CPU waste from unnecessary thread wake-ups.
+
+### Lock-Free Synchronization
+The entire framework is now lock-free (and wait-free as long as compatible work units are available for each thread), based primarily on atomic operations. To cleanly pack and leverage many data fields into a single atomic value, I've developed a new `BitFields` utility API. This is proving useful for cleaning up the HyperClockCache implementation as well, and will be the topic of a later blog post.
+
+Semaphores are used for lock-free management of idle threads (assuming a lock-free semaphore implementation, which is likely the case with `ROCKSDB_USE_STD_SEMAPHORES` but that is untrustworthy; see below).
+
+## Performance Improvements
+
+The results speak for themselves. Here's a comparison using `db_bench` fillseq benchmarks with various compression configurations:
+
+### ZSTD Compression (Default Level)
+Note:
+* "throughput" = how quickly a given CPU-bound flush or compaction can complete
+* "CPU increase" = total CPU usage in amount of time that each core was used
+* "PT" = parallel_threads setting.
+
+**Before:**
+- PT=3: ~38% throughput increase for ~73% CPU increase
+- PT=6: No throughput increase for ~70% CPU increase
+
+**After:**
+- PT=3: ~58% throughput increase for ~25% CPU increase
+- PT=6: ~58% throughput increase for ~28% CPU increase
+
+### High Compression Scenarios
+For ZSTD compression level 8, the improvements are even more dramatic:
+
+**Before:**
+- PT=4: 2.6x throughput increase for 139% CPU increase
+- PT=8: 3.6x throughput increase for 135% CPU increase
+
+**After:**
+- PT=4: 2.8x throughput increase for 114% CPU increase
+- PT=8: 3.7x throughput increase for 116% CPU increase
+
+## Compression Algorithm Optimizations
+
+Alongside the parallel compression revamp, some optimizations have gone into the underlying compression implementations/integrations. Most notably, **LZ4HC received dramatic performance improvements** through better reuse of internal data structures between compression calls (detailed in [pull request #13805](https://github.com/facebook/rocksdb/pull/13805)). A small regression in LZ4 performance from that change was fixed in [pull request #14017](https://github.com/facebook/rocksdb/pull/14017).
+
+While **ZSTD remains the gold standard** for medium-to-high compression ratios in RocksDB, these LZ4HC optimizations make it an increasingly attractive option for read-heavy workloads where LZ4's faster decompression can provide overall performance benefits.
+
+## Production Ready
+
+With these efficiency improvements, parallel compression is now considered **production-ready**. The feature has been thoroughly tested in both unit tests and stress testing, including validation on high-load scenarios with hundreds of concurrent compression jobs and thousands of threads.
+
+Some notes on current limitations:
+- Parallel compression is currently incompatible with `UserDefinedIndex` and with the deprecated `decouple_partitioned_filters=false` setting
+- Maximum performance is available with `-DROCKSDB_USE_STD_SEMAPHORES` at compile time, though this is not currently recommended due to reported bugs in some implementations of C++20 semaphores
+
+## Configuration Recommendations
+
+The dramatically reduced CPU overhead means parallel compression is now viable for a broader range of workloads, particularly those using higher compression levels or compression-heavy scenarios like time-series data. However, simply enabling parallel compression could result in more *spiky* CPU loads for hosts serving live DB data. **Parallel compression might be most useful for bulk SST file generation and/or remote compaction workloads** because they are less sensitive to CPU responsiveness. In these scenarios there is little danger in setting `parallel_threads=8` even with the possibility of over-subscribing CPU cores, though the potentially safer "sweet spot" is typically around `parallel_threads=3`, depending on compression level, etc.
+
+## Limitations and Future
+
+Although this offers a great improvement in the implementation of an existing option, we recognize that this setup is suboptimal in a number of ways:
+* There is no work sharing / thread pooling for these SST compression/writer threads among compactions in the same process, so not well able to fit the workload to available CPU cores and not able to use other SST file compression work to avoid a worker thread going to sleep.
+* We are not (yet) using a framework that would allow micro-work sharing with things other than SST generation on a set of threads. That would be a good direction for effective sharing of CPU resources without spikes in usage, but might incur intolerable CPU overhead in managing work. With this "hand optimized" and specialized framework, we can at least evaluate such future endeavors against a perhaps ideal framework in terms of parallelizing with minimal overhead.
+
+## Try It Out
+
+Parallel compression revamp will be available in RocksDB 10.7. As always, we recommend testing in your specific environment to determine the optimal configuration for your workload.
diff --git a/docs/_posts/2025-12-31-bit-fields-api.markdown b/docs/_posts/2025-12-31-bit-fields-api.markdown
new file mode 100644
index 000000000000..40d1b60f5326
--- /dev/null
+++ b/docs/_posts/2025-12-31-bit-fields-api.markdown
@@ -0,0 +1,279 @@
+---
+title: "BitFields API: Type-Safe Bit Packing for Lock-Free Data Structures"
+layout: post
+author: pdillinger
+category: blog
+---
+
+Modern concurrent data structures increasingly rely on [atomic operations](https://en.cppreference.com/w/cpp/atomic/atomic) to avoid the overhead of locking. A valuable but under-utilized technique for maximizing the effectiveness of atomic operations is [bit packing](https://en.wikipedia.org/wiki/Bit_field)---fitting multiple logical fields into a single atomic variable for algorithmic simplicity and efficiency. However, language support for bit packing does not guarantee dense packing, and manually managing bit manipulation quickly becomes error-prone, especially when dealing with complex state machines.
+
+To address this in RocksDB, we have developed a reusable **BitFields API**, a type-safe, zero-overhead abstraction for bit packing in C++. This works in conjunction with clean wrappers for `std::atomic` for powerful and relatively safe bit-packing of atomic data. For broader use, a [variant of the code](https://github.com/facebook/folly/pull/2549) has been proposed for adding to folly.
+
+## The Problem: Managing Packed Bit Fields
+
+Consider HyperClockCache, an essentially lock-free cache implementation in RocksDB, which was [refactored to use this BitFields API](https://github.com/facebook/rocksdb/pull/14154). It is a hash table built on *slots* that can each hold a cache entry and relevant metadata. For atomic simplicity and efficiency, all the essential metadata for each slot is packed into a single 64-bit value:
+- The reference count and eviction metadata are together encoded into *acquire* and *release* counters, 30 bits each.
+- The possible states of {*empty*, *under construction/destruction*, *occupied+visible*, and *occupied+invisible*} are encoded into three state bits (instead of two, for easier decoding and manipulation).
+- A *hit* bit is used for secondary cache integration.
+
+Traditionally, you might write code like this:
+
+```cpp
+// Old approach: manual bit manipulation
+constexpr uint64_t kAcquireCounterShift = 0;
+constexpr uint64_t kReleaseCounterShift = 30;
+constexpr uint64_t kCounterMask = 0x3FFFFFFF;
+constexpr uint64_t kHitBitShift = 60;
+constexpr uint64_t kOccupiedShift = 61;
+constexpr uint64_t kShareableShift = 62;
+constexpr uint64_t kVisibleShift = 63;
+constexpr uint64_t kStateShift = kOccupiedShift;
+
+std::atomic<uint64_t> meta_;
+
+bool IsUnderConstruction(uint64_t meta) const {
+    return (meta & (uint64_t{1} << kOccupiedShift)) && !(meta & (uint64_t{1} << kShareableShift));
+}
+
+// Getting fields
+uint64_t meta = meta_.load(std::memory_order_acquire);
+if (IsUnderConstruction(meta)) {
+  // ...
+} else if ((meta >> kVisibleShift) & 1) {
+  uint32_t refcount =
+      static_cast<uint32_t>(((meta >> kAcquireCounterShift) -
+                             (meta >> kReleaseCounterShift)) & kCounterMask);
+  // ...
+}
+
+
+// Setting fields
+
+// Set the hit bit (relaxed)
+meta_.fetch_or(uint64_t{1} << kHitBitShift, std::memory_order_relaxed);
+
+// Set both counters to `new_count` (as in eviction processing)
+uint64_t meta = meta_.load(std::memory_order_relaxed);
+uint64_t new_meta =
+    (meta & ((uint64_t{1} << kHitBitShift) | (uint64_t{7} << kStateShift))) |
+    (new_count << kReleaseCounterShift) |
+    (new_count << kAcquireCounterShift);
+bool success = meta_.compare_exchange_strong(meta, new_meta,
+                                             std::memory_order_acq_rel);
+
+// Increment acquire counter by initial_countdown
+old_meta = meta_.fetch_add((uint64_t{1} << kAcquireCounterShift) * initial_countdown,
+                           std::memory_order_acq_rel);
+```
+
+This approach has several problems:
+1. **Error-prone**: Easy to get masks and shifts wrong
+2. **Maintenance burden**: Changes to field sizes require updating multiple constants
+3. **Abstraction challenges**: Even if writing a full set of well-tested getters and setters to hide all the details, details can leak in to do things like update multiple fields in one non-CAS (compare-and-swap) atomic operation.
+
+## New Solution: BitFields API
+
+The BitFields API provides a declarative, type-safe way to define bit-packed structures. Here's how the same example looks with BitFields:
+
+```cpp
+// New approach: declarative bit fields. (Each field must reference the
+// previous, so that the declaration machinery is simply stateless.)
+struct SlotMeta : public BitFields<uint64_t, SlotMeta> {
+  using AcquireCounter = UnsignedBitField<SlotMeta, 30, NoPrevBitField>;
+  using ReleaseCounter = UnsignedBitField<SlotMeta, 30, AcquireCounter>;
+  using HitFlag = BoolBitField<SlotMeta, ReleaseCounter>;
+  using OccupiedFlag = BoolBitField<SlotMeta, HitFlag>;
+  using ShareableFlag = BoolBitField<SlotMeta, OccupiedFlag>;
+  using VisibleFlag = BoolBitField<SlotMeta, ShareableFlag>;
+
+  // Convenience helpers
+  bool IsUnderConstruction() const {
+    return Get<OccupiedFlag>() && !Get<ShareableFlag>();
+  }
+};
+
+BitFieldsAtomic<SlotMeta> meta_;
+
+// Getting fields
+SlotMeta state = meta_.Load();
+if (state.IsUnderConstruction()) {
+  // ...
+} else if (state.Get<SlotMeta::VisibleFlag>()) {
+  uint32_t refcount = state.Get<SlotMeta::AcquireCounter>() -
+                      state.Get<SlotMeta::ReleaseCounter>();
+  // ...
+}
+
+// Setting fields
+
+// Set the hit bit (relaxed)
+meta_.ApplyRelaxed(SlotMeta::HitFlag::SetTransform());
+
+// Set both counters to `new_count` (as in eviction processing)
+SlotMeta meta = meta_.LoadRelaxed();
+SlotMeta new_meta = meta;
+new_meta.Set<SlotMeta::ReleaseCounter>(new_count);
+new_meta.Set<SlotMeta::AcquireCounter>(new_count);
+meta_.CasStrongRelaxed(meta, new_meta);
+
+// Increment acquire counter by initial_countdown
+auto add_acquire =
+    AcquireCounter::PlusTransformPromiseNoOverflow(initial_countdown);
+meta_.Apply(add_acquire, &old_meta);
+
+// Bonus: Atomic multi-field updates without compare-exchange
+auto transform = AcquireCounter::PlusTransformPromiseNoOverflow(1) +
+                 ReleaseCounter::PlusTransformPromiseNoOverflow(1);
+meta_.Apply(transform);
+```
+
+## Key Features
+
+### Type Safety and Self-Documentation
+
+Each field has a specific type (`bool` for `BoolBitField`, appropriately-sized unsigned int for `UnsignedBitField`) and clear semantic meaning. The field definitions are self-documenting: you can immediately see how many bits each field occupies and in what order.
+
+### [Zero Overhead](https://en.cppreference.com/w/cpp/language/Zero-overhead_principle)
+
+Because of heavy use of templates and constexpr operations and the ability to satisfy multiple field reads or writes from a single atomic operation, we have seen no runtime overhead vs. hand-written bit manipulation, in RocksDB. In one case, we verified the assembly code was identical.
+
+[For folly's LifoSem](https://github.com/facebook/folly/pull/2550), there was one case where an optimization hack with detected overflow from one field to another couldn't be replicated as efficiently with the BitFields API because it would violate overflow checking. For that case I dove into the underlying representation to bypass the BitFields overflow check.
+
+### Atomic Operations with Transforms
+
+One of the most powerful features is the ability to combine multiple field updates into a single atomic operation using "transforms", if they are all either (a) some combination of addition and subtraction, (b) bitwise-and, or (c) bitwise-or. For example:
+
+```cpp
+// Clear several but not all fields atomically
+auto and_transform = Field1::AndTransform(0) +
+                 Field2::ClearTransform() +
+                 Field4::ClearTransform();
+atomic_bitfields.Apply(and_transform, &old_state, &new_state);
+...
+// Set more than one boolean field atomically
+auto or_transform = Field2::SetTransform() +
+                 Field4::SetTransform();
+atomic_bitfields.Apply(or_transform, &old_state, &new_state);
+...
+auto add_transform = Field1::PlusTransformPromiseNoOverflow(1) +
+                     Field3::MinusTransformPromiseNoUnderflow(1);
+atomic_bitfields.Apply(add_transform, &old_state, &new_state);
+```
+
+Each `Apply()` generates a single atomic operation (e.g., `fetch_add` or `fetch_or`) that updates all the specified fields, and optionally returns both the old and new values. This enables a number of hacks for atomic updates without CAS.
+
+### Overflow Protection
+
+The API includes built-in overflow detection in debug builds:
+
+```cpp
+// An assertion will fail in debug builds if the counter overflows
+auto transform = Counter::PlusTransformPromiseNoOverflow(value);
+atomic.Apply(transform);
+```
+
+For fields at the top of the underlying representation (where overflow doesn't affect other fields), overflow is explicitly ignored. (A compile time error is generated if you try to use `PlusTransformPromiseNoOverflow` on a field at the top of the representation or `PlusTransformIgnoreOverflow` on a field not at the top of the representation.)
+
+```cpp
+// For wraparound counters
+auto transform = Counter::PlusTransformIgnoreOverflow(value);
+```
+
+This capability is used in a folly data structure called LifoSem, which [I have proposed to refactor](https://github.com/facebook/folly/pull/2550) to a proposed BitFields API variant for folly.
+
+### Compare-and-Swap (CAS) Support
+
+The atomic wrappers provide full CAS support for lock-free algorithms:
+
+```cpp
+SlotMeta expected = current_state;
+SlotMeta desired = expected.With<Field1>(new_value).With<Field2>(true);
+if (meta_.CasStrong(expected, desired)) {
+  // Successfully updated
+  ...
+}
+```
+
+### Atomic wrappers
+
+The BitFields API includes two atomic wrappers: `RelaxedBitFieldsAtomic` and `BitFieldsAtomic`. However, RocksDB also has versions of these wrappers for regular `std::atomic` variables that help with memory ordering discipline: `RelaxedAtomic` and `Atomic` in `util/atomic.h`.
+
+These wrappers help in a couple of ways:
+* **Self-document intended memory order**: An atomic field generally has a single memory order that all or most operations should use, typically either `std::memory_order_relaxed` or `std::memory_order_acq_rel`.
+* **More intentional memory orders and atomic operations**: The standard library's implicit conversions and default memory ordering (`memory_order_seq_cst`) make it easy to accidentally use sequential consistency with acquire/release ordering or even relaxed, which could hurt performance, and tend to hide where atomic operations are actually happening (e.g. implicit vs. explicit load).
+
+For example, instead of writing:
+```cpp
+std::atomic<uint64_t> stat_counter;
+stat_counter++;  // Uses memory_order_seq_cst implicitly - maybe inefficient
+```
+
+You write:
+```cpp
+RelaxedAtomic<uint64_t> stat_counter;
+stat_counter.FetchAddRelaxed(1);  // Explicitly relaxed - appropriate for a diagnostic counter
+```
+
+Or for data providing synchronization:
+```cpp
+Atomic<size_t> refcount;
+refcount.FetchAdd(1);  // Standard acquire-release semantics for coordinating with other threads
+```
+
+These wrappers complement the BitFields atomic wrappers by providing the same ordering discipline for non-packed atomic variables throughout much of RocksDB, creating a more readable and less clunky approach to concurrent programming. Migrating remaining uses of `std::atomic` is an ongoing effort.
+
+## Real-World Usage in RocksDB
+
+The BitFields API was developed along with the revamped parallel compression in RocksDB, but with the intention to also clean up the HyperClockCache (HCC) implementation. With that migration complete, we can see the benefits. Specifically, **by packing more of the state machine into a single atomic value, the parallel algorithms became both simpler and more efficient.** Concurrent algorithms that could have blown up in their state space with elaborate interleavings between threads trying not to block each other, e.g. because of multi-step consensus on work assignments, were instead able to quickly and more easily make progress, e.g. with atomically clear work assignments.
+
+### Before: Manual Bit Manipulation
+
+The old HCC code was difficult to read and maintain. Many of the common read and update operations had manually written helper functions, but it was not practical to develop the full set of functions needed for rare cases. Consider this code that clears the "visible" flag on a slot when an entry is erased from subsequent lookups but might still be referenced:
+
+```cpp
+// Old HCC code, without atomic wrappers
+uint64_t old_meta =
+        h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit}
+                                   << ClockHandle::kStateShift), std::memory_order_acq_rel);
+// Apply update to local copy
+uint64_t new_meta = old_meta & ~(uint64_t{ClockHandle::kStateVisibleBit}
+                            << ClockHandle::kStateShift);
+
+// New HCC code
+SlotMeta old_meta, new_meta;
+h->meta.Apply(SlotMeta::VisibleFlag::ClearTransform(), &old_meta, &new_meta);
+```
+
+Or this assertion that the acquire and release counters are different:
+
+```cpp
+// Old HCC code
+uint64_t old_meta = ...;
+assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
+        ClockHandle::kCounterMask) !=
+        ((old_meta >> ClockHandle::kReleaseCounterShift) &
+        ClockHandle::kCounterMask));
+
+// New HCC code without single-purpose helper functions
+SlotMeta old_meta = ...;
+assert(old_meta.Get<SlotMeta::AcquireCounter>() !=
+       old_meta.Get<SlotMeta::ReleaseCounter>());
+
+// New HCC code, with single-purpose helper functions
+SlotMeta old_meta = ...;
+assert(old_meta.GetAcquireCounter() != old_meta.GetReleaseCounter());
+```
+
+Some hand-written helper functions or using directives are still useful for brevity, but even without them all the bit manipulation details are hidden in the BitFields implementation.
+
+## Future Directions
+
+We hope the proposed folly version is accepted to make the BitFields API available for broader usage. Additionally, some quality-of-life improvements are likely possible, perhaps including easier declaration and usage syntax, hopefully without delving into boost-like macro hell. Better runtime and compile time checks might also be possible.
+
+## Conclusion
+
+The BitFields API demonstrates that zero-overhead abstractions can significantly improve code quality without sacrificing performance. By providing type safety, self-documentation, and convenience features around bit manipulation and atomic operations, it makes lock-free programming more accessible and maintainable. Bit-packed atomics are arguably essential for *slaying the complexity dragon* of efficient lock-free and low-lock algorithms, because they reduce explosion in algorithm states.
+
+For RocksDB specifically, the migration to BitFields has made the HyperClockCache implementation substantially easier to understand and modify, while maintaining the same high-performance characteristics. Combined with the recent [parallel compression revamp](/blog/2025/10/08/parallel-compression-revamp.html), these improvements showcase our ongoing commitment to writing clean, efficient, and maintainable code.
+
+The BitFields API is available in RocksDB's util/bit_fields.h and can be adapted for use in other projects requiring efficient, type-safe bit packing. For those building high-performance concurrent systems, it offers a compelling alternative to manual bit manipulation—proving that safe abstractions and peak performance are not mutually exclusive.
diff --git a/docs/_posts/2026-02-17-cpu-bug.markdown b/docs/_posts/2026-02-17-cpu-bug.markdown
new file mode 100644
index 000000000000..7147ca74dc6b
--- /dev/null
+++ b/docs/_posts/2026-02-17-cpu-bug.markdown
@@ -0,0 +1,46 @@
+---
+title: "RocksDB development finds a CPU bug"
+layout: post
+author: pdillinger
+category: blog
+---
+
+This is the story of how a RocksDB unit test I added four years ago, a mini-stress test you might call it, revealed [a novel hardware bug in a newer CPU](https://www.amd.com/en/resources/product-security/bulletin/amd-sb-7055.html). It was scary enough to be assigned a "high severity" CVE.
+
+## Background: Unique Identifiers
+About four years ago, we [added unique identifiers to SST files](https://github.com/facebook/rocksdb/pull/9126) to give them stable identifiers across different filesystems for caching purposes. Part of the motivation here was to eliminate our dependence on the uniqueness and non-recycling of unique identifiers on files provided by the OS filesystem. (Some filesystems were only [guaranteeing uniqueness among existing files, not among all files even in recent history](https://github.com/facebook/rocksdb/issues/7405#issuecomment-694595587).) I would call this dependency problem the *great tension* between reusing existing solutions and code self-reliance. You don't want to duplicate others' work but you also don't want to be subject to their bugs or changing / misaligned requirements. Striking this balance can be tricky, but in this case it was clear to us that we didn't want to rely on all the possible filesystems providing quality unique identifiers.
+
+If you're comfortable with large random numbers (e.g. 128 bits), you probably agree that persisting random identifiers (or [quasi-random](https://github.com/pdillinger/unique_id/blob/main/README.md), which [I helped formalize in a paper](https://dl.acm.org/doi/10.1145/3584372.3588674), [also on arXiv](https://arxiv.org/abs/2304.07109)) with each file would be safer and more predictable than relying so crucially on a minor feature of OS filesystems.
+
+## High Quality Randomness
+However, that assumes we have access to *high quality* random numbers (at least a good one or two to start from - see the paper). Because RocksDB intends to be cross-platform, we want to minimize platform-specific dependencies and prefer cross-platform dependencies. But that could easily land us back where we didn't want to be: susceptible to a bug or hiccup in one implementation of what we needed.
+
+Fortunately, the nature of random entropy allows *combining* sources so that your result is as good as your *best* input source, so even if one is bad, you only have a problem if they're all bad. And we had the advantages that (a) we only needed uniqueness, not security, which reduced the need for extra scrutiny and allowed us to use the quasi-random approach, and (b) the quasi-random approach minimized the amount of entropy needed, so the performance cost of acquiring each unit of entropy was almost inconsequential. Therefore, I combined these sources of entropy:
+
+* C++11's [std::random_device](https://en.cppreference.com/w/cpp/numeric/random/random_device.html) which is supposed to provide high quality but is allowed not to.
+* A hash of various environment parameters including hostname, process id, thread id, and various macro and micro time readings.
+* Platform-specific UUID generator (Linux and Windows only)
+
+## Trust But Verify
+To verify the quality of each of these sources on an ongoing basis, [I added unit tests](https://github.com/facebook/rocksdb/pull/8708) that used many threads to create thousands of unique identifiers based on one of the above sources at a time and verified their uniqueness. For a high quality source, the probability of any duplicate 128-bit IDs among thousands is negligible, even if running these tests continuously for decades.
+
+## That's Weird
+That was pretty much the story until some months ago the test based on `std::random_device` failed, once. It was quite suspicious because the number of unique IDs was not just one short of expectation, it was dozens or hundreds short. However, even that could be explained by a random CPU hiccup or bit flip in which we generated fewer IDs to begin with. (You might have noticed an increasing amount of RocksDB development effort and portion of CPU time going into checks that are logically redundant but exist to detect CPU miscalculations before the corruption propagates too far.)
+
+But then it failed again about a month later. No failures for four years, then two failures in two months. This smelled really bad. Digging into the details I noticed a crucial correlation: both of the failed test jobs had run on the same type of hardware, though in completely different data centers.
+
+From there I did the natural thing for an engineer: scale it up to try to reproduce the failure. And that was remarkably easy. By increasing the number of threads in the job to around the number of cores it would fail quickly and consistently on all systems using the same type of newer CPU, and pass on everything else. I tested some variants of this to establish some more details, including
+
+* `std::random_device` using "rdrand" and "/dev/urandom" sources were not affected, and
+* libc++ (from clang) was not affected, only libstdc++ (from GCC)
+
+## Root Cause Analysis
+From there Meta colleagues investigated the low-level details. They found the problem to be that the RDSEED instruction on this type of processor would return 0 and "success" much more often than would randomly be expected, but only on some cores and only under "complex micro-architectural conditions reproducible under memory-load," as a colleague describes it. A mitigating Linux kernel patch was developed to signal that RDSEED was unavailable on these processors, with the intention of rolling it out internally at Meta to avoid problems until a fix came from the OEM. [AMD quickly acknowledged the issue and announced planned mitigation](https://www.amd.com/en/resources/product-security/bulletin/amd-sb-7055.html), including a CPU microcode update.
+
+## With Apologies
+Although I worked to keep the information confidential until the OEM publicly acknowledged the issue, the uncoordinated disclosure via the Linux mailing list was due to zealous remediation efforts that crossed multiple infrastructure teams at Meta. We regret the mistake and are working to improve controls on the processes that failed to coordinate with the OEM first.
+
+## Key Takeaways
+* Test what you depend on.
+* Have redundancies and/or sanity checks for what you depend on.
+* Even CPUs can have bugs, usually flaky individual units but occasionally a bug affecting all units.
diff --git a/docs/static/images/parallel-compression/ring-buffer-architecture.svg b/docs/static/images/parallel-compression/ring-buffer-architecture.svg
new file mode 100644
index 000000000000..75ee489cf243
--- /dev/null
+++ b/docs/static/images/parallel-compression/ring-buffer-architecture.svg
@@ -0,0 +1,136 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 800 500" style="background-color: #fafafa;">
+  <defs>
+    <marker id="arrowhead" markerWidth="10" markerHeight="7"
+     refX="10" refY="3.5" orient="auto">
+      <polygon points="0 0, 10 3.5, 0 7" fill="#333" />
+    </marker>
+    <marker id="red-arrowhead" markerWidth="10" markerHeight="7"
+     refX="10" refY="3.5" orient="auto">
+      <polygon points="0 0, 10 3.5, 0 7" fill="#d32f2f" />
+    </marker>
+    <marker id="blue-arrowhead" markerWidth="10" markerHeight="7"
+     refX="10" refY="3.5" orient="auto">
+      <polygon points="0 0, 10 3.5, 0 7" fill="#1976d2" />
+    </marker>
+    <marker id="green-arrowhead" markerWidth="10" markerHeight="7"
+     refX="10" refY="3.5" orient="auto">
+      <polygon points="0 0, 10 3.5, 0 7" fill="#388e3c" />
+    </marker>
+    <filter id="shadow" x="-20%" y="-20%" width="140%" height="140%">
+      <feDropShadow dx="2" dy="2" stdDeviation="2" flood-color="#00000020"/>
+    </filter>
+  </defs>
+
+  <!-- Title -->
+  <text x="400" y="30" font-family="Arial, sans-serif" font-size="18" font-weight="bold" text-anchor="middle" fill="#333">
+    Ring Buffer Architecture (8 slots shown) for Parallel Compression
+  </text>
+
+  <!-- Ring Buffer Array -->
+  <g transform="translate(100,150)">
+    <!-- Array slots laid out horizontally -->
+
+    <!-- Slot 0 -->
+    <rect x="0" y="0" width="70" height="50" fill="#f5f5f5" stroke="#666" stroke-width="1" rx="5" filter="url(#shadow)"/>
+    <text x="35" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 0</text>
+    <text x="35" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#666">Empty</text>
+
+    <!-- Slot 1 - NextToWrite=1, being written by Worker Thread 2 -->
+    <rect x="75" y="0" width="70" height="50" fill="#e8f5e8" stroke="#388e3c" stroke-width="3" rx="5" filter="url(#shadow)"/>
+    <text x="110" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 1</text>
+    <text x="110" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#388e3c">Writing...</text>
+
+    <!-- Slot 2 - Compressed -->
+    <rect x="150" y="0" width="70" height="50" fill="#fff3e0" stroke="#f57c00" stroke-width="2" rx="5" filter="url(#shadow)"/>
+    <text x="185" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 2</text>
+    <text x="185" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#666">Compressed</text>
+
+    <!-- Slot 3 - Being compressed by Worker Thread 1 -->
+    <rect x="225" y="0" width="70" height="50" fill="#e3f2fd" stroke="#1976d2" stroke-width="3" rx="5" filter="url(#shadow)"/>
+    <text x="260" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 3</text>
+    <text x="260" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#1976d2">Compressing...</text>
+
+    <!-- Slot 4 - NextToCompress=4, uncompressed -->
+    <rect x="300" y="0" width="70" height="50" fill="#ffebee" stroke="#d32f2f" stroke-width="2" rx="5" filter="url(#shadow)"/>
+    <text x="335" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 4</text>
+    <text x="335" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#666">Uncompressed</text>
+
+    <!-- Slot 5 - NextToEmit=5, adding block -->
+    <rect x="375" y="0" width="70" height="50" fill="#e1f5fe" stroke="#4fc3f7" stroke-width="3" rx="5" filter="url(#shadow)"/>
+    <text x="410" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 5</text>
+    <text x="410" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#4fc3f7">Adding block...</text>
+
+    <!-- Slot 6 -->
+    <rect x="450" y="0" width="70" height="50" fill="#f5f5f5" stroke="#666" stroke-width="1" rx="5" filter="url(#shadow)"/>
+    <text x="485" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 6</text>
+    <text x="485" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#666">Empty</text>
+
+    <!-- Slot 7 -->
+    <rect x="525" y="0" width="70" height="50" fill="#f5f5f5" stroke="#666" stroke-width="1" rx="5" filter="url(#shadow)"/>
+    <text x="560" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 7</text>
+    <text x="560" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#666">Empty</text>
+  </g>
+
+  <!-- Arrows pointing to array positions - staggered vertically -->
+  <!-- NextToWrite=1 arrow (red) -->
+  <line x1="210" y1="100" x2="210" y2="145" stroke="#d32f2f" stroke-width="3" marker-end="url(#red-arrowhead)"/>
+  <text x="210" y="90" font-family="Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#d32f2f">NextToWrite=1</text>
+
+  <!-- NextToCompress=4 arrow (blue) -->
+  <line x1="435" y1="110" x2="435" y2="145" stroke="#1976d2" stroke-width="3" marker-end="url(#blue-arrowhead)"/>
+  <text x="435" y="100" font-family="Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#1976d2">NextToCompress=4</text>
+
+  <!-- NextToEmit=5 arrow (light blue) -->
+  <line x1="510" y1="120" x2="510" y2="145" stroke="#4fc3f7" stroke-width="3" marker-end="url(#arrowhead)"/>
+  <text x="510" y="110" font-family="Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#4fc3f7">NextToEmit=5</text>
+
+  <!-- Worker Thread 2 (wider box) -->
+  <g transform="translate(30,250)">
+    <rect x="0" y="0" width="160" height="70" fill="#e8f5e8" stroke="#388e3c" stroke-width="2" rx="8" filter="url(#shadow)"/>
+    <text x="80" y="25" font-family="Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="#333">Worker Thread 2</text>
+    <text x="80" y="40" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Currently writing</text>
+    <text x="80" y="55" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Can also compress</text>
+  </g>
+
+  <!-- Worker Thread 1 (wider box) -->
+  <g transform="translate(240,280)">
+    <rect x="0" y="0" width="160" height="70" fill="#e3f2fd" stroke="#1976d2" stroke-width="2" rx="8" filter="url(#shadow)"/>
+    <text x="80" y="25" font-family="Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="#333">Worker Thread 1</text>
+    <text x="80" y="40" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Currently compressing</text>
+    <text x="80" y="55" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Can also write</text>
+  </g>
+
+  <!-- Emit Thread (wider box) -->
+  <g transform="translate(460,250)">
+    <rect x="0" y="0" width="180" height="70" fill="#e1f5fe" stroke="#4fc3f7" stroke-width="2" rx="8" filter="url(#shadow)"/>
+    <text x="90" y="25" font-family="Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="#333">Emit Thread</text>
+    <text x="90" y="40" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Generates uncompressed blocks</text>
+    <text x="90" y="55" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Can help with compression</text>
+  </g>
+
+  <!-- Arrows from threads to array (no labels) -->
+  <!-- Worker Thread 2 to Slot 1 -->
+  <line x1="190" y1="285" x2="210" y2="210" stroke="#388e3c" stroke-width="2" marker-end="url(#green-arrowhead)"/>
+
+  <!-- Worker Thread 1 to Slot 3 -->
+  <line x1="320" y1="280" x2="360" y2="210" stroke="#1976d2" stroke-width="2" marker-end="url(#blue-arrowhead)"/>
+
+  <!-- Emit Thread to Slot 5 -->
+  <line x1="550" y1="250" x2="510" y2="210" stroke="#4fc3f7" stroke-width="2" marker-end="url(#arrowhead)"/>
+
+  <!-- SST File Output (centered below Worker Thread 2) -->
+  <g transform="translate(50,380)">
+    <rect x="0" y="0" width="120" height="40" fill="#e8f5e8" stroke="#388e3c" stroke-width="2" rx="5" filter="url(#shadow)"/>
+    <text x="60" y="25" font-family="Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#333">SST File</text>
+  </g>
+
+  <!-- Arrow from Worker Thread 2 to SST file -->
+  <line x1="110" y1="320" x2="110" y2="380" stroke="#388e3c" stroke-width="2" marker-end="url(#green-arrowhead)"/>
+
+  <!-- Invariant (moved to the right) -->
+  <g transform="translate(400,430)">
+    <text x="0" y="0" font-family="Arial, sans-serif" font-size="12" font-weight="bold" fill="#333">Invariant:</text>
+    <text x="0" y="20" font-family="Arial, sans-serif" font-size="11" fill="#666">NextToWrite ≤ NextToCompress ≤ NextToEmit (modulo ring buffer size)</text>
+  </g>
+</svg>
diff --git a/env/composite_env.cc b/env/composite_env.cc
index 59434785ced5..a0a4d9edf66d 100644
--- a/env/composite_env.cc
+++ b/env/composite_env.cc
@@ -100,6 +100,10 @@ class CompositeRandomAccessFileWrapper : public RandomAccessFile {
     return target_->InvalidateCache(offset, length);
   }
 
+  Status GetFileSize(uint64_t* size) override {
+    return target_->GetFileSize(size);
+  }
+
  private:
   std::unique_ptr<FSRandomAccessFile> target_;
 };
diff --git a/env/env.cc b/env/env.cc
index 683771e72360..80d65cced3a5 100644
--- a/env/env.cc
+++ b/env/env.cc
@@ -9,6 +9,7 @@
 
 #include "rocksdb/env.h"
 
+#include <sstream>
 #include <thread>
 
 #include "env/composite_env_wrapper.h"
@@ -26,6 +27,7 @@
 #include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/options_type.h"
 #include "util/autovector.h"
+#include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 namespace {
@@ -186,6 +188,10 @@ class LegacyRandomAccessFileWrapper : public FSRandomAccessFile {
   IOStatus InvalidateCache(size_t offset, size_t length) override {
     return status_to_io_status(target_->InvalidateCache(offset, length));
   }
+  IOStatus GetFileSize(uint64_t* result) override {
+    auto status = target_->GetFileSize(result);
+    return status_to_io_status(std::move(status));
+  }
 
  private:
   std::unique_ptr<RandomAccessFile> target_;
@@ -732,6 +738,48 @@ std::string Env::PriorityToString(Env::Priority priority) {
   return "Invalid";
 }
 
+std::string Env::IOActivityToString(IOActivity activity) {
+  switch (activity) {
+    case Env::IOActivity::kFlush:
+      return "Flush";
+    case Env::IOActivity::kCompaction:
+      return "Compaction";
+    case Env::IOActivity::kDBOpen:
+      return "DBOpen";
+    case Env::IOActivity::kGet:
+      return "Get";
+    case Env::IOActivity::kMultiGet:
+      return "MultiGet";
+    case Env::IOActivity::kDBIterator:
+      return "DBIterator";
+    case Env::IOActivity::kVerifyDBChecksum:
+      return "VerifyDBChecksum";
+    case Env::IOActivity::kVerifyFileChecksums:
+      return "VerifyFileChecksums";
+    case Env::IOActivity::kGetEntity:
+      return "GetEntity";
+    case Env::IOActivity::kMultiGetEntity:
+      return "MultiGetEntity";
+    case Env::IOActivity::kGetFileChecksumsFromCurrentManifest:
+      return "GetFileChecksumsFromCurrentManifest";
+    case Env::IOActivity::kUnknown:
+      return "Unknown";
+    default:
+      int activityIndex = static_cast<int>(activity);
+      if (activityIndex >=
+              static_cast<int>(Env::IOActivity::kFirstCustomIOActivity) &&
+          activityIndex <=
+              static_cast<int>(Env::IOActivity::kLastCustomIOActivity)) {
+        std::stringstream ss;
+        ss << std::hex << std::uppercase << activityIndex;
+        return "CustomIOActivity" + ss.str();
+      }
+      return "Invalid";
+  };
+  assert(false);
+  return "Invalid";
+}
+
 uint64_t Env::GetThreadID() const {
   std::hash<std::thread::id> hasher;
   return hasher(std::this_thread::get_id());
diff --git a/env/env_encryption.cc b/env/env_encryption.cc
index 16a3c32819f0..9565b9d9bc90 100644
--- a/env/env_encryption.cc
+++ b/env/env_encryption.cc
@@ -665,17 +665,52 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem {
                               std::unique_ptr<FSWritableFile>* result,
                               IODebugContext* dbg) override {
     result->reset();
-    if (options.use_mmap_writes) {
+    if (options.use_mmap_reads || options.use_mmap_writes) {
       return IOStatus::InvalidArgument();
     }
+
+    size_t prefix_length = 0;
+    std::unique_ptr<BlockAccessCipherStream> stream;
+
     // Open file using underlying Env implementation
     std::unique_ptr<FSWritableFile> underlying;
-    IOStatus status =
+    auto status =
         FileSystemWrapper::ReopenWritableFile(fname, options, &underlying, dbg);
     if (!status.ok()) {
       return status;
     }
-    return CreateWritableEncryptedFile(fname, underlying, options, result, dbg);
+
+    if (underlying->GetFileSize(options.io_options, dbg) != 0) {
+      // read the cipher stream from file for non-empty file
+      std::unique_ptr<FSRandomAccessFile> underlying_file_reader;
+      status = FileSystemWrapper::NewRandomAccessFile(
+          fname, options, &underlying_file_reader, dbg);
+      if (!status.ok()) {
+        return status;
+      }
+
+      status = CreateRandomReadCipherStream(
+          fname, underlying_file_reader, options, &prefix_length, &stream, dbg);
+
+      if (!status.ok()) {
+        return status;
+      }
+    } else {
+      // create cipher stream for new or empty file
+      status = CreateWritableCipherStream(fname, underlying, options,
+                                          &prefix_length, &stream, dbg);
+      if (!status.ok()) {
+        return status;
+      }
+    }
+
+    if (stream) {
+      result->reset(new EncryptedWritableFile(
+          std::move(underlying), std::move(stream), prefix_length));
+    } else {
+      result->reset(underlying.release());
+    }
+    return status;
   }
 
   IOStatus ReuseWritableFile(const std::string& fname,
diff --git a/env/env_posix.cc b/env/env_posix.cc
index 8b24a7a27888..86a7741f0f34 100644
--- a/env/env_posix.cc
+++ b/env/env_posix.cc
@@ -169,8 +169,9 @@ class PosixClock : public SystemClock {
     struct timespec ts;
     clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
     return (static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000;
-#endif
+#else
     return 0;
+#endif
   }
 
   uint64_t CPUNanos() override {
@@ -179,8 +180,9 @@ class PosixClock : public SystemClock {
     struct timespec ts;
     clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
     return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
-#endif
+#else
     return 0;
+#endif
   }
 
   void SleepForMicroseconds(int micros) override { usleep(micros); }
diff --git a/env/env_test.cc b/env/env_test.cc
index e89f48531dc1..4c0939ecffa4 100644
--- a/env/env_test.cc
+++ b/env/env_test.cc
@@ -41,6 +41,9 @@
 #include "env/env_chroot.h"
 #include "env/env_encryption_ctr.h"
 #include "env/fs_readonly.h"
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include "env/io_posix.h"
+#endif
 #include "env/mock_env.h"
 #include "env/unique_id_gen.h"
 #include "logging/log_buffer.h"
@@ -1655,42 +1658,6 @@ void GenerateFilesAndRequest(Env* env, const std::string& fname,
   }
 }
 
-TEST_F(EnvPosixTest, MultiReadIOUringError) {
-  // In this test we don't do aligned read, so we can't do direct I/O.
-  EnvOptions soptions;
-  soptions.use_direct_reads = soptions.use_direct_writes = false;
-  std::string fname = test::PerThreadDBPath(env_, "testfile");
-
-  std::vector<std::string> scratches;
-  std::vector<ReadRequest> reqs;
-  GenerateFilesAndRequest(env_, fname, &reqs, &scratches);
-  // Query the data
-  std::unique_ptr<RandomAccessFile> file;
-  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
-
-  bool io_uring_wait_cqe_called = false;
-  SyncPoint::GetInstance()->SetCallBack(
-      "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return",
-      [&](void* arg) {
-        if (!io_uring_wait_cqe_called) {
-          io_uring_wait_cqe_called = true;
-          ssize_t& ret = *(static_cast<ssize_t*>(arg));
-          ret = 1;
-        }
-      });
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  Status s = file->MultiRead(reqs.data(), reqs.size());
-  if (io_uring_wait_cqe_called) {
-    ASSERT_NOK(s);
-  } else {
-    s.PermitUncheckedError();
-  }
-
-  SyncPoint::GetInstance()->DisableProcessing();
-  SyncPoint::GetInstance()->ClearAllCallBacks();
-}
-
 TEST_F(EnvPosixTest, MultiReadIOUringError2) {
   // In this test we don't do aligned read, so we can't do direct I/O.
   EnvOptions soptions;
@@ -1706,19 +1673,20 @@ TEST_F(EnvPosixTest, MultiReadIOUringError2) {
 
   bool io_uring_submit_and_wait_called = false;
   SyncPoint::GetInstance()->SetCallBack(
-      "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
+      "PosixRandomAccessFile::MultiRead:io_uring_sq_ready:return1",
       [&](void* arg) {
         io_uring_submit_and_wait_called = true;
-        ssize_t* ret = static_cast<ssize_t*>(arg);
-        (*ret)--;
+        unsigned* ret = static_cast<unsigned*>(arg);
+        *ret = 1;
       });
   SyncPoint::GetInstance()->SetCallBack(
       "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
       [&](void* arg) {
         struct io_uring* iu = static_cast<struct io_uring*>(arg);
         struct io_uring_cqe* cqe;
-        assert(io_uring_wait_cqe(iu, &cqe) == 0);
-        io_uring_cqe_seen(iu, cqe);
+        // CQ should be empty after drain - peek should fail
+        int ret = io_uring_peek_cqe(iu, &cqe);
+        assert(-EAGAIN == ret);  // No CQEs available
       });
   SyncPoint::GetInstance()->EnableProcessing();
 
@@ -2540,7 +2508,7 @@ TEST_P(EnvFSTestWithParam, OptionsTest) {
     }
   }
   for (int i = 0; i < 2; ++i) {
-    DB* db;
+    std::unique_ptr<DB> db;
     Status s = DB::Open(opts, dbname, &db);
     ASSERT_OK(s);
 
@@ -2558,7 +2526,7 @@ TEST_P(EnvFSTestWithParam, OptionsTest) {
     ASSERT_EQ("b", val);
 
     ASSERT_OK(db->Close());
-    delete db;
+    db.reset();
     ASSERT_OK(DestroyDB(dbname, opts));
 
     dbname = dbname2_;
@@ -3467,7 +3435,6 @@ class ReadAsyncRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
 
  private:
   ReadAsyncFS& fs_;
-  std::unique_ptr<FSRandomAccessFile> file_;
   int counter = 0;
 };
 
@@ -3641,6 +3608,486 @@ TEST_F(TestAsyncRead, ReadAsync) {
   }
 }
 
+// Test ReadAsync -> MultiRead -> Poll with real io_uring (not mock).
+// This verifies that MultiRead doesn't interfere with async read buffers.
+TEST_F(TestAsyncRead, InterleavingIOUringOperations) {
+#if defined(ROCKSDB_IOURING_PRESENT)
+  // Use the real filesystem directly (not the mock ReadAsyncFS).
+  std::shared_ptr<FileSystem> fs = env_->GetFileSystem();
+  std::string fname = test::PerThreadDBPath(env_, "testfile_iouring");
+
+  constexpr size_t kSectorSize = 4096;
+  constexpr size_t kNumSectors = 8;
+
+  // 1. Create & write to a file.
+  {
+    std::unique_ptr<FSWritableFile> wfile;
+    ASSERT_OK(
+        fs->NewWritableFile(fname, FileOptions(), &wfile, nullptr /*dbg*/));
+
+    for (size_t i = 0; i < kNumSectors; ++i) {
+      auto data = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
+      Slice slice(data.get(), kSectorSize);
+      ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr));
+    }
+    ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+  }
+
+  // 2. Test interleaved ReadAsync and MultiRead operations.
+  {
+    std::unique_ptr<FSRandomAccessFile> file;
+    ASSERT_OK(fs->NewRandomAccessFile(fname, FileOptions(), &file, nullptr));
+
+    IOOptions opts;
+    std::vector<void*> io_handles(kNumSectors);
+    std::vector<FSReadRequest> async_reqs(kNumSectors);
+    std::vector<std::unique_ptr<char, Deleter>> async_data;
+    std::vector<size_t> vals;
+    IOHandleDeleter del_fn;
+
+    // Initialize async read requests.
+    for (size_t i = 0; i < kNumSectors; i++) {
+      async_reqs[i].offset = i * kSectorSize;
+      async_reqs[i].len = kSectorSize;
+      async_data.emplace_back(NewAligned(kSectorSize, 0));
+      async_reqs[i].scratch = async_data.back().get();
+      vals.push_back(i);
+    }
+
+    // Callback function for async reads.
+    std::function<void(FSReadRequest&, void*)> callback =
+        [&](FSReadRequest& req, void* cb_arg) {
+          assert(cb_arg != nullptr);
+          size_t i = *(reinterpret_cast<size_t*>(cb_arg));
+          async_reqs[i].offset = req.offset;
+          async_reqs[i].result = req.result;
+          async_reqs[i].status = req.status;
+        };
+
+    // Submit asynchronous read requests.
+    for (size_t i = 0; i < kNumSectors; i++) {
+      void* cb_arg = static_cast<void*>(&(vals[i]));
+      IOStatus s = file->ReadAsync(async_reqs[i], opts, callback, cb_arg,
+                                   &(io_handles[i]), &del_fn, nullptr);
+      if (s.IsNotSupported()) {
+        // io_uring not supported on this system, skip the test.
+        fprintf(stderr, "Skipping test - io_uring not supported: %s\n",
+                s.ToString().c_str());
+        for (size_t j = 0; j < i; j++) {
+          if (io_handles[j] != nullptr) {
+            del_fn(io_handles[j]);
+          }
+        }
+        return;
+      }
+      // For any other error, fail the test.
+      ASSERT_OK(s);
+    }
+
+    // Do a MultiRead on same sectors while async reads are submitted.
+    std::vector<FSReadRequest> multi_reqs(kNumSectors);
+    std::vector<std::unique_ptr<char, Deleter>> multi_data;
+    for (size_t i = 0; i < kNumSectors; i++) {
+      multi_reqs[i].offset = i * kSectorSize;
+      multi_reqs[i].len = kSectorSize;
+      multi_data.emplace_back(NewAligned(kSectorSize, 0));
+      multi_reqs[i].scratch = multi_data.back().get();
+    }
+    ASSERT_OK(file->MultiRead(multi_reqs.data(), kNumSectors, opts, nullptr));
+
+    // Check the status of MultiRead requests (should all succeed).
+    for (size_t i = 0; i < kNumSectors; i++) {
+      auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
+      Slice expected_data(buf.get(), kSectorSize);
+
+      ASSERT_EQ(multi_reqs[i].offset, i * kSectorSize);
+      ASSERT_OK(multi_reqs[i].status);
+      ASSERT_EQ(expected_data.ToString(), multi_reqs[i].result.ToString());
+    }
+
+    // Poll for the submitted async requests.
+    ASSERT_OK(fs->Poll(io_handles, kNumSectors));
+
+    // Check the status of async read requests (should all succeed).
+    for (size_t i = 0; i < kNumSectors; i++) {
+      auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
+      Slice expected_data(buf.get(), kSectorSize);
+
+      ASSERT_EQ(async_reqs[i].offset, i * kSectorSize);
+      ASSERT_OK(async_reqs[i].status);
+      ASSERT_EQ(expected_data.ToString(), async_reqs[i].result.ToString());
+    }
+
+    // Delete io_handles.
+    for (size_t i = 0; i < io_handles.size(); i++) {
+      del_fn(io_handles[i]);
+    }
+  }
+#else
+  fprintf(stderr, "Skipping test - ROCKSDB_IOURING_PRESENT not defined\n");
+#endif
+}
+
+// Helper function to run AbortIO test with parameterized read requests.
+// Each request is specified as {offset, length}.
+// use_direct_io: if true, opens the file with O_DIRECT to bypass page cache.
+// iterations: number of times to repeat the test (useful for race conditions).
+void TestAbortIOWithRequests(
+    Env* env, size_t file_size,
+    const std::vector<std::pair<uint64_t, size_t>>& read_specs,
+    bool use_direct_io = false, int iterations = 1) {
+#if defined(ROCKSDB_IOURING_PRESENT)
+  fprintf(stderr,
+          "TestAbortIOWithRequests: file_size=%zu, num_reads=%zu, "
+          "direct_io=%d, iterations=%d\n",
+          file_size, read_specs.size(), use_direct_io, iterations);
+  std::shared_ptr<FileSystem> fs = env->GetFileSystem();
+  std::string fname = test::PerThreadDBPath(env, "testfile_abortio");
+
+  // 1. Create test file once (content doesn't change between iterations)
+  {
+    std::unique_ptr<FSWritableFile> wfile;
+    FileOptions file_opts;
+    file_opts.use_direct_writes = true;
+    ASSERT_OK(fs->NewWritableFile(fname, file_opts, &wfile, nullptr));
+
+    // Query the file's required buffer alignment (logical block size)
+    // instead of hardcoding 4096, to support devices with different
+    // sector sizes.
+    size_t sector_size = wfile->GetRequiredBufferAlignment();
+
+    // Round up to full sectors for direct IO writes
+    size_t num_sectors = (file_size + sector_size - 1) / sector_size;
+    for (size_t i = 0; i < num_sectors; ++i) {
+      auto data = NewAligned(sector_size, static_cast<char>(i + 1));
+      Slice slice(data.get(), sector_size);
+      ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr));
+    }
+
+    // Truncate to exact file size if not aligned to sector boundary
+    if (file_size % sector_size != 0) {
+      ASSERT_OK(wfile->Truncate(file_size, IOOptions(), nullptr));
+    }
+
+    ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+  }
+
+  for (int iter = 0; iter < iterations; iter++) {
+    // 2. Submit ReadAsync requests and immediately abort
+    {
+      FileOptions file_opts;
+      file_opts.use_direct_reads = use_direct_io;
+      std::unique_ptr<FSRandomAccessFile> file;
+      ASSERT_OK(fs->NewRandomAccessFile(fname, file_opts, &file, nullptr));
+
+      const size_t num_reads = read_specs.size();
+      IOOptions opts;
+      std::vector<void*> io_handles(num_reads);
+      std::vector<FSReadRequest> reqs(num_reads);
+      std::vector<std::unique_ptr<char, Deleter>> data;
+      std::vector<size_t> vals;
+      IOHandleDeleter del_fn;
+      std::atomic<int> callbacks_invoked{0};
+
+      // Initialize read requests from specs
+      for (size_t i = 0; i < num_reads; i++) {
+        reqs[i].offset = read_specs[i].first;
+        reqs[i].len = read_specs[i].second;
+        data.emplace_back(NewAligned(reqs[i].len, 0));
+        reqs[i].scratch = data.back().get();
+        vals.push_back(i);
+      }
+
+      // Callback
+      std::function<void(FSReadRequest&, void*)> callback =
+          [&](FSReadRequest& req, void* cb_arg) {
+            size_t i = *(reinterpret_cast<size_t*>(cb_arg));
+            reqs[i].status = req.status;
+            callbacks_invoked++;
+          };
+
+      // Submit all ReadAsync requests
+      for (size_t i = 0; i < num_reads; i++) {
+        void* cb_arg = static_cast<void*>(&(vals[i]));
+        IOStatus s = file->ReadAsync(reqs[i], opts, callback, cb_arg,
+                                     &(io_handles[i]), &del_fn, nullptr);
+        if (s.IsNotSupported()) {
+          // io_uring not supported, clean up and skip
+          fprintf(stderr,
+                  "WARNING: io_uring not supported, skipping test: %s\n",
+                  s.ToString().c_str());
+          for (size_t j = 0; j < i; j++) {
+            if (io_handles[j]) {
+              del_fn(io_handles[j]);
+            }
+          }
+          ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr));
+          return;
+        }
+        ASSERT_OK(s);
+      }
+
+      // Immediately call AbortIO - this should NOT hang
+      ASSERT_OK(fs->AbortIO(io_handles));
+
+      // Verify all handles are finished and all callbacks were invoked.
+      // Since all handles are passed to AbortIO, every handle is guaranteed
+      // to be finalized (either completed or cancelled).
+      for (size_t i = 0; i < num_reads; i++) {
+        Posix_IOHandle* h = static_cast<Posix_IOHandle*>(io_handles[i]);
+        ASSERT_TRUE(h->is_finished);
+      }
+      ASSERT_EQ(callbacks_invoked.load(), static_cast<int>(num_reads));
+
+      // Clean up handles
+      for (size_t i = 0; i < num_reads; i++) {
+        if (io_handles[i]) {
+          del_fn(io_handles[i]);
+        }
+      }
+    }
+  }
+
+  ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr));
+
+  fprintf(stderr, "TestAbortIOWithRequests: completed %d iterations\n",
+          iterations);
+#else
+  fprintf(stderr,
+          "TestAbortIOWithRequests: SKIPPED (ROCKSDB_IOURING_PRESENT not "
+          "defined)\n");
+  (void)env;
+  (void)file_size;
+  (void)read_specs;
+  (void)use_direct_io;
+  (void)iterations;
+#endif
+}
+
+// Test overlapping reads at aligned offsets (multiples of 4KB)
+TEST_F(TestAsyncRead, AbortIOOverlappingAligned) {
+  // 4 reads of 16KB each, overlapping by 8KB, all at 4KB-aligned offsets
+  // Read 0: [0, 16KB), Read 1: [8KB, 24KB), Read 2: [16KB, 32KB), Read 3:
+  // [24KB, 40KB)
+  std::vector<std::pair<uint64_t, size_t>> specs = {
+      {0, 16384},
+      {8192, 16384},
+      {16384, 16384},
+      {24576, 16384},
+  };
+  TestAbortIOWithRequests(env_, 64 * 1024, specs);
+}
+
+// Test reads at unaligned offsets (not multiples of 4KB)
+TEST_F(TestAsyncRead, AbortIOUnalignedOffsets) {
+  // Reads starting at non-4KB-aligned offsets
+  std::vector<std::pair<uint64_t, size_t>> specs = {
+      {1000, 8192},    // starts at 1000 (unaligned)
+      {5000, 12288},   // starts at 5000 (unaligned), spans multiple sectors
+      {15000, 8192},   // starts at 15000 (unaligned)
+      {25500, 16384},  // starts at 25500 (unaligned)
+  };
+  TestAbortIOWithRequests(env_, 64 * 1024, specs);
+}
+
+// Test mix of aligned and unaligned, various sizes
+TEST_F(TestAsyncRead, AbortIOMixedOffsets) {
+  std::vector<std::pair<uint64_t, size_t>> specs = {
+      {0, 4096},       // aligned, 1 sector
+      {1500, 8192},    // unaligned, 2 sectors
+      {4096, 20480},   // aligned, 5 sectors
+      {7000, 4096},    // unaligned, spans 2 sectors
+      {16384, 32768},  // aligned, 8 sectors
+      {50000, 8192},   // unaligned
+  };
+  TestAbortIOWithRequests(env_, 128 * 1024, specs);
+}
+
+// Stress test with many concurrent handles
+TEST_F(TestAsyncRead, AbortIOStress) {
+  std::vector<std::pair<uint64_t, size_t>> specs;
+  // 16 overlapping reads with mixed alignment
+  for (int i = 0; i < 16; i++) {
+    uint64_t offset = i * 4000;          // Not aligned to 4KB
+    size_t len = 8192 + (i % 4) * 4096;  // 8KB to 20KB
+    specs.emplace_back(offset, len);
+  }
+  TestAbortIOWithRequests(env_, 256 * 1024, specs);
+}
+
+// Regression test for a fixed bug in AbortIO where out-of-order io_uring
+// completions could cause an infinite hang. The bug occurred when completions
+// for a different handle arrived while waiting for the current handle - the
+// code would consume those completions but not mark the handle as finished,
+// causing a hang when later iterating to that handle.
+//
+// Uses a large read (1MB) followed by a small read (4KB) with Direct I/O to
+// maximize the chance of out-of-order completions. Runs 100 iterations to
+// increase the likelihood of triggering the race condition.
+TEST_F(TestAsyncRead, AbortIOReversedHandles) {
+  // Request 0: LARGE (1MB) at offset 0
+  // Request 1: SMALL (4KB) at offset 1MB
+  std::vector<std::pair<uint64_t, size_t>> specs = {
+      {0, 1024 * 1024},     // 1MB read
+      {1024 * 1024, 4096},  // 4KB read at 1MB offset
+  };
+  // 2MB file, Direct I/O enabled, 100 iterations
+  TestAbortIOWithRequests(env_, 2 * 1024 * 1024, specs,
+                          /*use_direct_io=*/true, /*iterations=*/100);
+}
+
+// Test for bug fix: AbortIO with partial handles should correctly handle
+// completions for non-aborted handles.
+//
+// Previously, AbortIO would consume completions for non-aborted handles but
+// not set is_finished (since it expected req_count==2 for all handles).
+// This caused subsequent Poll calls to hang forever.
+//
+// The fix correctly detects handles not in the abort set and finalizes them
+// immediately when their completion arrives (at req_count==1).
+TEST_F(TestAsyncRead, AbortIOPartialHandlesBug) {
+#if defined(ROCKSDB_IOURING_PRESENT)
+  std::shared_ptr<FileSystem> fs = env_->GetFileSystem();
+  std::string fname = test::PerThreadDBPath(env_, "testfile_abortio_partial");
+
+  constexpr size_t kSectorSize = 4096;
+  constexpr size_t kFileSize = 2 * 1024 * 1024;  // 2MB
+
+  // 1. Create test file with direct I/O
+  {
+    std::unique_ptr<FSWritableFile> wfile;
+    FileOptions file_opts;
+    file_opts.use_direct_writes = true;
+    ASSERT_OK(fs->NewWritableFile(fname, file_opts, &wfile, nullptr));
+
+    size_t num_sectors = kFileSize / kSectorSize;
+    for (size_t i = 0; i < num_sectors; ++i) {
+      auto data = NewAligned(kSectorSize, static_cast<char>(i + 1));
+      Slice slice(data.get(), kSectorSize);
+      ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr));
+    }
+    ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+  }
+
+  // 2. Submit 3 ReadAsync requests, abort only the first one, then Poll the
+  // rest
+  {
+    FileOptions file_opts;
+    file_opts.use_direct_reads = true;
+    std::unique_ptr<FSRandomAccessFile> file;
+    ASSERT_OK(fs->NewRandomAccessFile(fname, file_opts, &file, nullptr));
+
+    IOOptions opts;
+    constexpr size_t kNumReads = 3;
+    std::vector<void*> io_handles(kNumReads);
+    std::vector<FSReadRequest> reqs(kNumReads);
+    std::vector<std::unique_ptr<char, Deleter>> data;
+    std::vector<size_t> vals;
+    IOHandleDeleter del_fn;
+    std::atomic<int> callbacks_invoked{0};
+
+    // H0: 1MB read, H1: 4KB read, H2: 4KB read
+    std::vector<std::pair<uint64_t, size_t>> read_specs = {
+        {0, 1024 * 1024},            // H0: 1MB at offset 0
+        {1024 * 1024, 4096},         // H1: 4KB at offset 1MB
+        {1024 * 1024 + 4096, 4096},  // H2: 4KB at offset 1MB+4KB
+    };
+
+    for (size_t i = 0; i < kNumReads; i++) {
+      reqs[i].offset = read_specs[i].first;
+      reqs[i].len = read_specs[i].second;
+      data.emplace_back(NewAligned(reqs[i].len, 0));
+      reqs[i].scratch = data.back().get();
+      vals.push_back(i);
+    }
+
+    std::function<void(FSReadRequest&, void*)> callback =
+        [&](FSReadRequest& req, void* cb_arg) {
+          size_t i = *(reinterpret_cast<size_t*>(cb_arg));
+          reqs[i].status = req.status;
+          callbacks_invoked++;
+        };
+
+    // Submit all ReadAsync requests
+    for (size_t i = 0; i < kNumReads; i++) {
+      void* cb_arg = static_cast<void*>(&(vals[i]));
+      IOStatus s = file->ReadAsync(reqs[i], opts, callback, cb_arg,
+                                   &(io_handles[i]), &del_fn, nullptr);
+      if (s.IsNotSupported()) {
+        // io_uring not supported, clean up and skip
+        for (size_t j = 0; j < i; j++) {
+          if (io_handles[j]) {
+            del_fn(io_handles[j]);
+          }
+        }
+        ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr));
+        return;
+      }
+      ASSERT_OK(s);
+    }
+
+    // Wait for reads to complete in io_uring (completions in queue but not
+    // consumed). 5 seconds should be plenty for direct I/O reads to complete.
+    std::this_thread::sleep_for(std::chrono::seconds(5));
+
+    // Abort ONLY H0 - this will consume all completions but should correctly
+    // finalize H1 and H2 (since they're not in the abort set).
+    std::vector<void*> abort_handles = {io_handles[0]};
+    ASSERT_OK(fs->AbortIO(abort_handles));
+
+    // Verify H0 is finished (aborted)
+    Posix_IOHandle* h0 = static_cast<Posix_IOHandle*>(io_handles[0]);
+    ASSERT_TRUE(h0->is_finished);
+    ASSERT_EQ(h0->req_count, 2u);  // original + cancel
+
+    // Note: H1 and H2 may or may not be finished at this point. AbortIO
+    // finalizes non-aborted handles whose CQEs arrive while waiting for
+    // aborted handles, but CQE ordering is non-deterministic. If H0's
+    // completions arrived first, H1/H2's CQEs are still in the queue.
+    // Poll handles either case correctly.
+
+    // Poll on H1, H2 - completes them if not already finalized by AbortIO
+    std::vector<void*> poll_handles = {io_handles[1], io_handles[2]};
+
+    // Use a watchdog to detect hang (regression test for the original bug
+    // where AbortIO consumed non-aborted CQEs without finalizing them)
+    std::atomic<bool> poll_completed{false};
+    std::thread watchdog([&]() {
+      for (int i = 0; i < 500; i++) {  // 5 seconds timeout
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        if (poll_completed) return;
+      }
+      // Bug regression: Poll hung
+      _exit(1);
+    });
+
+    fs->Poll(poll_handles, poll_handles.size());
+    poll_completed = true;
+    watchdog.join();
+
+    // After Poll, H1 and H2 must be finished
+    Posix_IOHandle* h1 = static_cast<Posix_IOHandle*>(io_handles[1]);
+    Posix_IOHandle* h2 = static_cast<Posix_IOHandle*>(io_handles[2]);
+    ASSERT_TRUE(h1->is_finished);
+    ASSERT_TRUE(h2->is_finished);
+
+    // Verify all callbacks were invoked
+    ASSERT_EQ(callbacks_invoked.load(), 3);
+
+    // Clean up handles
+    for (size_t i = 0; i < kNumReads; i++) {
+      if (io_handles[i]) {
+        del_fn(io_handles[i]);
+      }
+    }
+  }
+
+  ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr));
+#else
+  (void)env_;  // Suppress unused variable warning
+#endif
+}
+
 struct StaticDestructionTester {
   bool activated = false;
   ~StaticDestructionTester() {
@@ -3657,6 +4104,60 @@ TEST(EnvTestMisc, StaticDestruction) {
   static_destruction_tester.activated = true;
 }
 
+// Test GetFileSize API
+class TestGetFileSize : public testing::Test {
+ public:
+  TestGetFileSize() { env_ = Env::Default(); }
+  Env* env_;
+};
+
+// Validate GetFileSize API returns the right value.
+// Use the default implementation from env
+TEST_F(TestGetFileSize, GetFileSize) {
+  EnvOptions soptions;
+  auto fs = env_->GetFileSystem();
+
+  std::string fname = test::PerThreadDBPath(env_, "getFileSizeTestfile");
+
+  // randomize file size
+  auto rnd = Random::GetTLSInstance();
+  auto expectedFileSize = rnd->Uniform(256 * 1024) + 1;
+  auto content = rnd->RandomBinaryString(static_cast<int>(expectedFileSize));
+
+  ASSERT_OK(CreateFile(fs.get(), fname, content, false));
+
+  std::unique_ptr<FSRandomAccessFile> file;
+  ASSERT_OK(fs->NewRandomAccessFile(fname, FileOptions(), &file, nullptr));
+
+  uint64_t fileSizeFromFileSystemAPI;
+  ASSERT_OK(
+      fs->GetFileSize(fname, IOOptions(), &fileSizeFromFileSystemAPI, nullptr));
+  ASSERT_EQ(fileSizeFromFileSystemAPI, expectedFileSize);
+
+  uint64_t fileSizeFromFsRandomAccessFileAPI;
+  ASSERT_OK(file->GetFileSize(&fileSizeFromFsRandomAccessFileAPI));
+
+  ASSERT_EQ(fileSizeFromFsRandomAccessFileAPI, expectedFileSize);
+}
+
+class TestIOActivity : public testing::Test {
+ public:
+  TestIOActivity() {}
+};
+
+TEST_F(TestIOActivity, IOActivityToString) {
+  ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kMultiGet), "MultiGet");
+
+  ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kCustomIOActivity80),
+            "CustomIOActivity80");
+  ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kCustomIOActivityA9),
+            "CustomIOActivityA9");
+  ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kCustomIOActivityFE),
+            "CustomIOActivityFE");
+
+  ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kUnknown), "Unknown");
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/env/file_system_tracer.cc b/env/file_system_tracer.cc
index dc44107b58c9..46fe4ce7491b 100644
--- a/env/file_system_tracer.cc
+++ b/env/file_system_tracer.cc
@@ -355,9 +355,11 @@ IOStatus FSRandomAccessFileTracingWrapper::ReadAsync(
   IOStatus s = target()->ReadAsync(req, opts, read_async_callback,
                                    read_async_cb_info, io_handle, del_fn, dbg);
 
+#ifndef __clang_analyzer__
   if (!s.ok()) {
     delete read_async_cb_info;
   }
+#endif  // __clang_analyzer__
   return s;
 }
 
diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index 82fb9fba337b..14b34ca6920d 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -243,7 +243,7 @@ class PosixFileSystem : public FileSystem {
       // Use mmap when virtual address-space is plentiful.
       uint64_t size;
       IOOptions opts;
-      s = GetFileSize(fname, opts, &size, nullptr);
+      s = GetFileSizeOnOpenedFile(fd, fname, &size);
       if (s.ok()) {
         void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
         if (base != MAP_FAILED) {
@@ -270,7 +270,10 @@ class PosixFileSystem : public FileSystem {
           options
 #if defined(ROCKSDB_IOURING_PRESENT)
           ,
-          !IsIOUringEnabled() ? nullptr : thread_local_io_urings_.get()
+          !IsIOUringEnabled() ? nullptr
+                              : thread_local_async_read_io_urings_.get(),
+          !IsIOUringEnabled() ? nullptr
+                              : thread_local_multi_read_io_urings_.get()
 #endif
               ));
     }
@@ -322,8 +325,17 @@ class PosixFileSystem : public FileSystem {
     if (options.use_mmap_writes) {
       MaybeForceDisableMmap(fd);
     }
+    uint64_t initial_file_size = 0;
+    if (reopen) {
+      s = GetFileSizeOnOpenedFile(fd, fname, &initial_file_size);
+      if (!s.ok()) {
+        close(fd);
+        return s;
+      }
+    }
     if (options.use_mmap_writes && !forceMmapOff_) {
-      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
+      result->reset(
+          new PosixMmapFile(fname, fd, page_size_, options, initial_file_size));
     } else if (options.use_direct_writes && !options.use_mmap_writes) {
 #ifdef OS_MACOSX
       if (fcntl(fd, F_NOCACHE, 1) == -1) {
@@ -343,7 +355,7 @@ class PosixFileSystem : public FileSystem {
 #endif
       result->reset(new PosixWritableFile(
           fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
-          options));
+          options, initial_file_size));
     } else {
       // disable mmap writes
       EnvOptions no_mmap_writes_options = options;
@@ -352,7 +364,7 @@ class PosixFileSystem : public FileSystem {
           new PosixWritableFile(fname, fd,
                                 GetLogicalBlockSizeForWriteIfNeeded(
                                     no_mmap_writes_options, fname, fd),
-                                no_mmap_writes_options));
+                                no_mmap_writes_options, initial_file_size));
     }
     return s;
   }
@@ -418,7 +430,8 @@ class PosixFileSystem : public FileSystem {
       MaybeForceDisableMmap(fd);
     }
     if (options.use_mmap_writes && !forceMmapOff_) {
-      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
+      result->reset(new PosixMmapFile(fname, fd, page_size_, options,
+                                      /*initial_file_size=*/0));
     } else if (options.use_direct_writes && !options.use_mmap_writes) {
 #ifdef OS_MACOSX
       if (fcntl(fd, F_NOCACHE, 1) == -1) {
@@ -438,16 +451,16 @@ class PosixFileSystem : public FileSystem {
 #endif
       result->reset(new PosixWritableFile(
           fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
-          options));
+          options, /*initial_file_size=*/0));
     } else {
       // disable mmap writes
       FileOptions no_mmap_writes_options = options;
       no_mmap_writes_options.use_mmap_writes = false;
-      result->reset(
-          new PosixWritableFile(fname, fd,
-                                GetLogicalBlockSizeForWriteIfNeeded(
-                                    no_mmap_writes_options, fname, fd),
-                                no_mmap_writes_options));
+      result->reset(new PosixWritableFile(
+          fname, fd,
+          GetLogicalBlockSizeForWriteIfNeeded(no_mmap_writes_options, fname,
+                                              fd),
+          no_mmap_writes_options, /*initial_file_size=*/0));
     }
     return s;
   }
@@ -499,7 +512,7 @@ class PosixFileSystem : public FileSystem {
     uint64_t size;
     if (status.ok()) {
       IOOptions opts;
-      status = GetFileSize(fname, opts, &size, nullptr);
+      status = GetFileSizeOnOpenedFile(fd, fname, &size);
     }
     void* base = nullptr;
     if (status.ok()) {
@@ -661,7 +674,7 @@ class PosixFileSystem : public FileSystem {
 
   IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/,
                        uint64_t* size, IODebugContext* /*dbg*/) override {
-    struct stat sbuf;
+    struct stat sbuf{};
     if (stat(fname.c_str(), &sbuf) != 0) {
       *size = 0;
       return IOError("while stat a file for size", fname, errno);
@@ -858,7 +871,6 @@ class PosixFileSystem : public FileSystem {
       IOOptions opts;
       return CreateDirIfMissing(*result, opts, nullptr);
     }
-    return IOStatus::OK();
   }
 
   IOStatus GetFreeSpace(const std::string& fname, const IOOptions& /*opts*/,
@@ -965,6 +977,22 @@ class PosixFileSystem : public FileSystem {
  private:
   bool forceMmapOff_ = false;  // do we override Env options?
 
+  // This is a faster API comparing to the public method that uses stat to get
+  // file size. However this API only works on opened file.
+  IOStatus GetFileSizeOnOpenedFile(const int fd, const std::string& name,
+                                   uint64_t* size) {
+    struct stat sb{};
+    *size = 0;
+    // Get file information using fstat
+    if (fstat(fd, &sb) == -1) {
+      return IOError(
+          "while fstat a file for size with fd " + std::to_string(fd), name,
+          errno);
+    }
+    *size = sb.st_size;
+    return IOStatus::OK();
+  }
+
 #ifdef OS_LINUX
   // Get the minimum "linux system limit" (i.e, the largest I/O size that the OS
   // can issue to block devices under a directory, also known as
@@ -1062,8 +1090,9 @@ class PosixFileSystem : public FileSystem {
 #if defined(ROCKSDB_IOURING_PRESENT)
     // io_uring_queue_init.
     struct io_uring* iu = nullptr;
-    if (thread_local_io_urings_) {
-      iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+    if (thread_local_async_read_io_urings_) {
+      iu = static_cast<struct io_uring*>(
+          thread_local_async_read_io_urings_->Get());
     }
 
     // Init failed, platform doesn't support io_uring.
@@ -1082,8 +1111,10 @@ class PosixFileSystem : public FileSystem {
         struct io_uring_cqe* cqe = nullptr;
         ssize_t ret = io_uring_wait_cqe(iu, &cqe);
         if (ret) {
-          // abort as it shouldn't be in indeterminate state and there is no
-          // good way currently to handle this error.
+          fprintf(stderr, "Poll: io_uring_wait_cqe failed: %ld", (long)ret);
+          if (ret == -EINTR || ret == -EAGAIN) {
+            continue;  // Retry
+          }
           abort();
         }
 
@@ -1098,25 +1129,7 @@ class PosixFileSystem : public FileSystem {
         // Reset cqe data to catch any stray reuse of it
         static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
 
-        FSReadRequest req;
-        req.scratch = posix_handle->scratch;
-        req.offset = posix_handle->offset;
-        req.len = posix_handle->len;
-
-        size_t finished_len = 0;
-        size_t bytes_read = 0;
-        bool read_again = false;
-        UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len,
-                     true /*async_read*/, posix_handle->use_direct_io,
-                     posix_handle->alignment, finished_len, &req, bytes_read,
-                     read_again);
-        posix_handle->is_finished = true;
-        io_uring_cqe_seen(iu, cqe);
-        posix_handle->cb(req, posix_handle->cb_arg);
-
-        (void)finished_len;
-        (void)bytes_read;
-        (void)read_again;
+        FinalizeAsyncRead(iu, cqe, posix_handle);
 
         if (static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
           break;
@@ -1126,7 +1139,7 @@ class PosixFileSystem : public FileSystem {
     return IOStatus::OK();
 #else
     (void)io_handles;
-    return IOStatus::NotSupported("Poll");
+    return IOStatus::NotSupported("Poll not implemented");
 #endif
   }
 
@@ -1134,8 +1147,9 @@ class PosixFileSystem : public FileSystem {
 #if defined(ROCKSDB_IOURING_PRESENT)
     // io_uring_queue_init.
     struct io_uring* iu = nullptr;
-    if (thread_local_io_urings_) {
-      iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+    if (thread_local_async_read_io_urings_) {
+      iu = static_cast<struct io_uring*>(
+          thread_local_async_read_io_urings_->Get());
     }
 
     // Init failed, platform doesn't support io_uring.
@@ -1156,6 +1170,11 @@ class PosixFileSystem : public FileSystem {
         return IOStatus::IOError("");
       }
 
+      // Mark this handle as being aborted. This is used when processing
+      // completions to distinguish between aborted handles (expect 2
+      // completions: original + cancel) and non-aborted handles (expect 1).
+      posix_handle->is_being_aborted = true;
+
       // Prepare the cancel request.
       struct io_uring_sqe* sqe;
       sqe = io_uring_get_sqe(iu);
@@ -1185,8 +1204,10 @@ class PosixFileSystem : public FileSystem {
         struct io_uring_cqe* cqe = nullptr;
         ssize_t ret = io_uring_wait_cqe(iu, &cqe);
         if (ret) {
-          // abort as it shouldn't be in indeterminate state and there is no
-          // good way currently to handle this error.
+          fprintf(stderr, "AbortIO: io_uring_wait_cqe failed: %ld", (long)ret);
+          if (ret == -EINTR || ret == -EAGAIN) {
+            continue;  // Retry
+          }
           abort();
         }
         assert(cqe != nullptr);
@@ -1200,6 +1221,14 @@ class PosixFileSystem : public FileSystem {
         }
         posix_handle->req_count++;
 
+        if (!posix_handle->is_being_aborted) {
+          // This is a completion for a handle NOT being aborted.
+          // It only has 1 outstanding request (the original read), so we
+          // should finalize it now.
+          FinalizeAsyncRead(iu, cqe, posix_handle);
+          continue;
+        }
+
         // Reset cqe data to catch any stray reuse of it
         static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
         io_uring_cqe_seen(iu, cqe);
@@ -1213,16 +1242,23 @@ class PosixFileSystem : public FileSystem {
         // - And finally, if the request to cancel wasn't
         //   found, the cancel request is completed with -ENOENT.
         //
-        // Every handle has to wait for 2 requests completion: original one and
-        // the cancel request which is tracked by PosixHandle::req_count.
-        if (posix_handle->req_count == 2 &&
-            static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
+        // Every handle being aborted has to wait for 2 requests completion:
+        // original one and the cancel request which is tracked by
+        // PosixHandle::req_count.
+        // Note: We must mark is_finished and invoke the callback for ANY handle
+        // that reaches req_count == 2, not just the one we're currently waiting
+        // for (io_handles[i]). Otherwise, if completions arrive out of order,
+        // we consume another handle's completions without marking it finished,
+        // causing an infinite hang when we later wait for that handle.
+        if (posix_handle->req_count == 2) {
           posix_handle->is_finished = true;
           FSReadRequest req;
           req.status = IOStatus::Aborted();
           posix_handle->cb(req, posix_handle->cb_arg);
 
-          break;
+          if (static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
+            break;
+          }
         }
       }
     }
@@ -1238,16 +1274,18 @@ class PosixFileSystem : public FileSystem {
   void SupportedOps(int64_t& supported_ops) override {
     supported_ops = 0;
 #if defined(ROCKSDB_IOURING_PRESENT)
-    if (IsIOUringEnabled()) {
+    if (IsIOUringEnabled() && thread_local_async_read_io_urings_) {
       // Underlying FS supports async_io
       supported_ops |= (1 << FSSupportedOps::kAsyncIO);
     }
 #endif
+    supported_ops |= (1 << FSSupportedOps::kFSPrefetch);
   }
 
 #if defined(ROCKSDB_IOURING_PRESENT)
   // io_uring instance
-  std::unique_ptr<ThreadLocalPtr> thread_local_io_urings_;
+  std::unique_ptr<ThreadLocalPtr> thread_local_async_read_io_urings_;
+  std::unique_ptr<ThreadLocalPtr> thread_local_multi_read_io_urings_;
 #endif
 
   size_t page_size_;
@@ -1302,12 +1340,12 @@ PosixFileSystem::PosixFileSystem()
       page_size_(getpagesize()),
       allow_non_owner_access_(true) {
 #if defined(ROCKSDB_IOURING_PRESENT)
-  // Test whether IOUring is supported, and if it does, create a managing
-  // object for thread local point so that in the future thread-local
-  // io_uring can be created.
+  // Test whether IOUring is supported with the same flags that ReadAsync and
+  // MultiRead will use at runtime.
   struct io_uring* new_io_uring = CreateIOUring();
   if (new_io_uring != nullptr) {
-    thread_local_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
+    thread_local_async_read_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
+    thread_local_multi_read_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
     delete new_io_uring;
   }
 #endif
diff --git a/env/io_posix.cc b/env/io_posix.cc
index 231e88daef39..a04e469cb91e 100644
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@@ -589,7 +589,8 @@ PosixRandomAccessFile::PosixRandomAccessFile(
     const EnvOptions& options
 #if defined(ROCKSDB_IOURING_PRESENT)
     ,
-    ThreadLocalPtr* thread_local_io_urings
+    ThreadLocalPtr* thread_local_async_read_io_urings,
+    ThreadLocalPtr* thread_local_multi_read_io_urings
 #endif
     )
     : filename_(fname),
@@ -598,7 +599,8 @@ PosixRandomAccessFile::PosixRandomAccessFile(
       logical_sector_size_(logical_block_size)
 #if defined(ROCKSDB_IOURING_PRESENT)
       ,
-      thread_local_io_urings_(thread_local_io_urings)
+      thread_local_async_read_io_urings_(thread_local_async_read_io_urings),
+      thread_local_multi_read_io_urings_(thread_local_multi_read_io_urings)
 #endif
 {
   assert(!options.use_direct_reads || !options.use_mmap_reads);
@@ -607,6 +609,17 @@ PosixRandomAccessFile::PosixRandomAccessFile(
 
 PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
 
+IOStatus PosixRandomAccessFile::GetFileSize(uint64_t* result) {
+  struct stat sbuf{};
+  if (fstat(fd_, &sbuf) != 0) {
+    *result = 0;
+    return IOError("While fstat with fd " + std::to_string(fd_), filename_,
+                   errno);
+  }
+  *result = sbuf.st_size;
+  return IOStatus::OK();
+}
+
 IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
                                      const IOOptions& /*opts*/, Slice* result,
                                      char* scratch,
@@ -648,6 +661,83 @@ IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
   return s;
 }
 
+// MultiRead: Perform multiple concurrent read requests using io_uring.
+//
+// OVERVIEW:
+// This function batches multiple read requests and submits them concurrently
+// to io_uring for improved I/O performance. It operates synchronously from the
+// caller's perspective (blocks until all reads complete) but uses io_uring's
+// async capabilities internally for parallel I/O execution.
+//
+// IO_URING LIFECYCLE:
+// 1. Preparation Phase:
+//    - Allocate SQEs (Submission Queue Entries) for read requests
+//    - Limited by: min(pending_work, io_uring_sq_space_left(), kIoUringDepth -
+//    inflight)
+//    - Uses io_uring_sq_space_left() to query available SQ slots
+//    - Each SQE is tracked in wrap_cache for completion matching
+//
+// 2. Submission Phase:
+//    - Loop: while io_uring_sq_ready() > 0 (SQEs pending submission)
+//    - Call io_uring_submit_and_wait() to submit SQEs and wait for CQEs
+//    - Handles retryable errors (EINTR, EAGAIN) by continuing
+//    - Breaks on terminal errors (logs error, sets err variable)
+//
+// 3. Completion Phase:
+//    - Non-blocking CQE reaping via io_uring_for_each_cqe()
+//    - Matches CQEs to requests using user_data pointer
+//    - Processes results: updates bytes read, handles partial reads
+//    - Removes completed requests from wrap_cache
+//
+// 4. Loop Iteration:
+//    - Repeats until: all requests submitted AND all completions reaped
+//    - Termination condition: (num_reqs == reqs_off) &&
+//    resubmit_rq_list.empty() && wrap_cache.empty()
+//
+// ERROR HANDLING STRATEGY:
+// - Retryable submission errors (-EINTR, -EAGAIN): Retry submission
+// - Memory pressure (-ENOMEM): Mark memory_pressure_on_submission, attempt
+// recovery
+// - Terminal submission errors: Break, enter teardown path
+// - Retryable CQE errors (-EINTR, -EAGAIN): Add to resubmit_rq_list for retry
+// - Terminal CQE errors: Set ios to IOError, continue processing other CQEs
+// - Teardown path: If SQEs remain unsubmitted after error, reap submitted CQEs,
+//   destroy io_uring instance, return error
+//
+// PARTIAL READ HANDLING:
+// - Short reads (bytes_read < requested): Request added to resubmit_rq_list
+// - finished_len tracks cumulative bytes read across resubmissions
+// - iov.iov_base/iov_len adjusted on each resubmission attempt
+// - UpdateResult() determines if read should be retried based on:
+//   * Direct I/O alignment requirements
+//   * EOF detection
+//   * Error conditions
+//
+// RESUBMISSION LOGIC:
+// - resubmit_rq_list: Requests needing retry (short reads, EINTR/EAGAIN errors)
+// - Prioritized in SQE allocation loop: resubmits before new requests
+// - List cleared after SQE preparation
+// - Requests remain in wrap_cache across resubmissions until fully complete
+//
+// CONCURRENCY CONTROL:
+// - wrap_cache.size(): Tracks total inflight requests (SQ + CQ)
+// - io_uring_sq_ready(): Queries SQEs prepared but not yet submitted
+// - io_uring_sq_space_left(): Queries available SQ slots
+// - Max concurrency: kIoUringDepth (256)
+//
+// ACCOUNTING CORRECTNESS:
+// - Uses io_uring native APIs (io_uring_sq_ready, io_uring_sq_space_left)
+//   instead of manual counters for robustness
+// - wrap_cache is the authoritative source for inflight request tracking
+// - Re-query io_uring_sq_ready() after submission loop to detect
+//   unsubmitted SQEs (indicates submission errors)
+//
+// THREAD SAFETY:
+// - Uses thread-local io_uring instance (thread_local_multi_read_io_urings_)
+// - IORING_SETUP_SINGLE_ISSUER: Only one thread submits to this ring
+// - IORING_SETUP_DEFER_TASKRUN: Task work runs in submitting thread
+// - No cross-thread coordination required
+//
 IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
                                           const IOOptions& options,
                                           IODebugContext* dbg) {
@@ -661,12 +751,13 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
 
 #if defined(ROCKSDB_IOURING_PRESENT)
   struct io_uring* iu = nullptr;
-  if (thread_local_io_urings_) {
-    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+  if (thread_local_multi_read_io_urings_) {
+    iu = static_cast<struct io_uring*>(
+        thread_local_multi_read_io_urings_->Get());
     if (iu == nullptr) {
       iu = CreateIOUring();
       if (iu != nullptr) {
-        thread_local_io_urings_->Reset(iu);
+        thread_local_multi_read_io_urings_->Reset(iu);
       }
     }
   }
@@ -677,8 +768,6 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
     return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
   }
 
-  IOStatus ios = IOStatus::OK();
-
   struct WrappedReadRequest {
     FSReadRequest* req;
     struct iovec iov;
@@ -687,118 +776,199 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
   };
 
   autovector<WrappedReadRequest, 32> req_wraps;
-  autovector<WrappedReadRequest*, 4> incomplete_rq_list;
+  autovector<WrappedReadRequest*, 4> resubmit_rq_list;
   std::unordered_set<WrappedReadRequest*> wrap_cache;
 
   for (size_t i = 0; i < num_reqs; i++) {
     req_wraps.emplace_back(&reqs[i]);
   }
 
+  IOStatus ios = IOStatus::OK();
   size_t reqs_off = 0;
-  while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
-    size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();
-
-    // If requests exceed depth, split it into batches
-    if (this_reqs > kIoUringDepth) {
-      this_reqs = kIoUringDepth;
-    }
-
-    assert(incomplete_rq_list.size() <= this_reqs);
-    for (size_t i = 0; i < this_reqs; i++) {
-      WrappedReadRequest* rep_to_submit;
-      if (i < incomplete_rq_list.size()) {
-        rep_to_submit = incomplete_rq_list[i];
+  while ((num_reqs > reqs_off) || !resubmit_rq_list.empty() ||
+         !wrap_cache.empty()) {
+    assert(resubmit_rq_list.size() + wrap_cache.size() <= kIoUringDepth);
+    // Total number of requests that still need to be submitted, includes:
+    //
+    //  1) requests NOT yet submitted (num_reqs - reqs_off)
+    //  2) requests on resubmission list (resubmit_rq_list)
+    //
+    // capped by min of the # of remaining entries in IO ring submission queue
+    // and the max IO ring depth less the inflight requests.
+    size_t new_sqe_reqs_count = std::min({
+        num_reqs - reqs_off + resubmit_rq_list.size(),
+        static_cast<size_t>(io_uring_sq_space_left(iu)),
+        kIoUringDepth - wrap_cache.size()  // queue depth less inflight requests
+    });
+    for (size_t i = 0; i < new_sqe_reqs_count; i++) {
+      WrappedReadRequest* req;
+      if (i < resubmit_rq_list.size()) {
+        req = resubmit_rq_list[i];
       } else {
-        rep_to_submit = &req_wraps[reqs_off++];
+        req = &req_wraps[reqs_off++];
       }
-      assert(rep_to_submit->req->len > rep_to_submit->finished_len);
-      rep_to_submit->iov.iov_base =
-          rep_to_submit->req->scratch + rep_to_submit->finished_len;
-      rep_to_submit->iov.iov_len =
-          rep_to_submit->req->len - rep_to_submit->finished_len;
+      assert(req->req->len > req->finished_len);
+      req->iov.iov_base = req->req->scratch + req->finished_len;
+      req->iov.iov_len = req->req->len - req->finished_len;
 
       struct io_uring_sqe* sqe;
       sqe = io_uring_get_sqe(iu);
-      io_uring_prep_readv(
-          sqe, fd_, &rep_to_submit->iov, 1,
-          rep_to_submit->req->offset + rep_to_submit->finished_len);
-      io_uring_sqe_set_data(sqe, rep_to_submit);
-      wrap_cache.emplace(rep_to_submit);
+      // NULL is unexpected as we do maintain proper ring accounting.
+      assert(sqe);
+      io_uring_prep_readv(sqe, fd_, &req->iov, 1,
+                          req->req->offset + req->finished_len);
+      io_uring_sqe_set_data(sqe, req);
+      wrap_cache.emplace(req);
     }
-    incomplete_rq_list.clear();
+    resubmit_rq_list.clear();
+
+    struct io_uring_cqe* cqe = nullptr;
+    unsigned head;
+    ssize_t err = 0;
+    bool memory_pressure_on_submission = false;
+    unsigned reqs_pending_submission;
+    unsigned reqs_submitted = 0;
+    while ((reqs_pending_submission = io_uring_sq_ready(iu))) {
+      // MultiRead is synchronous in nature. io_uring_submit_and_wait provides
+      // batching semantics (submit + best effort wait in one syscall), while
+      // io_uring_submit enables async producer/consumer semantics (submit
+      // only, requires separate reaping). We chose batching approach to
+      // reduce the volume of syscalls and context switches.
+      ssize_t ret = io_uring_submit_and_wait(iu, reqs_pending_submission);
+      if (ret < 0) {
+        if (-EINTR == ret || -EAGAIN == ret) {
+          // Submission failed due to rare, retryable syscall error. Try again.
+          continue;
+        }
+        if (-ENOMEM == ret) {
+          fprintf(stderr,
+                  "PosixRandomAccessFile::MultiRead: io_uring_submit_and_wait "
+                  "experienced terse memory condition.\n");
+          // Best effort to reclaim resources in terse condition.
+          memory_pressure_on_submission = true;
+        } else {
+          fprintf(stderr,
+                  "PosixRandomAccessFile::MultiRead: "
+                  "io_uring_submit_and_wait returned terminal error: %zd.\n",
+                  ret);
+          err = ret;
+        }
+        break;
+      }
+      if (0 == ret) {
+        // This scenario is unexpected for any modern kernel!
+        // We deliberately error out to avoid bugs around infinite loops.
+        fprintf(stderr,
+                "PosixRandomAccessFile::MultiRead: "
+                "io_uring_submit_and_wait returned 0 submissions!\n");
+        break;
+      }
+      reqs_submitted += static_cast<unsigned int>(ret);
+    };
+    reqs_pending_submission = io_uring_sq_ready(iu);
 
-    ssize_t ret =
-        io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
     TEST_SYNC_POINT_CALLBACK(
-        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
-        &ret);
-    TEST_SYNC_POINT_CALLBACK(
-        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
-        iu);
-
-    if (static_cast<size_t>(ret) != this_reqs) {
-      fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
-      // If error happens and we submitted fewer than expected, it is an
-      // exception case and we don't retry here. We should still consume
-      // what is is submitted in the ring.
-      for (ssize_t i = 0; i < ret; i++) {
-        struct io_uring_cqe* cqe = nullptr;
-        io_uring_wait_cqe(iu, &cqe);
-        if (cqe != nullptr) {
-          io_uring_cqe_seen(iu, cqe);
+        "PosixRandomAccessFile::MultiRead:io_uring_sq_ready:return1",
+        &reqs_pending_submission);
+
+    // Error occurred or IO uring stopped submitting outstanding requests.
+    if (reqs_pending_submission && !memory_pressure_on_submission) {
+      // IO ring is initialized once in thread-local variable and then reused
+      // to handle the consecutive MultiRead API calls. Therefore, it's crucial
+      // to reap all the submitted requests.
+      //
+      // NOTE: Loop will run indefinitely until we reap all the completions!!!
+      size_t nr = 0;
+      assert(reqs_pending_submission <= wrap_cache.size());
+      size_t nr_await_cqe = wrap_cache.size() - reqs_pending_submission;
+      while (nr < nr_await_cqe) {
+        // blocking
+        io_uring_wait_cqes(iu, &cqe,
+                           static_cast<unsigned int>(nr_await_cqe - nr),
+                           nullptr, nullptr);
+        size_t reaped_cqe_count = 0;
+        io_uring_for_each_cqe(iu, head, cqe) { reaped_cqe_count++; }
+        if (reaped_cqe_count > 0) {
+          io_uring_cq_advance(iu, static_cast<unsigned int>(reaped_cqe_count));
+          nr += reaped_cqe_count;
         }
       }
-      return IOStatus::IOError("io_uring_submit_and_wait() requested " +
-                               std::to_string(this_reqs) + " but returned " +
-                               std::to_string(ret));
-    }
 
-    for (size_t i = 0; i < this_reqs; i++) {
-      struct io_uring_cqe* cqe = nullptr;
-      WrappedReadRequest* req_wrap;
-
-      // We could use the peek variant here, but this seems safer in terms
-      // of our initial wait not reaping all completions
-      ret = io_uring_wait_cqe(iu, &cqe);
       TEST_SYNC_POINT_CALLBACK(
-          "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret);
-      if (ret) {
-        ios = IOStatus::IOError("io_uring_wait_cqe() returns " +
-                                std::to_string(ret));
-
-        if (cqe != nullptr) {
-          io_uring_cqe_seen(iu, cqe);
-        }
-        continue;
+          "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
+          iu);
+
+      // While all the submitted completions have been reaped successfully,
+      // IO ring submission queue still contains at least one non-submitted
+      // request. Destroy io_uring (discards unsubmitted SQEs).
+      //
+      // NOTE: This is a rare scenario and should not happen in normal cases.
+      //       Hence, this should NOT materially impact the performance metrics.
+      io_uring_queue_exit(iu);
+      delete iu;
+      thread_local_multi_read_io_urings_->Reset(nullptr);
+
+      if (err < 0) {
+        return IOStatus::IOError(
+            "io_uring_submit_and_wait() failed with an error " +
+            std::to_string(err));
       }
+      return IOStatus::IOError(
+          "io_uring_submit_and_wait() requested " +
+          std::to_string(reqs_submitted + reqs_pending_submission) +
+          " but returned " + std::to_string(reqs_submitted));
+    }
 
-      req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
-      // Reset cqe data to catch any stray reuse of it
-      static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
-      // Check that we got a valid unique cqe data
-      auto wrap_check = wrap_cache.find(req_wrap);
-      if (wrap_check == wrap_cache.end()) {
-        fprintf(stderr,
-                "PosixRandomAccessFile::MultiRead: "
-                "Bad cqe data from IO uring - %p\n",
-                req_wrap);
-        port::PrintStack();
-        ios = IOStatus::IOError("io_uring_cqe_get_data() returned " +
-                                std::to_string((uint64_t)req_wrap));
-        continue;
-      }
-      wrap_cache.erase(wrap_check);
-
-      FSReadRequest* req = req_wrap->req;
-      size_t bytes_read = 0;
-      bool read_again = false;
-      UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len,
-                   false /*async_read*/, use_direct_io(),
-                   GetRequiredBufferAlignment(), req_wrap->finished_len, req,
-                   bytes_read, read_again);
-      int32_t res = cqe->res;
-      if (res >= 0) {
-        if (bytes_read == 0) {
+    if ((0 == reqs_submitted) && wrap_cache.size() > reqs_pending_submission) {
+      // If no requests have been submitted and there is at least one request
+      // pending completion, wait for at least one completion to arrive.
+      // This is a guardrail to prevent the busy CPU loops.
+      //
+      // NOTE: it's not really a tight CPU-burning loop in the traditional sense
+      // as it's naturally throttled by the io_uring_submit_and_wait() syscall.
+      io_uring_wait_cqe(iu, &cqe);
+    }
+
+    unsigned int nr = 0;
+    io_uring_for_each_cqe(iu, head, cqe) {  // non-blocking
+      if (cqe->user_data) {  // non-discarded, valid user data only!
+        nr++;
+        WrappedReadRequest* req_wrap =
+            static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
+        // Reset cqe data to catch any stray reuse of it
+        static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
+        // Check that we got a valid unique cqe data
+        auto wrap_check = wrap_cache.find(req_wrap);
+        if (wrap_check == wrap_cache.end()) {
+          fprintf(stderr,
+                  "PosixRandomAccessFile::MultiRead: "
+                  "Bad cqe data from IO uring - %p\n",
+                  req_wrap);
+          port::PrintStack();
+          ios = IOStatus::IOError("io_uring_cqe_get_data() returned " +
+                                  std::to_string((uint64_t)req_wrap));
+          continue;
+        }
+        wrap_cache.erase(wrap_check);
+        if (cqe->res < 0) {
+          if (-EINTR == cqe->res || -EAGAIN == cqe->res) {
+            resubmit_rq_list.push_back(req_wrap);
+          } else {
+            ios = IOStatus::IOError("io_uring_for_each_cqe() returns " +
+                                    std::to_string(cqe->res));
+          }
+          continue;
+        }
+        // cqe->res >= 0
+        FSReadRequest* req = req_wrap->req;
+        size_t bytes_read = 0;
+        bool read_again = false;
+        UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len,
+                     false /*async_read*/, use_direct_io(),
+                     GetRequiredBufferAlignment(), req_wrap->finished_len, req,
+                     bytes_read, read_again);
+
+        if (0 == bytes_read) {
           if (read_again) {
             Slice tmp_slice;
             req->status =
@@ -808,14 +978,15 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
             req->result =
                 Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
           }
-          // else It means EOF so no need to do anything.
+          // else it means EOF so no need to do anything.
         } else if (bytes_read < req_wrap->iov.iov_len) {
-          incomplete_rq_list.push_back(req_wrap);
+          resubmit_rq_list.push_back(req_wrap);
         }
       }
-      io_uring_cqe_seen(iu, cqe);
     }
-    wrap_cache.clear();
+    if (nr > 0) {
+      io_uring_cq_advance(iu, nr);
+    }
   }
   return ios;
 #else
@@ -912,19 +1083,21 @@ IOStatus PosixRandomAccessFile::ReadAsync(
 #if defined(ROCKSDB_IOURING_PRESENT)
   // io_uring_queue_init.
   struct io_uring* iu = nullptr;
-  if (thread_local_io_urings_) {
-    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+  if (thread_local_async_read_io_urings_) {
+    iu = static_cast<struct io_uring*>(
+        thread_local_async_read_io_urings_->Get());
     if (iu == nullptr) {
       iu = CreateIOUring();
       if (iu != nullptr) {
-        thread_local_io_urings_->Reset(iu);
+        thread_local_async_read_io_urings_->Reset(iu);
       }
     }
   }
 
   // Init failed, platform doesn't support io_uring.
   if (iu == nullptr) {
-    return IOStatus::NotSupported("ReadAsync");
+    fprintf(stderr, "failed to init io_uring\n");
+    return IOStatus::NotSupported("ReadAsync: failed to init io_uring");
   }
 
   // Allocate io_handle.
@@ -954,11 +1127,35 @@ IOStatus PosixRandomAccessFile::ReadAsync(
   io_uring_sqe_set_data(sqe, posix_handle);
 
   // Step 4: io_uring_submit
-  ssize_t ret = io_uring_submit(iu);
-  if (ret < 0) {
-    fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
-    return IOStatus::IOError("io_uring_submit() requested but returned " +
-                             std::to_string(ret));
+  ssize_t ret;
+  do {
+    ret = io_uring_submit(iu);
+    if (ret < 0) {
+      if (-EINTR == ret || -EAGAIN == ret) {
+        // Submission failed due to transient error. Try again.
+        continue;
+      }
+      fprintf(stderr,
+              "PosixRandomAccessFile::ReadAsync: "
+              "io_uring_submit returned terminal error = %zd\n",
+              ret);
+      break;
+    }
+    if (0 == ret) {
+      // Unexpected. Will be reported as error.
+      break;
+    }
+  } while (ret < 1);
+  if (ret <= 0) {
+    return IOStatus::IOError(
+        "PosixRandomAccessFile::ReadAsync: io_uring_submit() returned " +
+        std::to_string(ret));
+  }
+  if (ret > 1) {
+    fprintf(stderr,
+            "PosixRandomAccessFile::ReadAsync: "
+            "io_uring_submit() returned = %zd\n",
+            ret);
   }
   return IOStatus::OK();
 #else
@@ -967,7 +1164,8 @@ IOStatus PosixRandomAccessFile::ReadAsync(
   (void)cb_arg;
   (void)io_handle;
   (void)del_fn;
-  return IOStatus::NotSupported("ReadAsync");
+  return IOStatus::NotSupported(
+      "ReadAsync: ROCKSDB_IOURING_PRESENT is not set");
 #endif
 }
 
@@ -1056,6 +1254,11 @@ IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
 #endif
 }
 
+IOStatus PosixMmapReadableFile::GetFileSize(uint64_t* result) {
+  *result = length_;
+  return IOStatus::OK();
+}
+
 /*
  * PosixMmapFile
  *
@@ -1138,7 +1341,8 @@ IOStatus PosixMmapFile::Msync() {
 }
 
 PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
-                             const EnvOptions& options)
+                             const EnvOptions& options,
+                             uint64_t initial_file_size)
     : filename_(fname),
       fd_(fd),
       page_size_(page_size),
@@ -1147,7 +1351,7 @@ PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
       limit_(nullptr),
       dst_(nullptr),
       last_sync_(nullptr),
-      file_offset_(0) {
+      file_offset_(initial_file_size) {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
   allow_fallocate_ = options.allow_fallocate;
   fallocate_with_keep_size_ = options.fallocate_with_keep_size;
@@ -1317,12 +1521,13 @@ IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len,
  */
 PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
                                      size_t logical_block_size,
-                                     const EnvOptions& options)
+                                     const EnvOptions& options,
+                                     uint64_t initial_file_size)
     : FSWritableFile(options),
       filename_(fname),
       use_direct_io_(options.use_direct_writes),
       fd_(fd),
-      filesize_(0),
+      filesize_(initial_file_size),
       logical_sector_size_(logical_block_size) {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
   allow_fallocate_ = options.allow_fallocate;
@@ -1386,6 +1591,7 @@ IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/,
                 filename_, errno);
   } else {
     filesize_ = size;
+    lseek(fd_, filesize_, SEEK_SET);
   }
   return s;
 }
diff --git a/env/io_posix.h b/env/io_posix.h
index 60788df9bf8b..bca0c5836a63 100644
--- a/env/io_posix.h
+++ b/env/io_posix.h
@@ -11,6 +11,16 @@
 #if defined(ROCKSDB_IOURING_PRESENT)
 #include <liburing.h>
 #include <sys/uio.h>
+
+// Compatibility defines for io_uring flags that may not be present in older
+// kernel headers. These values are fixed and won't change, so it's safe to
+// define them even if the running kernel doesn't support them.
+#ifndef IORING_SETUP_SINGLE_ISSUER
+#define IORING_SETUP_SINGLE_ISSUER (1U << 12)
+#endif
+#ifndef IORING_SETUP_DEFER_TASKRUN
+#define IORING_SETUP_DEFER_TASKRUN (1U << 13)
+#endif
 #endif
 #include <unistd.h>
 
@@ -117,6 +127,7 @@ struct Posix_IOHandle {
         use_direct_io(_use_direct_io),
         alignment(_alignment),
         is_finished(false),
+        is_being_aborted(false),
         req_count(0) {}
 
   struct iovec iov;
@@ -129,6 +140,10 @@ struct Posix_IOHandle {
   bool use_direct_io;
   size_t alignment;
   bool is_finished;
+  // is_being_aborted is set by AbortIO when a cancel request is submitted.
+  // Used to distinguish between aborted handles (expect 2 completions) and
+  // non-aborted handles (expect 1 completion) when processing completions.
+  bool is_being_aborted;
   // req_count is used by AbortIO API to keep track of number of requests.
   uint32_t req_count;
 };
@@ -187,6 +202,27 @@ inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name,
   (void)len;
 #endif
 }
+
+// Finalize a completed async read request.
+// Processes the CQE result, marks the handle as finished, and invokes the
+// callback. This is shared between Poll and AbortIO (for non-aborted handles).
+inline void FinalizeAsyncRead(struct io_uring* iu, struct io_uring_cqe* cqe,
+                              Posix_IOHandle* posix_handle) {
+  FSReadRequest req;
+  req.scratch = posix_handle->scratch;
+  req.offset = posix_handle->offset;
+  req.len = posix_handle->len;
+
+  size_t finished_len = 0;
+  size_t bytes_read = 0;
+  bool read_again = false;
+  UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len, true /*async_read*/,
+               posix_handle->use_direct_io, posix_handle->alignment,
+               finished_len, &req, bytes_read, read_again);
+  posix_handle->is_finished = true;
+  io_uring_cqe_seen(iu, cqe);
+  posix_handle->cb(req, posix_handle->cb_arg);
+}
 #endif
 
 #ifdef OS_LINUX
@@ -299,7 +335,10 @@ inline void DeleteIOUring(void* p) {
 
 inline struct io_uring* CreateIOUring() {
   struct io_uring* new_io_uring = new struct io_uring;
-  int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, 0);
+  unsigned int flags = 0;
+  flags |= IORING_SETUP_SINGLE_ISSUER;
+  flags |= IORING_SETUP_DEFER_TASKRUN;
+  int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, flags);
   if (ret) {
     delete new_io_uring;
     new_io_uring = nullptr;
@@ -315,7 +354,8 @@ class PosixRandomAccessFile : public FSRandomAccessFile {
   bool use_direct_io_;
   size_t logical_sector_size_;
 #if defined(ROCKSDB_IOURING_PRESENT)
-  ThreadLocalPtr* thread_local_io_urings_;
+  ThreadLocalPtr* thread_local_async_read_io_urings_;
+  ThreadLocalPtr* thread_local_multi_read_io_urings_;
 #endif
 
  public:
@@ -323,7 +363,8 @@ class PosixRandomAccessFile : public FSRandomAccessFile {
                         size_t logical_block_size, const EnvOptions& options
 #if defined(ROCKSDB_IOURING_PRESENT)
                         ,
-                        ThreadLocalPtr* thread_local_io_urings
+                        ThreadLocalPtr* thread_local_async_read_io_urings,
+                        ThreadLocalPtr* thread_local_multi_read_io_urings
 #endif
   );
   virtual ~PosixRandomAccessFile();
@@ -352,6 +393,8 @@ class PosixRandomAccessFile : public FSRandomAccessFile {
                              void* cb_arg, void** io_handle,
                              IOHandleDeleter* del_fn,
                              IODebugContext* dbg) override;
+
+  virtual IOStatus GetFileSize(uint64_t* result) override;
 };
 
 class PosixWritableFile : public FSWritableFile {
@@ -374,7 +417,8 @@ class PosixWritableFile : public FSWritableFile {
  public:
   explicit PosixWritableFile(const std::string& fname, int fd,
                              size_t logical_block_size,
-                             const EnvOptions& options);
+                             const EnvOptions& options,
+                             uint64_t initial_file_size);
   virtual ~PosixWritableFile();
 
   // Need to implement this so the file is truncated correctly
@@ -436,6 +480,7 @@ class PosixMmapReadableFile : public FSRandomAccessFile {
                 char* scratch, IODebugContext* dbg) const override;
   void Hint(AccessPattern pattern) override;
   IOStatus InvalidateCache(size_t offset, size_t length) override;
+  virtual IOStatus GetFileSize(uint64_t* result) override;
 };
 
 class PosixMmapFile : public FSWritableFile {
@@ -469,7 +514,7 @@ class PosixMmapFile : public FSWritableFile {
 
  public:
   PosixMmapFile(const std::string& fname, int fd, size_t page_size,
-                const EnvOptions& options);
+                const EnvOptions& options, uint64_t initial_file_size);
   ~PosixMmapFile();
 
   // Means Close() will properly take care of truncate
diff --git a/env/io_posix_test.cc b/env/io_posix_test.cc
index 81ce5058708b..6daff356afaf 100644
--- a/env/io_posix_test.cc
+++ b/env/io_posix_test.cc
@@ -4,6 +4,7 @@
 // (found in the LICENSE.Apache file in the root directory).
 
 #include "test_util/testharness.h"
+#include "util/random.h"
 
 #ifdef ROCKSDB_LIB_IO_POSIX
 #include "env/io_posix.h"
@@ -131,6 +132,48 @@ TEST_F(LogicalBlockSizeCacheTest, Ref) {
 }
 #endif
 
+class PosixWritableFileTest : public testing::Test {};
+
+TEST_F(PosixWritableFileTest, SeekAfterTruncate) {
+  std::shared_ptr<FileSystem> fs = FileSystem::Default();
+  std::string path =
+      test::PerThreadDBPath("PosixWritableFileTest_SeekAfterTruncate");
+  Random rnd(300);
+  std::unique_ptr<FSWritableFile> wfile;
+
+  ASSERT_OK(fs->NewWritableFile(path, FileOptions(), &wfile, nullptr));
+  ASSERT_OK(wfile->Append(rnd.RandomString(16384), IOOptions(), nullptr));
+  ASSERT_OK(wfile->Truncate(4096, IOOptions(), nullptr));
+  ASSERT_OK(wfile->Append(rnd.RandomString(4096), IOOptions(), nullptr));
+  ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+  wfile.reset();
+
+  uint64_t size = 0;
+  ASSERT_OK(fs->GetFileSize(path, IOOptions(), &size, nullptr));
+  ASSERT_EQ(size, 8192);
+  ASSERT_OK(fs->DeleteFile(path, IOOptions(), nullptr));
+}
+
+TEST_F(PosixWritableFileTest, SeekAfterExtend) {
+  std::shared_ptr<FileSystem> fs = FileSystem::Default();
+  std::string path =
+      test::PerThreadDBPath("PosixWritableFileTest_SeekAfterTruncate");
+  Random rnd(300);
+  std::unique_ptr<FSWritableFile> wfile;
+
+  ASSERT_OK(fs->NewWritableFile(path, FileOptions(), &wfile, nullptr));
+  ASSERT_OK(wfile->Append(rnd.RandomString(4096), IOOptions(), nullptr));
+  ASSERT_OK(wfile->Truncate(8192, IOOptions(), nullptr));
+  ASSERT_OK(wfile->Append(rnd.RandomString(8192), IOOptions(), nullptr));
+  ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+  wfile.reset();
+
+  uint64_t size = 0;
+  ASSERT_OK(fs->GetFileSize(path, IOOptions(), &size, nullptr));
+  ASSERT_EQ(size, 16384);
+  ASSERT_OK(fs->DeleteFile(path, IOOptions(), nullptr));
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 #endif
 
diff --git a/env/mock_env.cc b/env/mock_env.cc
index bf0e76adbbe4..0f9e5ab47f67 100644
--- a/env/mock_env.cc
+++ b/env/mock_env.cc
@@ -322,6 +322,11 @@ class MockRandomAccessFile : public FSRandomAccessFile {
     }
   }
 
+  IOStatus GetFileSize(uint64_t* size) override {
+    *size = file_->Size();
+    return IOStatus::OK();
+  }
+
  private:
   MemFile* file_;
   bool use_direct_io_;
diff --git a/examples/Makefile b/examples/Makefile
index b056508a6c3f..0970cfd4002d 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -19,16 +19,16 @@ CFLAGS += -Wstrict-prototypes
 all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example options_file_example rocksdb_backup_restore_example
 
 simple_example: librocksdb simple_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 column_families_example: librocksdb column_families_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 compaction_filter_example: librocksdb compaction_filter_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 compact_files_example: librocksdb compact_files_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 .c.o:
 	$(CC) $(CFLAGS) -c $< -o $@ -I../include
@@ -37,19 +37,19 @@ c_simple_example: librocksdb c_simple_example.o
 	$(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS)
 
 optimistic_transaction_example: librocksdb optimistic_transaction_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 transaction_example: librocksdb transaction_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 options_file_example: librocksdb options_file_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 multi_processes_example: librocksdb multi_processes_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 rocksdb_backup_restore_example: librocksdb rocksdb_backup_restore_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 clean:
 	rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example ./options_file_example ./multi_processes_example ./rocksdb_backup_restore_example
diff --git a/examples/column_families_example.cc b/examples/column_families_example.cc
index 3828d3fb3f73..f8ce4b8c7013 100644
--- a/examples/column_families_example.cc
+++ b/examples/column_families_example.cc
@@ -3,6 +3,7 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 #include <cstdio>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -32,7 +33,7 @@ int main() {
   // open DB
   Options options;
   options.create_if_missing = true;
-  DB* db;
+  std::unique_ptr<DB> db;
   Status s = DB::Open(options, kDBPath, &db);
   assert(s.ok());
 
@@ -44,7 +45,7 @@ int main() {
   // close DB
   s = db->DestroyColumnFamilyHandle(cf);
   assert(s.ok());
-  delete db;
+  db.reset();
 
   // open DB with two column families
   std::vector<ColumnFamilyDescriptor> column_families;
@@ -82,7 +83,7 @@ int main() {
     s = db->DestroyColumnFamilyHandle(handle);
     assert(s.ok());
   }
-  delete db;
+  db.reset();
 
   return 0;
 }
diff --git a/examples/compact_files_example.cc b/examples/compact_files_example.cc
index 52b054002d76..cc9e04e4506b 100644
--- a/examples/compact_files_example.cc
+++ b/examples/compact_files_example.cc
@@ -6,6 +6,7 @@
 // An example code demonstrating how to use CompactFiles, EventListener,
 // and GetColumnFamilyMetaData APIs to implement custom compaction algorithm.
 
+#include <memory>
 #include <mutex>
 #include <string>
 
@@ -151,10 +152,12 @@ int main() {
   options.IncreaseParallelism(5);
   options.listeners.emplace_back(new FullCompactor(options));
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ROCKSDB_NAMESPACE::DestroyDB(kDBPath, options);
-  Status s = DB::Open(options, kDBPath, &db);
-  assert(s.ok());
+  {
+    Status s = DB::Open(options, kDBPath, &db);
+    assert(s.ok());
+  }
   assert(db);
 
   // if background compaction is not working, write will stall
@@ -172,7 +175,7 @@ int main() {
   }
 
   // close the db.
-  delete db;
+  db.reset();
 
   return 0;
 }
diff --git a/examples/compaction_filter_example.cc b/examples/compaction_filter_example.cc
index 03a1952600d7..9c17a229940b 100644
--- a/examples/compaction_filter_example.cc
+++ b/examples/compaction_filter_example.cc
@@ -63,7 +63,7 @@ std::string kRemoveDirCommand = "rm -rf ";
 #endif
 
 int main() {
-  ROCKSDB_NAMESPACE::DB* raw_db;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Status status;
 
   MyFilter filter;
@@ -77,9 +77,8 @@ int main() {
   options.create_if_missing = true;
   options.merge_operator.reset(new MyMerge);
   options.compaction_filter = &filter;
-  status = ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &raw_db);
+  status = ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db);
   assert(status.ok());
-  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(raw_db);
 
   ROCKSDB_NAMESPACE::WriteOptions wopts;
   db->Merge(wopts, "0", "bad");  // This is filtered out
diff --git a/examples/multi_processes_example.cc b/examples/multi_processes_example.cc
index b9a6cbe207d1..20a3af3637b4 100644
--- a/examples/multi_processes_example.cc
+++ b/examples/multi_processes_example.cc
@@ -19,6 +19,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>
+#include <memory>
 #include <string>
 #include <thread>
 #include <vector>
@@ -147,7 +148,7 @@ void CreateDB() {
     assert(false);
   }
   options.create_if_missing = true;
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   s = DB::Open(options, kDBPath, &db);
   if (!s.ok()) {
     fprintf(stderr, "[process %ld] Failed to open DB: %s\n", my_pid,
@@ -173,7 +174,7 @@ void CreateDB() {
     delete h;
   }
   handles.clear();
-  delete db;
+  db.reset();
 }
 
 void RunPrimary() {
@@ -181,7 +182,7 @@ void RunPrimary() {
   fprintf(stdout, "[process %ld] Primary instance starts\n", my_pid);
   CreateDB();
   std::srand(time(nullptr));
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = false;
   std::vector<ColumnFamilyDescriptor> column_families;
@@ -227,8 +228,7 @@ void RunPrimary() {
         delete h;
       }
       handles.clear();
-      delete db;
-      db = nullptr;
+      db.reset();
     }
   }
   if (nullptr != db) {
@@ -236,8 +236,7 @@ void RunPrimary() {
       delete h;
     }
     handles.clear();
-    delete db;
-    db = nullptr;
+    db.reset();
   }
   fprintf(stdout, "[process %ld] Finished adding keys\n", my_pid);
 }
@@ -262,7 +261,7 @@ void RunSecondary() {
       exit(0);
     }
   }
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = false;
   options.max_open_files = -1;
@@ -344,7 +343,7 @@ void RunSecondary() {
     column_families.push_back(ColumnFamilyDescriptor(cf_name, options));
   }
   std::vector<ColumnFamilyHandle*> handles;
-  DB* verification_db = nullptr;
+  std::unique_ptr<DB> verification_db;
   s = DB::OpenForReadOnly(options, kDBPath, column_families, &handles,
                           &verification_db);
   assert(s.ok());
@@ -369,8 +368,8 @@ void RunSecondary() {
   }
   delete iter;
   delete iter1;
-  delete db;
-  delete verification_db;
+  db.reset();
+  verification_db.reset();
 }
 
 int main(int argc, char** argv) {
diff --git a/examples/options_file_example.cc b/examples/options_file_example.cc
index 00632f391ae9..09be3185ca88 100644
--- a/examples/options_file_example.cc
+++ b/examples/options_file_example.cc
@@ -7,6 +7,7 @@
 // rocksdb/utilities/options_util.h to open a rocksdb database without
 // remembering all the rocksdb options.
 #include <cstdio>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -74,7 +75,7 @@ int main() {
   cf_descs[1].options.table_factory.reset(NewBlockBasedTableFactory(bbt_opts));
 
   // destroy and open DB
-  DB* db;
+  std::unique_ptr<DB> db;
   Status s = ROCKSDB_NAMESPACE::DestroyDB(kDBPath,
                                           Options(db_opt, cf_descs[0].options));
   assert(s.ok());
@@ -88,7 +89,7 @@ int main() {
 
   // close DB
   delete cf;
-  delete db;
+  db.reset();
 
   // In the following code, we will reopen the rocksdb instance using
   // the options file stored in the db directory.
@@ -128,5 +129,5 @@ int main() {
   for (auto* handle : handles) {
     delete handle;
   }
-  delete db;
+  db.reset();
 }
diff --git a/examples/rocksdb_backup_restore_example.cc b/examples/rocksdb_backup_restore_example.cc
index c833ed1c2a8f..e5ad703eed8d 100644
--- a/examples/rocksdb_backup_restore_example.cc
+++ b/examples/rocksdb_backup_restore_example.cc
@@ -4,6 +4,7 @@
 //  (found in the LICENSE.Apache file in the root directory).
 
 #include <cstdio>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -29,7 +30,7 @@ std::string kDBPath = "/tmp/rocksdb_example";
 #endif
 
 int main() {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   // Optimize RocksDB. This is the easiest way to get RocksDB to perform well
   options.IncreaseParallelism();
@@ -52,7 +53,7 @@ int main() {
                          &backup_engine);
   assert(s.ok());
 
-  backup_engine->CreateNewBackup(db);
+  backup_engine->CreateNewBackup(db.get());
   assert(s.ok());
 
   std::vector<BackupInfo> backup_info;
@@ -65,9 +66,7 @@ int main() {
   db->Put(WriteOptions(), "key2", "value2");
   assert(s.ok());
 
-  db->Close();
-  delete db;
-  db = nullptr;
+  db.reset();
 
   // restore db to backup 1
   BackupEngineReadOnly* backup_engine_ro;
@@ -93,7 +92,7 @@ int main() {
 
   delete backup_engine;
   delete backup_engine_ro;
-  delete db;
+  db.reset();
 
   return 0;
 }
diff --git a/examples/simple_example.cc b/examples/simple_example.cc
index 2d49c4d14da2..85a87da77cea 100644
--- a/examples/simple_example.cc
+++ b/examples/simple_example.cc
@@ -4,6 +4,7 @@
 //  (found in the LICENSE.Apache file in the root directory).
 
 #include <cstdio>
+#include <memory>
 #include <string>
 
 #include "rocksdb/db.h"
@@ -25,7 +26,7 @@ std::string kDBPath = "/tmp/rocksdb_simple_example";
 #endif
 
 int main() {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   // Optimize RocksDB. This is the easiest way to get RocksDB to perform well
   options.IncreaseParallelism();
@@ -87,7 +88,7 @@ int main() {
   pinnable_val.Reset();
   // The Slice pointed by pinnable_val is not valid after this point
 
-  delete db;
+  db.reset();
 
   return 0;
 }
diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc
index b06409a5dcbb..79bb63c5b3d9 100644
--- a/file/delete_scheduler.cc
+++ b/file/delete_scheduler.cc
@@ -130,6 +130,7 @@ Status DeleteScheduler::AddFileToDeletionQueue(const std::string& file_path,
                  s.ToString().c_str());
 
   if (!s.ok()) {
+    IGNORE_STATUS_IF_ERROR(s);
     ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash -- %s",
                     file_path.c_str(), s.ToString().c_str());
     s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
@@ -151,6 +152,7 @@ Status DeleteScheduler::AddFileToDeletionQueue(const std::string& file_path,
     if (io_s.ok()) {
       total_trash_size_.fetch_add(trash_file_size);
     }
+    IGNORE_STATUS_IF_ERROR(s);
   }
   //**TODO: What should we do if we failed to
   // get the file size?
diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc
index 7683db861732..ab78fccf72b4 100644
--- a/file/file_prefetch_buffer.cc
+++ b/file/file_prefetch_buffer.cc
@@ -126,6 +126,8 @@ Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts,
 
   if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
     RecordTick(stats_, PREFETCH_BYTES, read_len);
+  } else if (usage_ == FilePrefetchBufferUsage::kCompactionPrefetch) {
+    RecordInHistogram(stats_, COMPACTION_PREFETCH_BYTES, read_len);
   }
   if (!use_fs_buffer) {
     // Update the buffer size.
@@ -154,8 +156,22 @@ Status FilePrefetchBuffer::ReadAsync(BufferInfo* buf, const IOOptions& opts,
                                &(buf->del_fn_), /*aligned_buf =*/nullptr);
   req.status.PermitUncheckedError();
   if (s.ok()) {
-    RecordTick(stats_, PREFETCH_BYTES, read_len);
+    if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
+      RecordTick(stats_, PREFETCH_BYTES, read_len);
+    }
     buf->async_read_in_progress_ = true;
+  } else if (s.IsNotSupported()) {
+    // Async IO is not available (e.g., io_uring failed to initialize).
+    // Fall back to synchronous read so the buffer is populated inline
+    // and callers proceed transparently.
+    s = reader->Read(opts, start_offset, read_len, &result,
+                     buf->buffer_.BufferStart(), /*aligned_buf=*/nullptr);
+    if (s.ok()) {
+      buf->buffer_.Size(buf->CurrentSize() + result.size());
+      if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
+        RecordTick(stats_, PREFETCH_BYTES, read_len);
+      }
+    }
   }
   return s;
 }
@@ -347,7 +363,7 @@ void FilePrefetchBuffer::ClearOutdatedData(uint64_t offset, size_t length) {
   assert(IsBufferQueueEmpty() || buf->IsOffsetInBuffer(offset));
 }
 
-void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
+Status FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
   BufferInfo* buf = GetFirstBuffer();
 
   if (buf->async_read_in_progress_ && fs_ != nullptr) {
@@ -358,7 +374,16 @@ void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
       std::vector<void*> handles;
       handles.emplace_back(buf->io_handle_);
       StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
-      fs_->Poll(handles, 1).PermitUncheckedError();
+      IOStatus io_s = fs_->Poll(handles, 1);
+      // Allow tests to inject Poll errors
+      TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::PollIfNeeded:IOStatus",
+                               &io_s);
+      if (!io_s.ok()) {
+        // On Poll failure, clean up the handle and abort.
+        // DestroyAndClearIOHandle also sets async_read_in_progress_ to false.
+        DestroyAndClearIOHandle(buf);
+        return io_s;
+      }
     }
 
     // Reset and Release io_handle after the Poll API as request has been
@@ -369,6 +394,7 @@ void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
   // Always call outdated data after Poll as Buffers might be out of sync w.r.t
   // offset and length.
   ClearOutdatedData(offset, length);
+  return Status::OK();
 }
 
 // ReadAheadSizeTuning API calls readaheadsize_cb_
@@ -507,7 +533,10 @@ Status FilePrefetchBuffer::HandleOverlappingAsyncData(
   // by Seek, but the next access is at another offset.
   if (buf->async_read_in_progress_ &&
       buf->IsOffsetInBufferWithAsyncProgress(offset)) {
-    PollIfNeeded(offset, length);
+    Status poll_status = PollIfNeeded(offset, length);
+    if (!poll_status.ok()) {
+      return poll_status;
+    }
   }
 
   if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) {
@@ -642,7 +671,10 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
       return s;
     }
   } else {
-    PollIfNeeded(tmp_offset, tmp_length);
+    Status poll_status = PollIfNeeded(tmp_offset, tmp_length);
+    if (!poll_status.ok()) {
+      return poll_status;
+    }
   }
 
   AllocateBufferIfEmpty();
diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h
index b8b6812bc83d..5ebf1f051df9 100644
--- a/file/file_prefetch_buffer.h
+++ b/file/file_prefetch_buffer.h
@@ -93,8 +93,8 @@ struct BufferInfo {
   //
   // For example - if end offset of previous buffer was 100 and because of
   // readahead_size optimization, end_offset was trimmed to 60. Then for next
-  // prefetch call, start_offset should be intialized to 100 i.e  start_offset =
-  // buf->initial_end_offset_.
+  // prefetch call, start_offset should be initialized to 100 i.e  start_offset
+  // = buf->initial_end_offset_.
   uint64_t initial_end_offset_ = 0;
 
   bool IsDataBlockInBuffer(uint64_t offset, size_t length) {
@@ -134,6 +134,7 @@ struct BufferInfo {
 enum class FilePrefetchBufferUsage {
   kTableOpenPrefetchTail,
   kUserScanPrefetch,
+  kCompactionPrefetch,
   kUnknown,
 };
 
@@ -154,7 +155,7 @@ enum class FilePrefetchBufferUsage {
 // When reusing the file system allocated buffer, overlap_buf_ is used if the
 // main buffer only contains part of the requested data. It is returned to
 // the caller after the remaining data is fetched.
-// If num_buffers_ > 1, then the data is prefetched asynchronosuly in the
+// If num_buffers_ > 1, then the data is prefetched asynchronously in the
 // buffers whenever the data is consumed from the buffers and that buffer is
 // freed.
 // If num_buffers > 1, then requested data can be overlapping between 2 buffers.
@@ -430,7 +431,7 @@ class FilePrefetchBuffer {
   void ClearOutdatedData(uint64_t offset, size_t len);
 
   // It calls Poll API to check for any pending asynchronous request.
-  void PollIfNeeded(uint64_t offset, size_t len);
+  Status PollIfNeeded(uint64_t offset, size_t len);
 
   Status PrefetchInternal(const IOOptions& opts, RandomAccessFileReader* reader,
                           uint64_t offset, size_t length, size_t readahead_size,
@@ -574,6 +575,9 @@ class FilePrefetchBuffer {
                            size_t& read_len, uint64_t& aligned_useful_len);
 
   void UpdateStats(bool found_in_buffer, size_t length_found) {
+    if (usage_ != FilePrefetchBufferUsage::kUserScanPrefetch) {
+      return;
+    }
     if (found_in_buffer) {
       RecordTick(stats_, PREFETCH_HITS);
     }
diff --git a/file/file_util.cc b/file/file_util.cc
index 105e88690226..c44d799b8ce4 100644
--- a/file/file_util.cc
+++ b/file/file_util.cc
@@ -22,7 +22,10 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
                   Temperature src_temp_hint,
                   std::unique_ptr<WritableFileWriter>& dest_writer,
                   uint64_t size, bool use_fsync,
-                  const std::shared_ptr<IOTracer>& io_tracer) {
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  uint64_t max_read_buffer_size,
+                  const std::optional<IOOptions>& readIOOptions,
+                  const std::optional<IOOptions>& writeIOOptions) {
   FileOptions soptions;
   IOStatus io_s;
   std::unique_ptr<SequentialFileReader> src_reader;
@@ -38,7 +41,8 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
 
     if (size == 0) {
       // default argument means copy everything
-      io_s = fs->GetFileSize(source, opts, &size, nullptr);
+      io_s =
+          fs->GetFileSize(source, readIOOptions.value_or(opts), &size, nullptr);
       if (!io_s.ok()) {
         return io_s;
       }
@@ -47,14 +51,23 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
         new SequentialFileReader(std::move(srcfile), source, io_tracer));
   }
 
-  char buffer[4096];
+  const size_t read_buffer_size = std::max(
+      static_cast<size_t>(4096), static_cast<size_t>(max_read_buffer_size));
+  std::unique_ptr<char[]> buffer;
+  buffer.reset(new char[read_buffer_size]);
+
+  Env::IOPriority read_rate_limiter_priority = Env::IO_TOTAL;
+  if (readIOOptions.has_value()) {
+    read_rate_limiter_priority = readIOOptions.value().rate_limiter_priority;
+  }
   Slice slice;
   while (size > 0) {
-    size_t bytes_to_read = std::min(sizeof(buffer), static_cast<size_t>(size));
+    size_t bytes_to_read = std::min(static_cast<size_t>(read_buffer_size),
+                                    static_cast<size_t>(size));
     // TODO: rate limit copy file
-    io_s = status_to_io_status(
-        src_reader->Read(bytes_to_read, &slice, buffer,
-                         Env::IO_TOTAL /* rate_limiter_priority */));
+    io_s = status_to_io_status(src_reader->Read(
+        bytes_to_read, &slice, buffer.get(),
+        read_rate_limiter_priority /* rate_limiter_priority */));
     if (!io_s.ok()) {
       return io_s;
     }
@@ -65,19 +78,22 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
           std::to_string(dest_writer->GetFileSize()));
     }
 
-    io_s = dest_writer->Append(opts, slice);
+    io_s = dest_writer->Append(writeIOOptions.value_or(opts), slice);
     if (!io_s.ok()) {
       return io_s;
     }
     size -= slice.size();
   }
-  return dest_writer->Sync(opts, use_fsync);
+  return dest_writer->Sync(writeIOOptions.value_or(opts), use_fsync);
 }
 
 IOStatus CopyFile(FileSystem* fs, const std::string& source,
                   Temperature src_temp_hint, const std::string& destination,
                   Temperature dst_temp, uint64_t size, bool use_fsync,
-                  const std::shared_ptr<IOTracer>& io_tracer) {
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  uint64_t max_read_buffer_size,
+                  const std::optional<IOOptions>& readIOOptions,
+                  const std::optional<IOOptions>& writeIOOptions) {
   FileOptions options;
   IOStatus io_s;
   std::unique_ptr<WritableFileWriter> dest_writer;
@@ -96,7 +112,8 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
   }
 
   return CopyFile(fs, source, src_temp_hint, dest_writer, size, use_fsync,
-                  io_tracer);
+                  io_tracer, max_read_buffer_size, readIOOptions,
+                  writeIOOptions);
 }
 
 // Utility function to create a file with the provided contents
@@ -161,7 +178,8 @@ IOStatus GenerateOneFileChecksum(
     std::string* file_checksum_func_name,
     size_t verify_checksums_readahead_size, bool /*allow_mmap_reads*/,
     std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
-    const ReadOptions& read_options, Statistics* stats, SystemClock* clock) {
+    const ReadOptions& read_options, Statistics* stats, SystemClock* clock,
+    const FileOptions& file_options) {
   if (checksum_factory == nullptr) {
     return IOStatus::InvalidArgument("Checksum factory is invalid");
   }
@@ -201,7 +219,12 @@ IOStatus GenerateOneFileChecksum(
   std::unique_ptr<RandomAccessFileReader> reader;
   {
     std::unique_ptr<FSRandomAccessFile> r_file;
-    io_s = fs->NewRandomAccessFile(file_path, FileOptions(), &r_file, nullptr);
+    FileOptions fopts = file_options;
+    if (fopts.file_checksum.empty()) {
+      // No expected checksum is known — this is a from-scratch computation.
+      fopts.file_checksum_func_name = kNoFileChecksumFuncName;
+    }
+    io_s = fs->NewRandomAccessFile(file_path, fopts, &r_file, nullptr);
     if (!io_s.ok()) {
       return io_s;
     }
@@ -230,15 +253,16 @@ IOStatus GenerateOneFileChecksum(
   Slice slice;
   uint64_t offset = 0;
   IOOptions opts;
-  io_s = reader->PrepareIOOptions(read_options, opts);
+  IODebugContext dbg;
+  io_s = reader->PrepareIOOptions(read_options, opts, &dbg);
   if (!io_s.ok()) {
     return io_s;
   }
   while (size > 0) {
     size_t bytes_to_read =
         static_cast<size_t>(std::min(uint64_t{readahead_size}, size));
-    io_s =
-        reader->Read(opts, offset, bytes_to_read, &slice, buf.get(), nullptr);
+    io_s = reader->Read(opts, offset, bytes_to_read, &slice, buf.get(), nullptr,
+                        &dbg);
     if (!io_s.ok()) {
       return IOStatus::Corruption("file read failed with error: " +
                                   io_s.ToString());
diff --git a/file/file_util.h b/file/file_util.h
index 8a72fea27ad3..f460a30caa9b 100644
--- a/file/file_util.h
+++ b/file/file_util.h
@@ -24,18 +24,28 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
                   Temperature src_temp_hint,
                   std::unique_ptr<WritableFileWriter>& dest_writer,
                   uint64_t size, bool use_fsync,
-                  const std::shared_ptr<IOTracer>& io_tracer);
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  uint64_t max_read_buffer_size = 4096,
+                  const std::optional<IOOptions>& readIOOptions = {},
+                  const std::optional<IOOptions>& writeIOOptions = {});
 IOStatus CopyFile(FileSystem* fs, const std::string& source,
                   Temperature src_temp_hint, const std::string& destination,
                   Temperature dst_temp, uint64_t size, bool use_fsync,
-                  const std::shared_ptr<IOTracer>& io_tracer);
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  uint64_t max_read_buffer_size = 4096,
+                  const std::optional<IOOptions>& readIOOptions = {},
+                  const std::optional<IOOptions>& writeIOOptions = {});
 inline IOStatus CopyFile(const std::shared_ptr<FileSystem>& fs,
                          const std::string& source, Temperature src_temp_hint,
                          const std::string& destination, Temperature dst_temp,
                          uint64_t size, bool use_fsync,
-                         const std::shared_ptr<IOTracer>& io_tracer) {
+                         const std::shared_ptr<IOTracer>& io_tracer,
+                         uint64_t max_read_buffer_size = 4096,
+                         const std::optional<IOOptions>& readIOOptions = {},
+                         const std::optional<IOOptions>& writeIOOptions = {}) {
   return CopyFile(fs.get(), source, src_temp_hint, destination, dst_temp, size,
-                  use_fsync, io_tracer);
+                  use_fsync, io_tracer, max_read_buffer_size, readIOOptions,
+                  writeIOOptions);
 }
 IOStatus CreateFile(FileSystem* fs, const std::string& destination,
                     const std::string& contents, bool use_fsync);
@@ -73,10 +83,18 @@ IOStatus GenerateOneFileChecksum(
     std::string* file_checksum_func_name,
     size_t verify_checksums_readahead_size, bool allow_mmap_reads,
     std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
-    const ReadOptions& read_options, Statistics* stats, SystemClock* clock);
+    const ReadOptions& read_options, Statistics* stats, SystemClock* clock,
+    const FileOptions& file_options);
 
 inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro,
-                                         SystemClock* clock, IOOptions& opts) {
+                                         SystemClock* clock, IOOptions& opts,
+                                         IODebugContext* dbg = nullptr) {
+  if (ro.request_id != nullptr) {
+    if (dbg != nullptr && dbg->request_id == nullptr) {
+      dbg->SetRequestId(ro.request_id);
+    }
+  }
+
   if (ro.deadline.count()) {
     std::chrono::microseconds now =
         std::chrono::microseconds(clock->NowMicros());
diff --git a/file/filename.cc b/file/filename.cc
index 45cbf9d76a98..d1d9c815a440 100644
--- a/file/filename.cc
+++ b/file/filename.cc
@@ -25,6 +25,7 @@ namespace ROCKSDB_NAMESPACE {
 
 const std::string kCurrentFileName = "CURRENT";
 const std::string kOptionsFileNamePrefix = "OPTIONS-";
+const std::string kCompactionProgressFileNamePrefix = "COMPACTION_PROGRESS-";
 const std::string kTempFileNameSuffix = "dbtmp";
 
 static const std::string kRocksDbTFileExt = "sst";
@@ -242,6 +243,25 @@ std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) {
   return dbname + "/" + buffer;
 }
 
+std::string CompactionProgressFileName(const std::string& dbname,
+                                       uint64_t timestamp) {
+  char buffer[256];
+  snprintf(buffer, sizeof(buffer), "%s%llu",
+           kCompactionProgressFileNamePrefix.c_str(),
+           static_cast<unsigned long long>(timestamp));
+  return dbname + "/" + buffer;
+}
+
+std::string TempCompactionProgressFileName(const std::string& dbname,
+                                           uint64_t timestamp) {
+  char buffer[256];
+  snprintf(buffer, sizeof(buffer), "%s%llu.%s",
+           kCompactionProgressFileNamePrefix.c_str(),
+           static_cast<unsigned long long>(timestamp),
+           kTempFileNameSuffix.c_str());
+  return dbname + "/" + buffer;
+}
+
 std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
   char buf[100];
   snprintf(buf, sizeof(buf), "/METADB-%llu",
@@ -264,6 +284,8 @@ std::string IdentityFileName(const std::string& dbname) {
 //    dbname/METADB-[0-9]+
 //    dbname/OPTIONS-[0-9]+
 //    dbname/OPTIONS-[0-9]+.dbtmp
+//    dbname/COMPACTION_PROGRESS-[timestamp]
+//    dbname/COMPACTION_PROGRESS-[timestamp].dbtmp
 //    Disregards / at the beginning
 bool ParseFileName(const std::string& fname, uint64_t* number, FileType* type,
                    WalFileType* log_type) {
@@ -339,6 +361,24 @@ bool ParseFileName(const std::string& fname, uint64_t* number,
     }
     *number = ts_suffix;
     *type = is_temp_file ? kTempFile : kOptionsFile;
+  } else if (rest.starts_with(kCompactionProgressFileNamePrefix)) {
+    uint64_t timestamp;
+    bool is_temp_file = false;
+    rest.remove_prefix(kCompactionProgressFileNamePrefix.size());
+    const std::string kTempFileNameSuffixWithDot =
+        std::string(".") + kTempFileNameSuffix;
+    if (rest.ends_with(kTempFileNameSuffixWithDot)) {
+      rest.remove_suffix(kTempFileNameSuffixWithDot.size());
+      is_temp_file = true;
+    }
+    if (!ConsumeDecimalNumber(&rest, &timestamp)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *number = timestamp;
+    *type = is_temp_file ? kTempFile : kCompactionProgressFile;
   } else {
     // Avoid strtoull() to keep filename format independent of the
     // current locale
diff --git a/file/filename.h b/file/filename.h
index 5a52c745ac6d..399a20f23cfa 100644
--- a/file/filename.h
+++ b/file/filename.h
@@ -124,7 +124,10 @@ std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
                                const std::string& log_dir = "");
 
 extern const std::string kOptionsFileNamePrefix;  // = "OPTIONS-"
-extern const std::string kTempFileNameSuffix;     // = "dbtmp"
+extern const std::string
+    kCompactionProgressFileNamePrefix;         // =
+                                               // "COMPACTION_PROGRESS-"
+extern const std::string kTempFileNameSuffix;  // = "dbtmp"
 
 // Return a options file name given the "dbname" and file number.
 // Format:  OPTIONS-[number].dbtmp
@@ -135,6 +138,16 @@ std::string OptionsFileName(uint64_t file_num);
 // Format:  OPTIONS-[number]
 std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num);
 
+// Return a compaction progress file name given the timestamp.
+// Format:  COMPACTION_PROGRESS-[timestamp]
+std::string CompactionProgressFileName(const std::string& dbname,
+                                       uint64_t timestamp);
+
+// Return a temp compaction progress file name given the timestamp.
+// Format:  COMPACTION_PROGRESS-[timestamp].dbtmp
+std::string TempCompactionProgressFileName(const std::string& dbname,
+                                           uint64_t timestamp);
+
 // Return the name to use for a metadatabase. The result will be prefixed with
 // "dbname".
 std::string MetaDatabaseName(const std::string& dbname, uint64_t number);
diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc
index 2c0919ed9522..57559b5e8466 100644
--- a/file/prefetch_test.cc
+++ b/file/prefetch_test.cc
@@ -299,9 +299,18 @@ TEST_P(PrefetchTest, Basic) {
   const uint64_t prev_table_open_prefetch_tail_hit =
       options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT);
 
+  HistogramData pre_compaction_prefetch_bytes;
+  options.statistics->histogramData(COMPACTION_PREFETCH_BYTES,
+                                    &pre_compaction_prefetch_bytes);
+  ASSERT_EQ(pre_compaction_prefetch_bytes.count, 0);
+
   // commenting out the line below causes the example to work correctly
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
 
+  HistogramData post_compaction_prefetch_bytes;
+  options.statistics->histogramData(COMPACTION_PREFETCH_BYTES,
+                                    &post_compaction_prefetch_bytes);
+
   HistogramData cur_table_open_prefetch_tail_read;
   options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
                                     &cur_table_open_prefetch_tail_read);
@@ -318,6 +327,7 @@ TEST_P(PrefetchTest, Basic) {
     ASSERT_GT(fs->GetPrefetchCount(), 1);
     ASSERT_EQ(0, buff_prefetch_count);
     fs->ClearPrefetchCount();
+    ASSERT_EQ(post_compaction_prefetch_bytes.count, 0);
   } else {
     ASSERT_FALSE(fs->IsPrefetchCalled());
     // To rule out false positive by the SST file tail prefetch during
@@ -331,6 +341,20 @@ TEST_P(PrefetchTest, Basic) {
               prev_table_open_prefetch_tail_hit);
     ASSERT_GE(cur_table_open_prefetch_tail_miss,
               prev_table_open_prefetch_tail_miss);
+
+    ASSERT_GT(post_compaction_prefetch_bytes.count, 0);
+
+    // Not an exact match due to potential roundup/down for alignment
+    auto expected_compaction_readahead_size =
+        Options().compaction_readahead_size;
+    ASSERT_LE(post_compaction_prefetch_bytes.max,
+              expected_compaction_readahead_size * 1.1);
+    ASSERT_GE(post_compaction_prefetch_bytes.max,
+              expected_compaction_readahead_size * 0.9);
+    ASSERT_LE(post_compaction_prefetch_bytes.average,
+              expected_compaction_readahead_size * 1.1);
+    ASSERT_GE(post_compaction_prefetch_bytes.average,
+              expected_compaction_readahead_size * 0.9);
   }
 
   for (bool disable_io : {false, true}) {
@@ -645,7 +669,7 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
     MoveFilesToLevel(level);
   }
   Close();
-  std::vector<int> buff_prefectch_level_count = {0, 0, 0};
+  std::vector<int> buff_prefetch_level_count = {0, 0, 0};
   ASSERT_OK(TryReopen(options));
   {
     auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
@@ -683,7 +707,7 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
         iter->Next();
       }
 
-      buff_prefectch_level_count[level] = buff_prefetch_count;
+      buff_prefetch_level_count[level] = buff_prefetch_count;
       if (support_prefetch && !use_direct_io) {
         if (level == 0) {
           ASSERT_FALSE(fs->IsPrefetchCalled());
@@ -704,7 +728,7 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
   }
 
   if (!support_prefetch) {
-    ASSERT_GT(buff_prefectch_level_count[1], buff_prefectch_level_count[2]);
+    ASSERT_GT(buff_prefetch_level_count[1], buff_prefetch_level_count[2]);
   }
 
   SyncPoint::GetInstance()->DisableProcessing();
@@ -790,7 +814,7 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) {
                                       "{initial_auto_readahead_size=0;}"}}));
           break;
         case 1:
-          // intial_auto_readahead_size and max_auto_readahead_size are set
+          // initial_auto_readahead_size and max_auto_readahead_size are set
           // same so readahead_size remains same.
           ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
                                       "{initial_auto_readahead_size=4096;max_"
@@ -1057,7 +1081,7 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
   }
   {
     /*
-     * Reesek keys from Single Data Block.
+     * Reseek keys from Single Data Block.
      */
     auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
     iter->Seek(BuildKey(0));
@@ -1092,9 +1116,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
     ASSERT_TRUE(iter->Valid());
     iter->Seek(BuildKey(1008));
     ASSERT_TRUE(iter->Valid());
-    iter->Seek(
-        BuildKey(996));  // Reseek won't prefetch any data and
-                         // readahead_size will be initiallized to 8*1024.
+    iter->Seek(BuildKey(996));  // Reseek won't prefetch any data and
+                                // readahead_size will be initialized to 8*1024.
     ASSERT_TRUE(iter->Valid());
     iter->Seek(BuildKey(992));
     ASSERT_TRUE(iter->Valid());
@@ -1566,7 +1589,7 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Combine(
         // Params are as follows -
         // Param 0 - TableOptions::index_shortening
-        // Param 2 - ReadOptinos::auto_readahead_size
+        // Param 2 - ReadOptions::auto_readahead_size
         ::testing::Values(
             BlockBasedTableOptions::IndexShorteningMode::kNoShortening,
             BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators,
@@ -2494,6 +2517,187 @@ TEST_P(PrefetchTest1, SeekParallelizationTest) {
   Close();
 }
 
+TEST_P(PrefetchTest1, PollErrorRecoveryDuringIteration) {
+  // This end-to-end test verifies that Poll() errors during async prefetching
+  // are properly propagated to the iterator. When Poll() fails, the iterator
+  // should stop and return an IOError status.
+  //
+  // With error injection on the 3rd Poll call, the iterator reads ~231 keys
+  // (out of 500) before encountering the error.
+
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+
+  const int kNumKeys = 500;
+  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+      FileSystem::Default(), /*support_prefetch=*/false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  bool use_direct_io = GetParam();
+  Options options;
+  SetGenericOptions(env.get(), use_direct_io, options);
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  SetBlockBasedTableOptions(table_options);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    ROCKSDB_GTEST_SKIP("Direct IO not supported");
+    return;
+  }
+  ASSERT_OK(s);
+
+  // Write keys with known values so we can verify correctness
+  std::map<std::string, std::string> expected_data;
+  {
+    WriteBatch batch;
+    for (int i = 0; i < kNumKeys; i++) {
+      std::string key = BuildKey(i);
+      std::string value = "value_" + std::to_string(i) + "_" +
+                          std::string(100, 'x');  // Make values ~110 bytes
+      ASSERT_OK(batch.Put(key, value));
+      expected_data[key] = value;
+    }
+    ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    ASSERT_OK(Flush());
+  }
+
+  std::string start_key = BuildKey(0);
+  std::string end_key = BuildKey(kNumKeys - 1);
+  Slice least(start_key.data(), start_key.size());
+  Slice greatest(end_key.data(), end_key.size());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+  // Set up callbacks to track async IO and inject Poll errors
+  std::atomic<int> poll_call_count{0};
+  std::atomic<int> poll_error_injected_count{0};
+  bool read_async_called = false;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PollIfNeeded:IOStatus", [&](void* arg) {
+        poll_call_count++;
+        int current_count = poll_call_count.load();
+
+        // Inject error on the third Poll call to allow some keys to be read
+        // first
+        if (current_count == 3) {
+          IOStatus* io_s = static_cast<IOStatus*>(arg);
+          *io_s = IOStatus::IOError("Injected Poll error for e2e testing");
+          poll_error_injected_count++;
+          std::cout << "PollErrorRecoveryDuringIteration: Injected error on "
+                       "Poll call #"
+                    << current_count << std::endl;
+        }
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "UpdateResults::io_uring_result",
+      [&](void* /*arg*/) { read_async_called = true; });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Iterate through all keys with async IO enabled
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.adaptive_readahead = true;
+
+  int keys_read = 0;
+  int data_mismatches = 0;
+  Status iter_status;
+  {
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      std::string key = iter->key().ToString();
+      std::string value = iter->value().ToString();
+
+      auto it = expected_data.find(key);
+      if (it == expected_data.end()) {
+        std::cout << "PollErrorRecoveryDuringIteration: Unexpected key: " << key
+                  << std::endl;
+        data_mismatches++;
+      } else if (it->second != value) {
+        std::cout << "PollErrorRecoveryDuringIteration: Value mismatch for key "
+                  << key << std::endl;
+        data_mismatches++;
+      }
+      keys_read++;
+    }
+    iter_status = iter->status();
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Log results
+  std::cout << "PollErrorRecoveryDuringIteration: " << "read_async_called="
+            << read_async_called << ", poll_calls=" << poll_call_count.load()
+            << ", poll_errors_injected=" << poll_error_injected_count.load()
+            << ", keys_read=" << keys_read << ", expected_keys=" << kNumKeys
+            << ", data_mismatches=" << data_mismatches
+            << ", iter_status=" << iter_status.ToString() << std::endl;
+
+  // Verify no data mismatches occurred for keys that were read
+  ASSERT_EQ(data_mismatches, 0)
+      << "Found " << data_mismatches << " data mismatches";
+
+  if (read_async_called) {
+    // Async IO was used - verify Poll error was injected and propagated
+    ASSERT_EQ(poll_call_count.load(), 3)
+        << "Expected exactly 3 Poll calls when error injected on 3rd call";
+    ASSERT_EQ(poll_error_injected_count.load(), 1)
+        << "Expected exactly 1 Poll error to be injected";
+
+    // The iterator should have stopped with an error status
+    ASSERT_TRUE(iter_status.IsIOError())
+        << "Expected iterator to report IOError after Poll failure, got: "
+        << iter_status.ToString();
+
+    std::cout << "PollErrorRecoveryDuringIteration: Successfully verified "
+                 "Poll error was injected and propagated to iterator"
+              << std::endl;
+  } else {
+    // Async IO not supported - iterator should complete successfully
+    ASSERT_OK(iter_status);
+    ASSERT_EQ(keys_read, kNumKeys);
+    std::cout << "PollErrorRecoveryDuringIteration: Async IO (io_uring) not "
+                 "supported on this platform, verified data correctness"
+              << std::endl;
+  }
+
+  // Retry iteration without error injection - verify all data is still readable
+  // This confirms the Poll error didn't corrupt state
+  {
+    int retry_keys_read = 0;
+    int retry_data_mismatches = 0;
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      std::string key = iter->key().ToString();
+      std::string value = iter->value().ToString();
+
+      auto it = expected_data.find(key);
+      if (it == expected_data.end()) {
+        retry_data_mismatches++;
+      } else if (it->second != value) {
+        retry_data_mismatches++;
+      }
+      retry_keys_read++;
+    }
+    ASSERT_OK(iter->status())
+        << "Retry iteration failed: " << iter->status().ToString();
+    ASSERT_EQ(retry_keys_read, kNumKeys)
+        << "Retry should read all " << kNumKeys << " keys";
+    ASSERT_EQ(retry_data_mismatches, 0)
+        << "Retry found " << retry_data_mismatches << " data mismatches";
+    std::cout << "PollErrorRecoveryDuringIteration: Retry succeeded, read all "
+              << retry_keys_read << " keys correctly" << std::endl;
+  }
+
+  Close();
+}
+
 namespace {
 #ifdef GFLAGS
 const int kMaxArgCount = 100;
@@ -3251,8 +3455,9 @@ TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) {
   ReadaheadParams readahead_params;
   readahead_params.initial_readahead_size = 8192;
   readahead_params.max_readahead_size = 8192;
-  FilePrefetchBuffer fpb(readahead_params, true, false, fs(), nullptr,
-                         stats.get());
+  FilePrefetchBuffer fpb(
+      readahead_params, true, false, fs(), nullptr, stats.get(),
+      nullptr /* cb */, FilePrefetchBufferUsage::kUserScanPrefetch /* usage */);
   Slice result;
   // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings,
   // it will do a read of offset 0 and length - (4096 + 8192) 12288.
@@ -3278,7 +3483,7 @@ TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) {
   ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_HITS), 1);
   ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_BYTES_USEFUL), 8192);
 
-  // Now read some data with length doesn't align with aligment and it needs
+  // Now read some data with length doesn't align with alignment and it needs
   // prefetching. Read from 16000 with length 10000 (i.e. requested end offset -
   // 26000).
   ASSERT_TRUE(
@@ -3352,6 +3557,118 @@ TEST_F(FilePrefetchBufferTest, ForCompaction) {
       0);
 }
 
+TEST_F(FilePrefetchBufferTest, PollErrorPropagation) {
+  // This test verifies that Poll() errors in PollIfNeeded are properly
+  // propagated rather than being silently ignored.
+
+  std::string fname = "poll-error-test";
+  Random rand(0);
+  std::string content = rand.RandomString(32768);
+  Write(fname, content);
+
+  FileOptions opts;
+  std::unique_ptr<RandomAccessFileReader> r;
+  Read(fname, opts, &r);
+
+  // Set up readahead params for async prefetching
+  ReadaheadParams readahead_params;
+  readahead_params.initial_readahead_size = 16384;
+  readahead_params.max_readahead_size = 16384;
+
+  FilePrefetchBuffer fpb(readahead_params, /*enable=*/true,
+                         /*track_min_offset=*/false, fs());
+
+  Slice result;
+  // Start an async prefetch to set up async_read_in_progress_ state
+  Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 0, 4096, &result);
+
+  // Skip test on platforms that don't support async IO.
+  if (s.IsNotSupported()) {
+    ROCKSDB_GTEST_SKIP("Async IO not supported on this platform");
+    return;
+  }
+  ASSERT_TRUE(s.IsTryAgain());
+
+  // With the ReadAsync sync fallback, PrefetchAsync returns TryAgain even when
+  // async IO is unavailable (data is read synchronously, but data_found was
+  // false at entry). Detect by checking async_read_in_progress_ on the buffer.
+  {
+    std::vector<std::tuple<uint64_t, size_t, bool>> buf_info(1);
+    fpb.TEST_GetBufferOffsetandSize(buf_info);
+    bool async_read_in_progress = std::get<2>(buf_info[0]);
+    if (!async_read_in_progress) {
+      ROCKSDB_GTEST_SKIP("Async IO not available (sync fallback used)");
+      return;
+    }
+  }
+
+  // Set up SyncPoint to inject Poll error
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PollIfNeeded:IOStatus", [&](void* arg) {
+        IOStatus* io_s = static_cast<IOStatus*>(arg);
+        *io_s = IOStatus::IOError("Injected Poll error for testing");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // TryReadFromCache will call PollIfNeeded to complete the async read
+  IOOptions io_opts;
+  io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW;
+  Status read_status;
+  bool found =
+      fpb.TryReadFromCache(io_opts, r.get(), 0, 4096, &result, &read_status);
+
+  // When PollIfNeeded fails:
+  // 1. PrefetchInternal returns the error status
+  // 2. TryReadFromCacheUntracked sets *status to the error and returns false
+  // Therefore: found should be false, and read_status should contain the error
+  ASSERT_FALSE(found) << "Expected TryReadFromCache to return false on Poll "
+                         "error, but it returned true";
+  ASSERT_TRUE(read_status.IsIOError())
+      << "Expected IOError status, got: " << read_status.ToString();
+  ASSERT_TRUE(read_status.ToString().find("Injected Poll error") !=
+              std::string::npos)
+      << "Expected error message to contain 'Injected Poll error', got: "
+      << read_status.ToString();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(FilePrefetchBufferTest, ReadAsyncSyncFallbackOnNotSupported) {
+  std::string fname = "read-async-sync-fallback";
+  Random rand(0);
+  std::string content = rand.RandomString(32768);
+  Write(fname, content);
+
+  FileOptions opts;
+  std::unique_ptr<RandomAccessFileReader> r;
+  Read(fname, opts, &r);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "RandomAccessFileReader::ReadAsync:InjectStatus", [](void* arg) {
+        *static_cast<IOStatus*>(arg) = IOStatus::NotSupported();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ReadaheadParams readahead_params;
+  readahead_params.initial_readahead_size = 16384;
+  readahead_params.max_readahead_size = 16384;
+  readahead_params.num_buffers = 2;
+
+  FilePrefetchBuffer fpb(readahead_params, /*enable=*/true,
+                         /*track_min_offset=*/false, fs());
+
+  Slice result;
+  Status s;
+  ASSERT_TRUE(fpb.TryReadFromCache(IOOptions(), r.get(), 0, 4096, &result, &s));
+  ASSERT_OK(s);
+  ASSERT_EQ(result.size(), 4096);
+  ASSERT_EQ(memcmp(result.data(), content.data(), 4096), 0);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
 class FSBufferPrefetchTest
     : public testing::Test,
       public ::testing::WithParamInterface<std::tuple<bool, bool>> {
@@ -3497,9 +3814,10 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchStatsInternals) {
   size_t num_buffers = use_async_prefetch ? 2 : 1;
   readahead_params.num_buffers = num_buffers;
 
-  FilePrefetchBuffer fpb(readahead_params, true /* enable */,
-                         false /* track_min_offset */, fs(), clock(),
-                         stats.get());
+  FilePrefetchBuffer fpb(
+      readahead_params, true /* enable */, false /* track_min_offset */, fs(),
+      clock(), stats.get(), nullptr /* cb */,
+      FilePrefetchBufferUsage::kUserScanPrefetch /* usage */);
 
   int overlap_buffer_write_ct = 0;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
@@ -3516,6 +3834,9 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchStatsInternals) {
       fpb.TryReadFromCache(IOOptions(), r.get(), 0 /* offset */, 4096 /* n */,
                            &result, &s, for_compaction);
   // Platforms that don't have IO uring may not support async IO.
+  // With the ReadAsync sync fallback, s will be OK even when async IO is
+  // unavailable — detect by checking if the second buffer has an async read
+  // in progress.
   if (use_async_prefetch && s.IsNotSupported()) {
     return;
   }
@@ -3529,6 +3850,14 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchStatsInternals) {
   fpb.TEST_GetOverlapBufferOffsetandSize(overlap_buffer_info);
   fpb.TEST_GetBufferOffsetandSize(buffer_info);
   if (use_async_prefetch) {
+    bool async_read_in_progress = std::get<2>(buffer_info[1]);
+    if (!async_read_in_progress) {
+      // Async IO was requested but not available (e.g., no io_uring).
+      // ReadAsync fell back to sync read. Skip async-specific assertions.
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+      return;
+    }
     // Cut the readahead of 8192 in half.
     // Overlap buffer is not used
     ASSERT_EQ(overlap_buffer_info.first, 0);
@@ -3721,6 +4050,14 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchUnalignedReads) {
   fpb.TEST_GetOverlapBufferOffsetandSize(overlap_buffer_info);
   fpb.TEST_GetBufferOffsetandSize(buffer_info);
   if (use_async_prefetch) {
+    bool async_read_in_progress = std::get<2>(buffer_info[1]);
+    if (!async_read_in_progress) {
+      // Async IO was requested but not available (e.g., no io_uring).
+      // ReadAsync fell back to sync read. Skip async-specific assertions.
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+      return;
+    }
     // Overlap buffer is not used
     ASSERT_EQ(overlap_buffer_info.first, 0);
     ASSERT_EQ(overlap_buffer_info.second, 0);
diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc
index 46f5d1c26262..ae070ef34626 100644
--- a/file/random_access_file_reader.cc
+++ b/file/random_access_file_reader.cc
@@ -66,29 +66,53 @@ inline void RecordIOStats(Statistics* stats, Temperature file_temperature,
   }
 
   // record for temperature file
-  if (file_temperature != Temperature::kUnknown) {
-    switch (file_temperature) {
-      case Temperature::kHot:
-        IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size);
-        IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1);
-        RecordTick(stats, HOT_FILE_READ_BYTES, size);
-        RecordTick(stats, HOT_FILE_READ_COUNT, 1);
-        break;
-      case Temperature::kWarm:
-        IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size);
-        IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1);
-        RecordTick(stats, WARM_FILE_READ_BYTES, size);
-        RecordTick(stats, WARM_FILE_READ_COUNT, 1);
-        break;
-      case Temperature::kCold:
-        IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size);
-        IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1);
-        RecordTick(stats, COLD_FILE_READ_BYTES, size);
-        RecordTick(stats, COLD_FILE_READ_COUNT, 1);
-        break;
-      default:
-        break;
-    }
+  switch (file_temperature) {
+    case Temperature::kHot:
+      IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size);
+      IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1);
+      RecordTick(stats, HOT_FILE_READ_BYTES, size);
+      RecordTick(stats, HOT_FILE_READ_COUNT, 1);
+      break;
+    case Temperature::kWarm:
+      IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size);
+      IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1);
+      RecordTick(stats, WARM_FILE_READ_BYTES, size);
+      RecordTick(stats, WARM_FILE_READ_COUNT, 1);
+      break;
+    case Temperature::kCool:
+      IOSTATS_ADD(file_io_stats_by_temperature.cool_file_bytes_read, size);
+      IOSTATS_ADD(file_io_stats_by_temperature.cool_file_read_count, 1);
+      RecordTick(stats, COOL_FILE_READ_BYTES, size);
+      RecordTick(stats, COOL_FILE_READ_COUNT, 1);
+      break;
+    case Temperature::kCold:
+      IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size);
+      IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1);
+      RecordTick(stats, COLD_FILE_READ_BYTES, size);
+      RecordTick(stats, COLD_FILE_READ_COUNT, 1);
+      break;
+    case Temperature::kIce:
+      IOSTATS_ADD(file_io_stats_by_temperature.ice_file_bytes_read, size);
+      IOSTATS_ADD(file_io_stats_by_temperature.ice_file_read_count, 1);
+      RecordTick(stats, ICE_FILE_READ_BYTES, size);
+      RecordTick(stats, ICE_FILE_READ_COUNT, 1);
+      break;
+    case Temperature::kUnknown:
+      if (is_last_level) {
+        IOSTATS_ADD(file_io_stats_by_temperature.unknown_last_level_bytes_read,
+                    size);
+        IOSTATS_ADD(file_io_stats_by_temperature.unknown_last_level_read_count,
+                    1);
+      } else {
+        IOSTATS_ADD(
+            file_io_stats_by_temperature.unknown_non_last_level_bytes_read,
+            size);
+        IOSTATS_ADD(
+            file_io_stats_by_temperature.unknown_non_last_level_read_count, 1);
+      }
+      break;
+    default:
+      break;
   }
 }
 
@@ -106,11 +130,14 @@ IOStatus RandomAccessFileReader::Create(
 
 IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset,
                                       size_t n, Slice* result, char* scratch,
-                                      AlignedBuf* aligned_buf) const {
+                                      AlignedBuf* aligned_buf,
+                                      IODebugContext* dbg) const {
   (void)aligned_buf;
   const Env::IOPriority rate_limiter_priority = opts.rate_limiter_priority;
 
   TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr);
+  TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read:IODebugContext",
+                           const_cast<void*>(static_cast<void*>(dbg)));
 
   // To be paranoid: modify scratch a little bit, so in case underlying
   // FileSystem doesn't fill the buffer but return success and `scratch` returns
@@ -175,7 +202,7 @@ IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset,
           // the opts.timeout before calling file_->Read
           assert(!opts.timeout.count() || allowed == read_size);
           io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts,
-                             &tmp, buf.Destination(), nullptr);
+                             &tmp, buf.Destination(), dbg);
         }
         if (ShouldNotifyListeners()) {
           auto finish_ts = FileOperationInfo::FinishNow();
@@ -237,7 +264,7 @@ IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset,
           // the opts.timeout before calling file_->Read
           assert(!opts.timeout.count() || allowed == n);
           io_s = file_->Read(offset + pos, allowed, opts, &tmp_result,
-                             scratch + pos, nullptr);
+                             scratch + pos, dbg);
         }
         if (ShouldNotifyListeners()) {
           auto finish_ts = FileOperationInfo::FinishNow();
@@ -311,7 +338,8 @@ bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) {
 IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts,
                                            FSReadRequest* read_reqs,
                                            size_t num_reqs,
-                                           AlignedBuf* aligned_buf) const {
+                                           AlignedBuf* aligned_buf,
+                                           IODebugContext* dbg) const {
   (void)aligned_buf;  // suppress warning of unused variable in LITE mode
   assert(num_reqs > 0);
 
@@ -420,8 +448,10 @@ IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts,
           remaining_bytes -= request_bytes;
         }
       }
-      io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts,
-                              /*IODebugContext*=*/nullptr);
+      TEST_SYNC_POINT_CALLBACK(
+          "RandomAccessFileReader::MultiRead:IODebugContext",
+          const_cast<void*>(static_cast<void*>(dbg)));
+      io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, dbg);
       RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_fs_reqs);
     }
 
@@ -475,19 +505,34 @@ IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts,
 }
 
 IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro,
-                                                  IOOptions& opts) const {
+                                                  IOOptions& opts,
+                                                  IODebugContext* dbg) const {
   if (clock_ != nullptr) {
-    return PrepareIOFromReadOptions(ro, clock_, opts);
+    return PrepareIOFromReadOptions(ro, clock_, opts, dbg);
   } else {
-    return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts);
+    return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts,
+                                    dbg);
   }
 }
 
+// Notes for when direct_io is enabled:
+// Unless req.offset, req.len, req.scratch are all already aligned,
+// RandomAccessFileReader will creats aligned requests and aligned buffer for
+// the request. User should only provide either req.scratch or aligned_buf. If
+// only req.scratch is provided, result will be copied from allocated aligned
+// buffer to req.scratch. If only alignd_buf is provided, it will be set to
+// the ailgned buf allocated by RandomAccessFileReader and saves a copy.
 IOStatus RandomAccessFileReader::ReadAsync(
     FSReadRequest& req, const IOOptions& opts,
     std::function<void(FSReadRequest&, void*)> cb, void* cb_arg,
-    void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf) {
+    void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf,
+    IODebugContext* dbg) {
   IOStatus s;
+  TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::ReadAsync:InjectStatus",
+                           &s);
+  if (!s.ok()) {
+    return s;
+  }
   // Create a callback and populate info.
   auto read_async_callback =
       std::bind(&RandomAccessFileReader::ReadAsyncCallback, this,
@@ -532,14 +577,14 @@ IOStatus RandomAccessFileReader::ReadAsync(
                  (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
                  true /*delay_enabled*/);
     s = file_->ReadAsync(aligned_req, opts, read_async_callback,
-                         read_async_info, io_handle, del_fn, nullptr /*dbg*/);
+                         read_async_info, io_handle, del_fn, dbg);
   } else {
     StopWatch sw(clock_, stats_, hist_type_,
                  GetFileReadHistograms(stats_, opts.io_activity),
                  (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
                  true /*delay_enabled*/);
     s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
-                         io_handle, del_fn, nullptr /*dbg*/);
+                         io_handle, del_fn, dbg);
   }
   RecordTick(stats_, READ_ASYNC_MICROS, elapsed);
 
diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h
index 945e685e3d00..c1de6b973f44 100644
--- a/file/random_access_file_reader.h
+++ b/file/random_access_file_reader.h
@@ -164,7 +164,8 @@ class RandomAccessFileReader {
   // the internally allocated buffer on return, and the result refers to a
   // region in aligned_buf.
   IOStatus Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result,
-                char* scratch, AlignedBuf* aligned_buf) const;
+                char* scratch, AlignedBuf* aligned_buf,
+                IODebugContext* dbg = nullptr) const;
 
   // REQUIRES:
   // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing.
@@ -172,10 +173,12 @@ class RandomAccessFileReader {
   // In direct IO mode, aligned_buf stores the aligned buffer allocated inside
   // MultiRead, the result Slices in reqs refer to aligned_buf.
   IOStatus MultiRead(const IOOptions& opts, FSReadRequest* reqs,
-                     size_t num_reqs, AlignedBuf* aligned_buf) const;
+                     size_t num_reqs, AlignedBuf* aligned_buf,
+                     IODebugContext* dbg = nullptr) const;
 
-  IOStatus Prefetch(const IOOptions& opts, uint64_t offset, size_t n) const {
-    return file_->Prefetch(offset, n, opts, nullptr);
+  IOStatus Prefetch(const IOOptions& opts, uint64_t offset, size_t n,
+                    IODebugContext* dbg = nullptr) const {
+    return file_->Prefetch(offset, n, opts, dbg);
   }
 
   FSRandomAccessFile* file() { return file_.get(); }
@@ -184,12 +187,13 @@ class RandomAccessFileReader {
 
   bool use_direct_io() const { return file_->use_direct_io(); }
 
-  IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts) const;
+  IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts,
+                            IODebugContext* dbg = nullptr) const;
 
   IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
                      std::function<void(FSReadRequest&, void*)> cb,
                      void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
-                     AlignedBuf* aligned_buf);
+                     AlignedBuf* aligned_buf, IODebugContext* dbg = nullptr);
 
   void ReadAsyncCallback(FSReadRequest& req, void* cb_arg);
 };
diff --git a/file/random_access_file_reader_test.cc b/file/random_access_file_reader_test.cc
index f081795b9d1f..717e985f1adb 100644
--- a/file/random_access_file_reader_test.cc
+++ b/file/random_access_file_reader_test.cc
@@ -147,8 +147,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
     reqs.push_back(std::move(r0));
     reqs.push_back(std::move(r1));
     AlignedBuf aligned_buf;
-    ASSERT_OK(
-        r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
+    IODebugContext dbg;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           &dbg));
 
     AssertResult(content, reqs);
 
@@ -192,8 +193,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
     reqs.push_back(std::move(r1));
     reqs.push_back(std::move(r2));
     AlignedBuf aligned_buf;
-    ASSERT_OK(
-        r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
+    IODebugContext dbg;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           &dbg));
 
     AssertResult(content, reqs);
 
@@ -237,8 +239,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
     reqs.push_back(std::move(r1));
     reqs.push_back(std::move(r2));
     AlignedBuf aligned_buf;
-    ASSERT_OK(
-        r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
+    IODebugContext dbg;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           &dbg));
 
     AssertResult(content, reqs);
 
@@ -274,8 +277,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
     reqs.push_back(std::move(r0));
     reqs.push_back(std::move(r1));
     AlignedBuf aligned_buf;
-    ASSERT_OK(
-        r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
+    IODebugContext dbg;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           &dbg));
 
     AssertResult(content, reqs);
 
diff --git a/file/readahead_raf.cc b/file/readahead_raf.cc
index dd09822e3e23..004f2ab746ba 100644
--- a/file/readahead_raf.cc
+++ b/file/readahead_raf.cc
@@ -108,6 +108,10 @@ class ReadaheadRandomAccessFile : public FSRandomAccessFile {
 
   bool use_direct_io() const override { return file_->use_direct_io(); }
 
+  IOStatus GetFileSize(uint64_t* result) override {
+    return file_->GetFileSize(result);
+  }
+
  private:
   // Tries to read from buffer_ n bytes starting at offset. If anything was read
   // from the cache, it sets cached_len to the number of bytes actually read,
diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h
index 96ec271eee37..b98d8594e851 100644
--- a/file/sst_file_manager_impl.h
+++ b/file/sst_file_manager_impl.h
@@ -162,7 +162,6 @@ class SstFileManagerImpl : public SstFileManager {
   void Close();
 
   void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) override {
-    stats_ = stats;
     delete_scheduler_.SetStatisticsPtr(stats);
   }
 
@@ -216,7 +215,6 @@ class SstFileManagerImpl : public SstFileManager {
   std::list<ErrorHandler*> error_handler_list_;
   // Pointer to ErrorHandler instance that is currently processing recovery
   ErrorHandler* cur_instance_;
-  std::shared_ptr<Statistics> stats_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc
index 41e3b582afa4..2a92c0754dcd 100644
--- a/file/writable_file_writer.cc
+++ b/file/writable_file_writer.cc
@@ -204,13 +204,14 @@ IOStatus WritableFileWriter::Append(const IOOptions& opts, const Slice& data,
   return s;
 }
 
-IOStatus WritableFileWriter::Pad(const IOOptions& opts,
-                                 const size_t pad_bytes) {
+IOStatus WritableFileWriter::Pad(const IOOptions& opts, const size_t pad_bytes,
+                                 const size_t max_pad_size) {
+  (void)max_pad_size;
   if (seen_error()) {
     return GetWriterHasPreviousErrorStatus();
   }
   const IOOptions io_options = FinalizeIOOptions(opts);
-  assert(pad_bytes < kDefaultPageSize);
+  assert(pad_bytes < max_pad_size);
   size_t left = pad_bytes;
   size_t cap = buf_.Capacity() - buf_.CurrentSize();
 
@@ -687,9 +688,9 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum(const IOOptions& opts,
   if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) {
     while (data_size > 0) {
       size_t tmp_size;
-      tmp_size = rate_limiter_->RequestToken(data_size, buf_.Alignment(),
-                                             rate_limiter_priority_used, stats_,
-                                             RateLimiter::OpType::kWrite);
+      tmp_size =
+          rate_limiter_->RequestToken(data_size, 0, rate_limiter_priority_used,
+                                      stats_, RateLimiter::OpType::kWrite);
       data_size -= tmp_size;
     }
   }
diff --git a/file/writable_file_writer.h b/file/writable_file_writer.h
index b880e1f216b2..619821204b3e 100644
--- a/file/writable_file_writer.h
+++ b/file/writable_file_writer.h
@@ -256,7 +256,8 @@ class WritableFileWriter {
   IOStatus Append(const IOOptions& opts, const Slice& data,
                   uint32_t crc32c_checksum = 0);
 
-  IOStatus Pad(const IOOptions& opts, const size_t pad_bytes);
+  IOStatus Pad(const IOOptions& opts, const size_t pad_bytes,
+               const size_t max_pad_size);
 
   IOStatus Flush(const IOOptions& opts);
 
diff --git a/folly.mk b/folly.mk
new file mode 100644
index 000000000000..69f99b91a9aa
--- /dev/null
+++ b/folly.mk
@@ -0,0 +1,165 @@
+# This file contains the vast majority of folly-related build configuration
+# for the checkout_folly and build_folly targets, so that this file can be
+# hashed for purposes of caching folly builds and not hitting that cache when
+# something here changes.
+
+# This provides a Makefile simulation of a Meta-internal folly integration.
+# It is not validated for general use.
+#
+# USE_FOLLY links the build targets with libfolly.a. The latter could be
+# built using 'make build_folly', or built externally and specified in
+# the CXXFLAGS and EXTRA_LDFLAGS env variables. The build_detect_platform
+# script tries to detect if an external folly dependency has been specified.
+# If not, it exports FOLLY_PATH to the path of the installed Folly and
+# dependency libraries.
+#
+# USE_FOLLY_LITE cherry picks source files from Folly to include in the
+# RocksDB library. Its faster and has fewer dependencies on 3rd party
+# libraries, but with limited functionality. For example, coroutine
+# functionality is not available.
+ifeq ($(USE_FOLLY),1)
+ifeq ($(USE_FOLLY_LITE),1)
+$(error Please specify only one of USE_FOLLY and USE_FOLLY_LITE)
+endif
+ifneq ($(strip $(FOLLY_PATH)),)
+	BOOST_PATH = $(shell (ls -d $(FOLLY_PATH)/../boost*))
+	DBL_CONV_PATH = $(shell (ls -d $(FOLLY_PATH)/../double-conversion*))
+	GFLAGS_PATH = $(shell (ls -d $(FOLLY_PATH)/../gflags*))
+	GLOG_PATH = $(shell (ls -d $(FOLLY_PATH)/../glog*))
+	LIBEVENT_PATH = $(shell (ls -d $(FOLLY_PATH)/../libevent*))
+	XZ_PATH = $(shell (ls -d $(FOLLY_PATH)/../xz*))
+	LIBSODIUM_PATH = $(shell (ls -d $(FOLLY_PATH)/../libsodium*))
+	FMT_PATH = $(shell (ls -d $(FOLLY_PATH)/../fmt*))
+
+	# For some reason, glog and fmt libraries are under either lib or lib64
+	GLOG_LIB_PATH = $(shell (ls -d $(GLOG_PATH)/lib*))
+	FMT_LIB_PATH = $(shell (ls -d $(FMT_PATH)/lib*))
+
+	# AIX: pre-defined system headers are surrounded by an extern "C" block
+	ifeq ($(PLATFORM), OS_AIX)
+		PLATFORM_CCFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include
+		PLATFORM_CXXFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include
+	else
+		PLATFORM_CCFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include
+		PLATFORM_CXXFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include
+	endif
+
+	# Add -ldl at the end as gcc resolves a symbol in a library by searching only in libraries specified later
+	# in the command line
+
+	PLATFORM_LDFLAGS += $(FOLLY_PATH)/lib/libfolly.a $(BOOST_PATH)/lib/libboost_context.a $(BOOST_PATH)/lib/libboost_filesystem.a $(BOOST_PATH)/lib/libboost_atomic.a $(BOOST_PATH)/lib/libboost_program_options.a $(BOOST_PATH)/lib/libboost_regex.a $(BOOST_PATH)/lib/libboost_system.a $(BOOST_PATH)/lib/libboost_thread.a $(DBL_CONV_PATH)/lib/libdouble-conversion.a $(LIBEVENT_PATH)/lib/libevent.a $(LIBSODIUM_PATH)/lib/libsodium.a -ldl
+ifneq ($(DEBUG_LEVEL),0)
+	PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmtd.a $(GLOG_LIB_PATH)/libglogd.so $(GFLAGS_PATH)/lib/libgflags_debug.so.2.2
+else
+	PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmt.a $(GLOG_LIB_PATH)/libglog.so $(GFLAGS_PATH)/lib/libgflags.so.2.2
+endif
+	PLATFORM_LDFLAGS += -Wl,-rpath=$(GLOG_LIB_PATH) -Wl,-rpath=$(GFLAGS_PATH)/lib
+endif
+	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+endif
+
+ifeq ($(USE_FOLLY_LITE),1)
+	# Path to the Folly source code and include files
+	FOLLY_DIR = ./third-party/folly
+ifneq ($(strip $(BOOST_SOURCE_PATH)),)
+	BOOST_INCLUDE = $(shell (ls -d $(BOOST_SOURCE_PATH)/boost*/))
+	# AIX: pre-defined system headers are surrounded by an extern "C" block
+	ifeq ($(PLATFORM), OS_AIX)
+		PLATFORM_CCFLAGS += -I$(BOOST_INCLUDE)
+		PLATFORM_CXXFLAGS += -I$(BOOST_INCLUDE)
+	else
+		PLATFORM_CCFLAGS += -isystem $(BOOST_INCLUDE)
+		PLATFORM_CXXFLAGS += -isystem $(BOOST_INCLUDE)
+	endif
+endif  # BOOST_SOURCE_PATH
+ifneq ($(strip $(FMT_SOURCE_PATH)),)
+	FMT_INCLUDE = $(shell (ls -d $(FMT_SOURCE_PATH)/fmt*/include/))
+	# AIX: pre-defined system headers are surrounded by an extern "C" block
+	ifeq ($(PLATFORM), OS_AIX)
+		PLATFORM_CCFLAGS += -I$(FMT_INCLUDE)
+		PLATFORM_CXXFLAGS += -I$(FMT_INCLUDE)
+	else
+		PLATFORM_CCFLAGS += -isystem $(FMT_INCLUDE)
+		PLATFORM_CXXFLAGS += -isystem $(FMT_INCLUDE)
+	endif
+endif  # FMT_SOURCE_PATH
+	# AIX: pre-defined system headers are surrounded by an extern "C" block
+	ifeq ($(PLATFORM), OS_AIX)
+		PLATFORM_CCFLAGS += -I$(FOLLY_DIR)
+		PLATFORM_CXXFLAGS += -I$(FOLLY_DIR)
+	else
+		PLATFORM_CCFLAGS += -isystem $(FOLLY_DIR)
+		PLATFORM_CXXFLAGS += -isystem $(FOLLY_DIR)
+	endif
+	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+# TODO: fix linking with fbcode compiler config
+	PLATFORM_LDFLAGS += -lglog
+endif
+
+FOLLY_COMMIT_HASH = 1e8ce1e5d35acff7b78fedbca3e7311b39f43529
+
+# For public CI runs, checkout folly in a way that can build with RocksDB.
+# This is mostly intended as a test-only simulation of Meta-internal folly
+# integration.
+checkout_folly:
+	if [ -e third-party/folly ]; then \
+		cd third-party/folly && ${GIT_COMMAND} fetch origin; \
+	else \
+		cd third-party && ${GIT_COMMAND} clone https://github.com/facebook/folly.git; \
+	fi
+	@# Pin to a particular version for public CI, so that PR authors don't
+	@# need to worry about folly breaking our integration. Update periodically
+	cd third-party/folly && git reset --hard $(FOLLY_COMMIT_HASH)
+	@# Apparently missing include
+	perl -pi -e 's/(#include <atomic>)/$$1\n#include <cstring>/' third-party/folly/folly/lang/Exception.h
+	@# const mismatch
+	perl -pi -e 's/: environ/: (const char**)(environ)/' third-party/folly/folly/Subprocess.cpp
+	@# Restore cached downloads and handle unreliable mirrors with fallback
+	@cd third-party/folly && \
+		DOWNLOAD_DIR=`$(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir | sed 's|/installed/.*|/downloads|'` && \
+		mkdir -p "$$DOWNLOAD_DIR" && \
+		CACHE_DIR="/tmp/rocksdb-getdeps-cache" && \
+		mkdir -p "$$CACHE_DIR" && \
+		echo "Restoring cached downloads..." && \
+		if ls "$$CACHE_DIR"/*.tar.gz "$$CACHE_DIR"/*.tar.xz "$$CACHE_DIR"/*.zip >/dev/null 2>&1; then \
+			cp -n "$$CACHE_DIR"/*.tar.gz "$$CACHE_DIR"/*.tar.xz "$$CACHE_DIR"/*.zip "$$DOWNLOAD_DIR/" 2>/dev/null || true; \
+		fi && \
+		echo "Handling known unreliable downloads with fallback mirrors..." && \
+		$(PYTHON) ../../build_tools/getdeps_fallback_mirror.py "$$DOWNLOAD_DIR" "$$CACHE_DIR" build/fbcode_builder/manifests
+	@# NOTE: boost and fmt source will be needed for any build including `USE_FOLLY_LITE` builds as those depend on those headers
+	cd third-party/folly && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch boost && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch fmt
+	@# Update cache with any new downloads
+	@cd third-party/folly && \
+		DOWNLOAD_DIR=`$(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir | sed 's|/installed/.*|/downloads|'` && \
+		CACHE_DIR="/tmp/rocksdb-getdeps-cache" && \
+		if ls "$$DOWNLOAD_DIR"/*.tar.gz "$$DOWNLOAD_DIR"/*.tar.xz "$$DOWNLOAD_DIR"/*.zip >/dev/null 2>&1; then \
+			cp -n "$$DOWNLOAD_DIR"/*.tar.gz "$$DOWNLOAD_DIR"/*.tar.xz "$$DOWNLOAD_DIR"/*.zip "$$CACHE_DIR/" 2>/dev/null || true; \
+		fi
+
+CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS))
+
+FOLLY_BUILD_FLAGS = --no-tests
+# NOTE: To avoid ODR violations, we must build folly in debug mode iff
+# building RocksDB in debug mode.
+ifneq ($(DEBUG_LEVEL),0)
+FOLLY_BUILD_FLAGS += --build-type Debug
+endif
+
+build_folly:
+	FOLLY_INST_PATH=`cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
+	if [ "$$FOLLY_INST_PATH" ]; then \
+		rm -rf $${FOLLY_INST_PATH}/../../*; \
+	else \
+		echo "Please run checkout_folly first"; \
+		false; \
+	fi
+	cd third-party/folly && \
+		CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py build $(FOLLY_BUILD_FLAGS)
+	@# In the folly build, glog and gflags are only built as dynamic libraries,
+	@# not static. This patchelf command is needed to reliably have the glog
+	@# library find its dependency gflags, because apparently the rpath of the
+	@# final binary is not used in resolving that transitive dependency.
+	FOLLY_INST_PATH=`cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
+	cd "$$FOLLY_INST_PATH" && patchelf --add-rpath $$PWD/../gflags-*/lib ../glog-*/lib*/libglog*.so.*.*.*
diff --git a/fuzz/db_fuzzer.cc b/fuzz/db_fuzzer.cc
index e6d5bb63c06f..7b10b35ce101 100644
--- a/fuzz/db_fuzzer.cc
+++ b/fuzz/db_fuzzer.cc
@@ -31,11 +31,11 @@ constexpr char db_path[] = "/tmp/testdb";
 // enum. The goal is to capture sanitizer bugs, so the code should be
 // compiled with a given sanitizer (ASan, UBSan, MSan).
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  ROCKSDB_NAMESPACE::DB* db;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Options options;
+  ROCKSDB_NAMESPACE::Status status;
   options.create_if_missing = true;
-  ROCKSDB_NAMESPACE::Status status =
-      ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db);
+  status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db);
   if (!status.ok()) {
     return 0;
   }
@@ -88,7 +88,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
       }
       case kOpenClose: {
         db->Close();
-        delete db;
+        db.reset();
         status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db);
         if (!status.ok()) {
           ROCKSDB_NAMESPACE::DestroyDB(db_path, options);
@@ -104,7 +104,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
                                    "new_cf", &cf);
         s = db->DestroyColumnFamilyHandle(cf);
         db->Close();
-        delete db;
+        db.reset();
 
         // open DB with two column families
         std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> column_families;
@@ -166,7 +166,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
 
   // Cleanup DB
   db->Close();
-  delete db;
+  db.reset();
   ROCKSDB_NAMESPACE::DestroyDB(db_path, options);
   return 0;
 }
diff --git a/fuzz/db_map_fuzzer.cc b/fuzz/db_map_fuzzer.cc
index ed9df8f8432d..8c55ac4e9e7a 100644
--- a/fuzz/db_map_fuzzer.cc
+++ b/fuzz/db_map_fuzzer.cc
@@ -50,7 +50,7 @@ DEFINE_PROTO_FUZZER(DBOperations& input) {
   }
 
   std::map<std::string, std::string> kv;
-  ROCKSDB_NAMESPACE::DB* db = nullptr;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Options options;
   options.create_if_missing = true;
   CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db));
@@ -86,8 +86,7 @@ DEFINE_PROTO_FUZZER(DBOperations& input) {
     }
   }
   CHECK_OK(db->Close());
-  delete db;
-  db = nullptr;
+  db.reset();
 
   CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db));
   auto kv_it = kv.begin();
@@ -102,6 +101,6 @@ DEFINE_PROTO_FUZZER(DBOperations& input) {
   delete it;
 
   CHECK_OK(db->Close());
-  delete db;
+  db.reset();
   CHECK_OK(ROCKSDB_NAMESPACE::DestroyDB(kDbPath, options));
 }
diff --git a/fuzz/sst_file_writer_fuzzer.cc b/fuzz/sst_file_writer_fuzzer.cc
index 676daf574fa4..ae17f64cd2fb 100644
--- a/fuzz/sst_file_writer_fuzzer.cc
+++ b/fuzz/sst_file_writer_fuzzer.cc
@@ -91,7 +91,8 @@ TableReader* NewTableReader(const std::string& sst_file_path,
   }
   if (s.ok()) {
     ImmutableOptions iopts(options, cf_ioptions);
-    TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr, env_options,
+    TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr,
+                             /*compression_manager=*/nullptr, env_options,
                              cf_ioptions.internal_comparator,
                              0 /* block_protection_bytes_per_key */);
     t_opt.largest_seqno = kMaxSequenceNumber;
diff --git a/include/rocksdb/advanced_cache.h b/include/rocksdb/advanced_cache.h
index d8eeb7d2e381..8142228205e4 100644
--- a/include/rocksdb/advanced_cache.h
+++ b/include/rocksdb/advanced_cache.h
@@ -318,7 +318,7 @@ class Cache : public Customizable {
   // REQUIRES: handle must have been returned by a method on *this.
   virtual bool Release(Handle* handle, bool erase_if_last_ref = false) = 0;
 
-  // Return the object assiciated with a handle returned by a successful
+  // Return the object associated with a handle returned by a successful
   // Lookup(). For historical reasons, this is also known at the "value"
   // associated with the key.
   // REQUIRES: handle must not have been released yet.
diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
new file mode 100644
index 000000000000..a680d870464f
--- /dev/null
+++ b/include/rocksdb/advanced_compression.h
@@ -0,0 +1,699 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// APIs for customizing compression in RocksDB.
+//
+// ***********************************************************************
+// EXPERIMENTAL - subject to change while under development
+// ***********************************************************************
+
+#pragma once
+
+#include <variant>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/data_structure.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: alias/adapt for compression
+struct FilterBuildingContext;
+class Decompressor;
+
+// A Compressor represents a very specific but potentially adapting strategy for
+// compressing blocks, including the relevant algorithm(s), options, dictionary,
+// etc. as applicable--every input except the sequence of bytes to compress.
+// Compressor is generally thread-safe so can be shared by multiple threads. (It
+// could make sense to convert unique_ptr<Compressor> to
+// shared_ptr<Compressor>.) A Compressor for data files is expected to be used
+// for just one file, so that compression strategy can be explicitly
+// reconsidered for each new file. However, a Compressor for in-memory use could
+// live indefinitely.
+//
+// If a single thread is doing many compressions under the same strategy, it
+// should request a WorkingArea that will in some cases make repeated
+// compression in a single thread more efficient. Unlike the rest of Compressor,
+// each WorkingArea can only be used by one thread at a time. WorkingAreas can
+// have pre-allocated space and/or data structures, and/or thread-local
+// statistics that are later incorporated into shared statistics objects.
+//
+// The Compressor marks each block with a CompressionType to guide
+// decompression. However, the compression dictionary (or whether there is one
+// associated) is determined at Compressor creation time, though the process of
+// getting a Compressor with a dictionary starts with a Compressor without
+// dictionary (which will often be relevant alongside); see relevant functions.
+// If the Compressor wants to decide block-by-block whether to apply the
+// configured dictionary, that would need to be encoded in CompressionType or
+// the compressed output. (NOTE: this was historically NOT encoded in
+// CompressionType and instead implied by BlockType and the presence of a
+// dictionary block in the file. Some of the resulting awkwardness includes
+// a number of built-in CompressionTypes that ignore any dictionary block in
+// the file; therefore they cannot accommodate dictionary compression in the
+// future without a schema change / extension.)
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Compressor {
+ public:  // Auxiliary types
+  // No dictionary should be used (for a given block type).
+  struct DictDisabled {};
+
+  // A recommendation for dictionary compression by collecting samples from
+  // blocks. The caller should collect up to `max_sample_bytes` of sample data
+  // and pass it to MaybeCloneSpecialized() to create a specialized compressor.
+  struct DictSampling {
+    // Maximum total bytes of sample data to collect from blocks.
+    // This controls how much data is buffered before dictionary training.
+    size_t max_sample_bytes = 0;
+  };
+
+  // A pre-defined dictionary that is recommended or specified for direct use
+  // with MaybeCloneSpecialized(), without any sampling.
+  struct DictPreDefined {
+    // The owned raw/serialized dictionary bytes. Recommend std::move to
+    // MaybeCloneSpecialized()
+    std::string dict_data;
+  };
+
+  // The result type for GetDictGuidance() - indicates how dictionary
+  // compression should be configured for a given block type.
+  using DictConfig = std::variant<DictDisabled, DictSampling, DictPreDefined>;
+
+  // Sample data collected from blocks for dictionary training.
+  struct DictSamples {
+    // All the sample input blocks stored contiguously
+    std::string sample_data;
+    // The lengths of each of the sample blocks in `sample_data`
+    std::vector<size_t> sample_lens;
+
+    bool empty() const { return sample_data.empty(); }
+    bool Verify() const {
+      size_t total_len = 0;
+      for (auto len : sample_lens) {
+        total_len += len;
+      }
+      return total_len == sample_data.size();
+    }
+  };
+
+  // Arguments for MaybeCloneSpecialized() - provides either samples, a
+  // pre-defined dictionary, or indicates no dictionary should be used.
+  // NOTE: DictPreDefined here is the same type as above, allowing the
+  // pre-defined dictionary from GetDictGuidance() to be passed through.
+  using DictConfigArgs =
+      std::variant<DictDisabled, DictSamples, DictPreDefined>;
+
+  // A WorkingArea is an optional structure (both for callers and
+  // implementations) that can enable optimizing repeated compressions by
+  // reusing working space or thread-local tracking of statistics or trends.
+  // This enables use of ZSTD context, for example.
+  //
+  // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
+  struct WorkingArea {};
+
+ public:  // Functions
+  Compressor() = default;
+  virtual ~Compressor() = default;
+
+  // Class name for logging / debugging purposes
+  virtual const char* Name() const = 0;
+
+  // Potentially more elaborate identifier for logging / debugging purposes
+  virtual std::string GetId() const {
+    std::string id = Name();
+    return id;
+  }
+
+  // Returns the recommended dictionary configuration for the given block type.
+  // See the comments on DictConfig and variants for details.
+  //
+  // NOTE: This may be called on the "base" Compressor returned by
+  // CompressionManager, which is not yet configured with a dictionary,
+  // or it can be skipped by callers not intending to handle dictionary
+  // compression.
+  virtual DictConfig GetDictGuidance(CacheEntryRole block_type) const {
+    // Default implementation: no dictionary
+    (void)block_type;
+    return DictDisabled{};
+  }
+
+  // Returns the serialized form of the data dictionary associated with this
+  // Compressor. NOTE: empty dict is equivalent to no dict.
+  virtual Slice GetSerializedDict() const { return Slice(); }
+
+  // If there's a dominant compression type returned by this compressor as
+  // configured, return it. Otherwise, return kDisableCompressionOption.
+  virtual CompressionType GetPreferredCompressionType() const {
+    return CompressionType::kDisableCompressionOption;
+  }
+
+  // Return a distinct but functionally equivalent Compressor. This is often
+  // needed to implement MaybeCloneSpecialized() in wrapper compressors.
+  virtual std::unique_ptr<Compressor> Clone() const = 0;
+
+  // Create potential variants of the same Compressor that might be
+  // (a) optimized for a particular block type (does not affect correct
+  //     decompression), and/or
+  // (b) configured to use a compression dictionary based on the provided
+  //     configuration (samples or pre-defined dictionary). See the comments on
+  //     DictConfigArgs and its variants for detail.
+  //
+  // Return of nullptr indicates no specialization exists or was attempted
+  // and the caller should use the current Compressor for the desired scenario.
+  // Using CacheEntryRole::kMisc for block_type generally means "unspecified".
+  //
+  // The exact dictionary associated with a returned compressor must be read
+  // from GetSerializedDict().
+  virtual std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const {
+    // Default implementation: no specialization
+    (void)block_type;
+    (void)dict_config;
+    return nullptr;
+  }
+
+  // A convenience function when a clone is needed and may or may not be
+  // specialized.
+  std::unique_ptr<Compressor> CloneMaybeSpecialized(
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const {
+    auto clone = MaybeCloneSpecialized(block_type, std::move(dict_config));
+    if (clone == nullptr) {
+      clone = Clone();
+      assert(clone != nullptr);
+    }
+    return clone;
+  }
+
+  // To allow for flexible re-use / reclaimation, we have explicit Get and
+  // Release functions, and usually wrap in a special RAII smart pointer.
+  // For example, a WorkingArea could be saved/recycled in thread-local or
+  // core-local storage, or heap managed, etc., though an explicit WorkingArea
+  // is only advised for repeated compression (by a single thread).
+  // ReleaseWorkingArea() in not intended to be called directly, but used by
+  // ManagedWorkingArea.
+  virtual void ReleaseWorkingArea(WorkingArea*) {}
+
+  using ManagedWorkingArea =
+      ManagedPtr<WorkingArea, Compressor, &Compressor::ReleaseWorkingArea>;
+
+  // See struct WorkingArea above
+  virtual ManagedWorkingArea ObtainWorkingArea() {
+    // Default implementation: no working area
+    return {};
+  }
+
+  // Compress `uncompressed_data` to buffer `compressed_output` of size
+  // `*compressed_output_size`, storing the final compressed size in
+  // `*compressed_output_size` and compression type in `*out_compression_type`.
+  // Note that the compressed output will be decompressed by the sequence
+  // Decompressor::ExtractUncompressedSize() followed by
+  // Decompressor::DecompressBlock(), which must also be provided the same
+  // CompressionType saved in `out_compression_type`. (In many configurations,
+  // `compressed_output` will have a prefix storing the uncompressed_data size
+  // before the compressed bytes returned by the underlying compression
+  // algorithm. And the compression type is usually stored adjacent to the
+  // compressed data, or in some cases assumed/asserted based on the particular
+  // Compressor.)
+  //
+  // If return status is not OK, then some fatal condition has arisen. On OK
+  // status, setting `*out_compression_type = kNoCompression` means compression
+  // is declined and the caller should use the original uncompressed_data and
+  // ignore any result in `compressed_output`. In this case, setting
+  // *compressed_output_size to 0 suggests that compression was quickly
+  // "bypassed" and *compressed_output_size > 0 suggests that compression was
+  // attempted but rejected (e.g. insufficient compression ratio).
+  //
+  // On OK status and `*out_compression_type != kNoCompression`, compression has
+  // happened with results in `compressed_output`, `compressed_output_size`, and
+  // `out_compression_type`. The output compression type is allowed to vary from
+  // call to call but does not for compressors from BuiltinV2CompressionManager.
+  //
+  // The working area is optional and used to optimize repeated compression by
+  // a single thread. ManagedWorkingArea is provided rather than just
+  // WorkingArea so that it can be used only if the `owner` matches expectation.
+  // This could be useful for a Compressor wrapping more than one alternative
+  // underlying Compressor.
+  virtual Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                               size_t* compressed_output_size,
+                               CompressionType* out_compression_type,
+                               ManagedWorkingArea* working_area) = 0;
+
+  // OPTIONAL: Return a decompressor that is optimized for output from this
+  // compressor.
+  virtual std::shared_ptr<Decompressor> GetOptimizedDecompressor() const {
+    // Default implementation: no optimization. Get a Decompressor from the
+    // CompressionManager.
+    return nullptr;
+  }
+
+  // TODO: something to populate table properties based on settings, after all
+  // or as WorkingAreas released. Maybe also update stats, or that could be in
+  // thread-specific WorkingArea.
+};
+
+// A Decompressor usually has a wide capability to decompress all kinds of
+// compressed data in the scope of a CompressionManager (see that class below),
+// except
+// (a) it might be optimized for or limited to a particular compression type(s)
+//     (see GetDecompressor* functions for in CompressionManager),
+// (b) distinct Decompressors are required to decompress with compression
+//     dictionaries. (Decompressors are generally associated with empty/no
+//     dictionary unless created with MaybeCloneForDict().)
+//
+// Similar to Compressor, Decompressor is generally thread safe except that each
+// WorkingArea can only be used by a single thread at a time.
+//
+// Decompressors known to be associated with no dictionary are typically
+// returned as shared_ptr, because they are broadly usable across threads.
+// Because compression dictionaries are externally managed (see
+// MaybeCloneForDict()), Decompressors associated with compression dictionaries
+// are typically returned as unique_ptr, so that they are more easily
+// guaranteed not to outlive their dictionaries (e.g. in block cache).
+// Decompressors associated with compression dictionaries might include a
+// processed or "digested" form of the raw dictionary for efficient repeated
+// compressions.
+//
+// NOTE: Splitting the interface between ExtractUncompressedSize and
+// DecompressBlock leaves to the caller details of (and flexibility in)
+// allocating buffers for decompressing into. For example, the data could be
+// decompressed into part of a single buffer allocated to hold a block's
+// uncompressed contents along with an in-memory object representation of the
+// block (to reduce fragmentation and other overheads of separate objects).
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class Decompressor {
+ public:
+  Decompressor() = default;
+  virtual ~Decompressor() = default;
+
+  // A name for logging / debugging purposes
+  virtual const char* Name() const = 0;
+
+  // A WorkingArea is an optional structure (both for callers and
+  // implementations) that can enable optimizing repeated decompressions by
+  // reusing working space or thread-local tracking of statistics. This enables
+  // use of ZSTD context, for example.
+  //
+  // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
+  struct WorkingArea {};
+
+  // To allow for flexible re-use / reclaimation, we have explicit Obtain and
+  // Release functions, which are typically wrapped in a special RAII smart
+  // pointer. For example, a WorkingArea could be saved/recycled in thread-local
+  // or core-local storage, or heap managed, etc., though an explicit
+  // WorkingArea is only advised for repeated decompression (by a single
+  // thread). ReleaseWorkingArea() in not intended to be called directly, but
+  // used by ManagedWorkingArea.
+  virtual void ReleaseWorkingArea(WorkingArea* wa) {
+    // Default implementation: no working area
+    (void)wa;
+    assert(wa == nullptr);
+  }
+
+  using ManagedWorkingArea =
+      ManagedPtr<WorkingArea, Decompressor, &Decompressor::ReleaseWorkingArea>;
+
+  virtual ManagedWorkingArea ObtainWorkingArea(CompressionType /*preferred*/) {
+    // Default implementation: no working area
+    return {};
+  }
+
+  // If this Decompressor is associated with a (de)compression dictionary
+  // (created with MaybeCloneForDict()), this returns a pointer to those raw (or
+  // "serialized") bytes, which are externally managed (see
+  // MaybeCloneForDict()).
+  // Default: empty slice => no dictionary
+  virtual const Slice& GetSerializedDict() const;
+
+  // Create a variant of this Decompressor in `out` using the specified raw
+  // ("serialized") dictionary. This step is required for decompressing data
+  // compressed with the same dictionary. The new Decompressor references the
+  // given Slice through its lifetime so the data it points to must be managed
+  // by the caller along with (or beyond) the new Decompressor. If the
+  // dictionary is processed into a form reusable by repeated compressions in
+  // many threads, that happens within this call.
+  //
+  // Must return OK if and only if storing a result in `out`. Otherwise, could
+  // return values like NotSupported - dictionary compression is not (yet)
+  // supported for this kind of Decompressor. Corruption - dictionary is
+  // malformed (though many implementations will accept any data as a
+  // dictionary)
+  //
+  // RocksDB promises not to call this function with an empty dictionary slice
+  // (equivalent to no dictionary).
+  virtual Status MaybeCloneForDict(const Slice& /*serialized_dict*/,
+                                   std::unique_ptr<Decompressor>* /*out*/) {
+    return Status::NotSupported(
+        "Dictionary compression not (yet) supported by " + std::string(Name()));
+  }
+
+  // Memory size of this object and others it owns. Does not include the
+  // serialized dictionary (when used) which is externally managed.
+  virtual size_t ApproximateOwnedMemoryUsage() const {
+    // Default: negligible
+    return 0;
+  }
+
+  // Potentially extensible by callers of Decompressor (but not recommended)
+  struct Args {
+    CompressionType compression_type = kNoCompression;
+    Slice compressed_data;
+    uint64_t uncompressed_size = 0;
+    ManagedWorkingArea* working_area = nullptr;
+  };
+
+  // For efficiency on the read path, RocksDB strongly prefers the uncompressed
+  // data size to be encoded in the compressed data in an easily accessible way,
+  // so that allocation of a potentially long-lived buffer can be ideally sized.
+  // This function determines the uncompressed size and potentially modifies
+  // `args.compressed_data` to strip off the size metadata, for providing both
+  // to DecompressBlock along with an appropriate buffer based on that size.
+  // Some implementations will leave `compressed_data` unmodified and let
+  // DecompressBlock call a library function that processes a format that
+  // includes size metadata (e.g. Snappy).
+  //
+  // Even for legacy cases without size metadata (e.g. some very old RocksDB
+  // formats), an exact size is required and could require decompressing the
+  // data (here and in DecompressBlock()).
+  //
+  // Return non-OK in case of corrupt data or some other unworkable limitation
+  // or failure.
+  //
+  // The default implementation uses a standard format for prepending
+  // uncompressed size to the compressed payload. (RocksDB
+  // compress_format_version=2 except Snappy)
+  virtual Status ExtractUncompressedSize(Args& args);
+
+  // Called to decompress a block of data after running ExtractUncompressedSize
+  // on it. `args.compressed_data` is what ExtractUncompressedSize left there
+  // after potentially stripping off the uncompressed size metadata. Returns OK
+  // iff uncompressed data of size `uncompressed_size` is written to
+  // `uncompressed_output`.
+  virtual Status DecompressBlock(const Args& args,
+                                 char* uncompressed_output) = 0;
+};
+
+// A CompressionManager represents
+// * When/where/how to use different compressions
+// * A schema (or set of schemas) and implementation for mapping
+//     <CompressionType, dictionary, compressed data>
+//   to uncompressed data (or error), which can expand over time (error in fewer
+//   cases) for a given CompatibilityName() but can never change that mapping
+//   (because that would break backward compatibility, potential quiet
+//   corruption)
+// TODO: consider adding optional streaming compression support (low priority)
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompressionManager
+    : public std::enable_shared_from_this<CompressionManager>,
+      public Customizable {
+ public:
+  CompressionManager() = default;
+  virtual ~CompressionManager() = default;
+  static const char* Type() { return "CompressionManager"; }
+
+  // *************** Creating various Compression Managers *************** //
+  // A name for the schema family of this CompressionManager. In short, if
+  // two CompressionManagers have functionally the same Decompressor(s), they
+  // should have the same CompatibilityName(), so that a compatible
+  // CompressionManager/Decompressor might be used if the original is
+  // unavailable. (Name() can be useful in addition to CompatibilityName() for
+  // understanding what compression strategy was used.) This name should be
+  // limited to legal variable names in C++ (alphanumeric and underscores).
+  virtual const char* CompatibilityName() const = 0;
+
+  // Default implementation checks the current compatibility name and returns
+  // this CompressionManager (via `out`) if appropriate, and otherwise defers
+  // to CreateFromString(). Failure should simply be a matter of "not found" in
+  // which case nullptr is returned.
+  virtual std::shared_ptr<CompressionManager> FindCompatibleCompressionManager(
+      Slice compatibility_name);
+
+  // Create or find a CompressionManager from a string, including built-in
+  // CompressionManager types.
+  // TODO: ObjectLibrary stuff
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::shared_ptr<CompressionManager>* result);
+
+  // Returns false iff a configuration that would pass the given compression
+  // type to GetCompressor/GetCompressorForSST should be rejected (not
+  // supported)
+  virtual bool SupportsCompressionType(CompressionType type) const = 0;
+
+  // TODO: function to check compatibility with or sanitize CompressionOptions
+
+  // ************************* Compressor creation *********************** //
+  // Returning nullptr means compression is entirely disabled for the file,
+  // which is valid at the discretion of the CompressionManager. Returning
+  // nullptr should normally be the result if preferred == kNoCompression.
+  //
+  // Compressors returned here are configured WITHOUT a dictionary, so that
+  // it's always possible to get correct compression->decompression results
+  // if not opting-in to dictionary handling. The compressors may recommend
+  // dictionary usage via GetDictGuidance() and creating a modified Compressor
+  // for that. See Compressor::GetDictGuidance() etc. for details.
+  //
+  // These functions must be thread-safe.
+
+  // Get a compressor for an SST file.
+  // SUBJECT TO CHANGE
+  // TODO: is it practical to get ColumnFamilyOptions plumbed into here?
+  virtual std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext&, const CompressionOptions& opts,
+      CompressionType preferred) {
+    return GetCompressor(opts, preferred);
+  }
+
+  // Get a compressor for a generic/unspecified purpose (e.g. in-memory
+  // compression).
+  virtual std::unique_ptr<Compressor> GetCompressor(
+      const CompressionOptions& opts, CompressionType type) = 0;
+
+  // **************************** Decompressors ************************** //
+  // Get a decompressor that is compatible with any blocks compressed by
+  // compressors returned by this CompressionManager (at least this code
+  // revision and earlier). (NOTE: recommended to return a shared_ptr alias of
+  // this shared_ptr to a field that is a Decompressor.)
+  // Justification for not making CompressionManager inherit Decompressor: this
+  // tends to run into the diamond inheritance problem in implementations and
+  // potential overheads of virtual inheritance.
+  virtual std::shared_ptr<Decompressor> GetDecompressor() = 0;
+
+  // Compatible with same as above, but potentially optimized for a certain
+  // expected CompressionType
+  virtual std::shared_ptr<Decompressor> GetDecompressorOptimizeFor(
+      CompressionType /*optimize_for_type*/) {
+    // Safe default implementation
+    return GetDecompressor();
+  }
+
+  // Get a decompressor that is allowed to have support only for the
+  // CompressionTypes in the given start-to-end array (unique, sorted by
+  // unsigned char)
+  virtual std::shared_ptr<Decompressor> GetDecompressorForTypes(
+      const CompressionType* /*types_begin*/,
+      const CompressionType* /*types_end*/) {
+    // Safe default implementation
+    return GetDecompressor();
+  }
+};
+
+// ************************* Utility wrappers etc. *********************** //
+class CompressorWrapper : public Compressor {
+ public:
+  explicit CompressorWrapper(std::unique_ptr<Compressor> compressor)
+      : wrapped_(std::move(compressor)) {}
+  // No copies
+  CompressorWrapper(const CompressorWrapper&) = delete;
+  CompressorWrapper& operator=(const CompressorWrapper&) = delete;
+
+  DictConfig GetDictGuidance(CacheEntryRole block_type) const override {
+    return wrapped_->GetDictGuidance(block_type);
+  }
+
+  Slice GetSerializedDict() const override {
+    return wrapped_->GetSerializedDict();
+  }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return wrapped_->GetPreferredCompressionType();
+  }
+
+  // NOTE: Clone() not implemented here because it needs to be in the derived
+  // class
+
+  // NOTE: MaybeCloneSpecialized() is only implemented here for convenience
+  // when the wrapped Compressor uses the default implementation of
+  // MaybeCloneSpecialized(). This needs to be overridden if not.
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const override {
+    auto clone =
+        wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_config));
+    // Assert default no-op MaybeCloneSpecialized()
+    assert(clone == nullptr);
+    return clone;
+  }
+
+  ManagedWorkingArea ObtainWorkingArea() override {
+    return wrapped_->ObtainWorkingArea();
+  }
+
+  // NOTE: Don't need to override ReleaseWorkingArea() here because
+  // ManagedWorkingArea takes care of calling it on the Compressor that created
+  // the WorkingArea.
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* working_area) override {
+    return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                   compressed_output_size, out_compression_type,
+                                   working_area);
+  }
+
+  std::shared_ptr<Decompressor> GetOptimizedDecompressor() const override {
+    return wrapped_->GetOptimizedDecompressor();
+  }
+
+ protected:
+  std::unique_ptr<Compressor> wrapped_;
+};
+
+class DecompressorWrapper : public Decompressor {
+ public:
+  explicit DecompressorWrapper(std::shared_ptr<Decompressor> decompressor)
+      : wrapped_(std::move(decompressor)) {}
+  // No copies
+  DecompressorWrapper(const DecompressorWrapper&) = delete;
+  DecompressorWrapper& operator=(const DecompressorWrapper&) = delete;
+
+  const char* Name() const override { return wrapped_->Name(); }
+
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    wrapped_->ReleaseWorkingArea(wa);
+  }
+
+  // NOTE: Don't need to override ReleaseWorkingArea() here because
+  // ManagedWorkingArea takes care of calling it on the Decompressor that
+  // created the WorkingArea.
+
+  ManagedWorkingArea ObtainWorkingArea(CompressionType preferred) override {
+    return wrapped_->ObtainWorkingArea(preferred);
+  }
+
+  const Slice& GetSerializedDict() const override {
+    return wrapped_->GetSerializedDict();
+  }
+
+  Status MaybeCloneForDict(const Slice& serialized_dict,
+                           std::unique_ptr<Decompressor>* out) override {
+    // NOTE: derived class probably needs to override this to ensure a
+    // derived wrapper around the new Decompressor
+    return wrapped_->MaybeCloneForDict(serialized_dict, out);
+  }
+
+  size_t ApproximateOwnedMemoryUsage() const override {
+    return wrapped_->ApproximateOwnedMemoryUsage();
+  }
+
+  Status ExtractUncompressedSize(Args& args) override {
+    return wrapped_->ExtractUncompressedSize(args);
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    return wrapped_->DecompressBlock(args, uncompressed_output);
+  }
+
+ protected:
+  std::shared_ptr<Decompressor> wrapped_;
+};
+
+// TODO: CompressorBase, for custom compressions
+
+class CompressionManagerWrapper : public CompressionManager {
+ public:
+  explicit CompressionManagerWrapper(
+      std::shared_ptr<CompressionManager> wrapped)
+      : wrapped_(std::move(wrapped)) {}
+
+  const char* CompatibilityName() const override {
+    return wrapped_->CompatibilityName();
+  }
+
+  std::shared_ptr<CompressionManager> FindCompatibleCompressionManager(
+      Slice compatibility_name) override {
+    // NOTE: We expect that the wrapped CompressionManager will generally
+    // be preferred if compatible, so the default implementation here does
+    // not purely defer to the wrapped instance
+    if (compatibility_name == CompatibilityName()) {
+      return shared_from_this();
+    } else {
+      return wrapped_->FindCompatibleCompressionManager(compatibility_name);
+    }
+  }
+
+  bool SupportsCompressionType(CompressionType type) const override {
+    return wrapped_->SupportsCompressionType(type);
+  }
+
+  std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext& context, const CompressionOptions& opts,
+      CompressionType preferred) override {
+    return wrapped_->GetCompressorForSST(context, opts, preferred);
+  }
+
+  std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
+                                            CompressionType type) override {
+    return wrapped_->GetCompressor(opts, type);
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressor() override {
+    return wrapped_->GetDecompressor();
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressorOptimizeFor(
+      CompressionType optimize_for_type) override {
+    return wrapped_->GetDecompressorOptimizeFor(optimize_for_type);
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressorForTypes(
+      const CompressionType* types_begin,
+      const CompressionType* types_end) override {
+    return wrapped_->GetDecompressorForTypes(types_begin, types_end);
+  }
+
+ protected:
+  std::shared_ptr<CompressionManager> wrapped_;
+};
+
+// Compression manager that implements the second schema for RocksDB built-in
+// compression support. (The first schema is intentionally not provided here.)
+// *** CURRENT STATE ***
+// This is currently the latest schema for built-in compression, and the
+// compression manager used when compression_manager=nullptr.
+const std::shared_ptr<CompressionManager>& GetBuiltinV2CompressionManager();
+
+// NOTE: No GetLatestBuiltinCompressionManager() is provided because that could
+// lead to unexpected schema changes for user CompressionManagers building on
+// the built-in schema, in the unlikely/rare case of a new built-in schema.
+
+// Creates CompressionManager designed for the automated compression strategy.
+// This may include deciding to compress or not.
+// EXPERIMENTAL
+std::shared_ptr<CompressionManagerWrapper> CreateAutoSkipCompressionManager(
+    std::shared_ptr<CompressionManager> wrapped = nullptr);
+// Creates CompressionManager designed for the CPU and IO cost aware compression
+// strategy
+// EXPERIMENTAL
+std::shared_ptr<CompressionManagerWrapper> CreateCostAwareCompressionManager(
+    std::shared_ptr<CompressionManager> wrapped = nullptr);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/advanced_iterator.h b/include/rocksdb/advanced_iterator.h
new file mode 100644
index 000000000000..abab5aeb4574
--- /dev/null
+++ b/include/rocksdb/advanced_iterator.h
@@ -0,0 +1,36 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class IterBoundCheck : char {
+  kUnknown = 0,
+  kOutOfBound,
+  kInbound,
+};
+
+// This structure encapsulates the result of NextAndGetResult()
+struct IterateResult {
+  // The lifetime of key is guaranteed until Next()/NextAndGetResult() is
+  // called.
+  Slice key;
+  // If the iterator becomes invalid after a NextAndGetResult(), the table
+  // iterator should set this to indicate whether it became invalid due
+  // to the next key being out of bound (kOutOfBound) or it reached end
+  // of file (kUnknown). If the iiterator is still valid, this should
+  // be set to kInbound.
+  IterBoundCheck bound_check_result = IterBoundCheck::kUnknown;
+  // If false, PrepareValue() needs to be called before value()
+  // This is useful if the table reader wants to materialize the value in a
+  // lazy manner. In that case, it can set this to false and RocksDB
+  // guarantees that it'll call PrepareValue() before calling value().
+  bool value_prepared = true;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index ad9b90f735bb..898d07a6021d 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -64,9 +64,7 @@ enum CompactionPri : char {
 struct FileTemperatureAge {
   Temperature temperature = Temperature::kUnknown;
   uint64_t age = 0;
-#if __cplusplus >= 202002L
   bool operator==(const FileTemperatureAge& rhs) const = default;
-#endif
 };
 
 struct CompactionOptionsFIFO {
@@ -115,14 +113,71 @@ struct CompactionOptionsFIFO {
   // Default: empty
   std::vector<FileTemperatureAge> file_temperature_age_thresholds{};
 
+  // EXPERIMENTAL
+  // If true, when compaction is picked for kChangeTemperature reason,
+  // allow the trivia copy of the sst file from source FileSystem to
+  // destination FileSystem. If false, the changeTemperature will be
+  // the non-trivial copy by iterating/appending blocks by blocks of the
+  // sst file.
+  bool allow_trivial_copy_when_change_temperature = false;
+
+  // EXPERIMENTAL
+  // If 'allow_trivia_copy_op_when_change_temperature=true', the tmp buffer size
+  // to copy the file from the source FileSystem to the destnation FileSystem.
+  // If 'allow_trivia_copy_op_when_change_temperature=false', this field will
+  // not be used. The minmum buffer size must be at least 4KiB
+  uint64_t trivial_copy_buffer_size = 4096;
+
+  // When non-zero, FIFO compaction uses the combined size of SST files and
+  // blob files for size-based trimming decisions. When the total data size
+  // (SST + blob) exceeds this limit, the oldest SST files are dropped along
+  // with their associated blob files.
+  //
+  // When non-zero, this takes precedence over max_table_files_size for all
+  // FIFO compaction decisions: size-based dropping, TTL threshold checks,
+  // and compaction score computation. max_table_files_size is ignored.
+  //
+  // When zero (default), FIFO compaction uses max_table_files_size which
+  // only considers SST file sizes, maintaining backward compatibility.
+  //
+  // This option is primarily intended for use with integrated BlobDB where
+  // blob files can represent a significant portion of the total data.
+  //
+  // Dynamically changeable through SetOptions() API.
+  // Default: 0 (use max_table_files_size behavior)
+  uint64_t max_data_files_size = 0;
+
+  // When true, enables a capacity-derived intra-L0 compaction strategy
+  // optimized for BlobDB workloads where SST files are much smaller than
+  // write_buffer_size. Uses the observed key/value size ratio (SST vs blob
+  // file sizes) to compute a target compacted file size, producing uniform
+  // files for predictable FIFO trimming.
+  //
+  // Uses level0_file_num_compaction_trigger as the target max L0 file count.
+  //
+  // When max_compaction_bytes is 0, the target is auto-calculated from the
+  // data capacity and observed SST/blob ratio. When max_compaction_bytes is
+  // explicitly set to a non-zero value, it overrides the auto-calculated
+  // target.
+  //
+  // Requires:
+  //   - allow_compaction = true (master switch for intra-L0 compaction)
+  //   - max_data_files_size > 0 (needed to compute the target file size)
+  // Setting this to true without these will fail option validation.
+  //
+  // When false, the old intra-L0 strategy is used if allow_compaction is
+  // true (PickCostBasedIntraL0Compaction with 1.1 * write_buffer_size guard).
+  //
+  // Dynamically changeable through SetOptions() API.
+  // Default: false
+  bool use_kv_ratio_compaction = false;
+
   CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
   CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction)
       : max_table_files_size(_max_table_files_size),
         allow_compaction(_allow_compaction) {}
 
-#if __cplusplus >= 202002L
   bool operator==(const CompactionOptionsFIFO& rhs) const = default;
-#endif
 };
 
 // The control option of how the cache tiers will be used. Currently rocksdb
@@ -145,6 +200,61 @@ enum class PrepopulateBlobCache : uint8_t {
   kFlushOnly = 0x1,  // Prepopulate blobs during flush only
 };
 
+// Bitmask enum for verify output flags during compaction.
+// This allows fine-grained control over what verification is performed
+// on compaction output files and when it's enabled.
+enum class VerifyOutputFlags : uint32_t {
+  kVerifyNone = 0x0,  // No verification
+
+  // First set of bits: type of verifications
+  kVerifyBlockChecksum = 1 << 0,  // Verify block checksums
+  kVerifyIteration = 1 << 1,      // Verify iteration and full key/value hash
+                                  // by comparing the one inserted into a
+                                  // file, and what is read back.
+
+  // TODO - Implement
+  // kVerifyFileChecksum = 1 << 2,   // Verify file-level checksum
+
+  // Second set of bits: when to enable verification
+  kEnableForLocalCompaction = 1 << 10,   // Enable for local compaction
+  kEnableForRemoteCompaction = 1 << 11,  // Enable for remote compaction
+
+  // TODO - Implement
+  // kEnableForFlush = 1 << 12,  // Enable for flush
+
+  kVerifyAll = 0xFFFFFFFF,
+};
+
+inline VerifyOutputFlags operator|(VerifyOutputFlags lhs,
+                                   VerifyOutputFlags rhs) {
+  using T = std::underlying_type_t<VerifyOutputFlags>;
+  return static_cast<VerifyOutputFlags>(static_cast<T>(lhs) |
+                                        static_cast<T>(rhs));
+}
+
+inline VerifyOutputFlags& operator|=(VerifyOutputFlags& lhs,
+                                     VerifyOutputFlags rhs) {
+  lhs = lhs | rhs;
+  return lhs;
+}
+
+inline VerifyOutputFlags operator&(VerifyOutputFlags lhs,
+                                   VerifyOutputFlags rhs) {
+  using T = std::underlying_type_t<VerifyOutputFlags>;
+  return static_cast<VerifyOutputFlags>(static_cast<T>(lhs) &
+                                        static_cast<T>(rhs));
+}
+
+inline VerifyOutputFlags& operator&=(VerifyOutputFlags& lhs,
+                                     VerifyOutputFlags rhs) {
+  lhs = lhs & rhs;
+  return lhs;
+}
+
+inline bool operator!(VerifyOutputFlags flag) {
+  return flag == VerifyOutputFlags::kVerifyNone;
+}
+
 struct AdvancedColumnFamilyOptions {
   // The maximum number of write buffers that are built up in memory.
   // The default and the minimum number is 2, so that when 1 write buffer
@@ -171,15 +281,6 @@ struct AdvancedColumnFamilyOptions {
   // Default: 1
   int min_write_buffer_number_to_merge = 1;
 
-  // DEPRECATED
-  // The total maximum number of write buffers to maintain in memory including
-  // copies of buffers that have already been flushed.  Unlike
-  // max_write_buffer_number, this parameter does not affect flushing.
-  // This parameter is being replaced by max_write_buffer_size_to_maintain.
-  // If both parameters are set to non-zero values, this parameter will be
-  // ignored.
-  int max_write_buffer_number_to_maintain = 0;
-
   // The target number of write history bytes to hold in memory. Write history
   // comprises the latest write buffers (memtables). To reach the target, write
   // buffers that were most recently flushed to SST files may be retained in
@@ -471,6 +572,17 @@ struct AdvancedColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   int target_file_size_multiplier = 1;
 
+  // If true, RocksDB will consider the estimated tail size (filter + index +
+  // meta blocks) when deciding whether to cut a compaction output file. This
+  // helps prevent output files from exceeding the target_file_size_base due to
+  // large tail blocks. When disabled, only the data block size is considered,
+  // which may result in SST files exceeding the target_file_size_base.
+  //
+  // Default: false
+  //
+  // Dynamically changeable through SetOptions() API
+  bool target_file_size_is_upper_bound = false;
+
   // If true, RocksDB will pick target size of each level dynamically.
   // We will pick a base level b >= 1. L0 will be directly merged into level b,
   // instead of always into level 1. Level 1 to b-1 need to be empty.
@@ -520,7 +632,7 @@ struct AdvancedColumnFamilyOptions {
   // By doing it, we give max_bytes_for_level_multiplier a priority against
   // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
   // useful to limit worse case space amplification.
-  // If `allow_ingest_behind=true` or `preclude_last_level_data_seconds > 0`,
+  // If `cf_allow_ingest_behind=true` or `preclude_last_level_data_seconds > 0`,
   // then the last level is reserved, and we will start filling LSM from the
   // second last level.
   //
@@ -575,6 +687,15 @@ struct AdvancedColumnFamilyOptions {
   //
   // Default: target_file_size_base * 25
   //
+  // For FIFO compaction with use_kv_ratio_compaction=true:
+  // When set to 0 (and compaction_style is FIFO), the value is NOT sanitized
+  // to the default. Instead, the target compacted file size is automatically
+  // calculated from the data capacity (max_data_files_size) and observed
+  // SST/blob ratio. When explicitly set to a non-zero value, it overrides
+  // the auto-calculated target and is used directly as the max compaction
+  // input size. Note: for FIFO, this controls the output file size target,
+  // not a general compaction byte limit as in level/universal compaction.
+  //
   // Dynamically changeable through SetOptions() API
   uint64_t max_compaction_bytes = 0;
 
@@ -702,6 +823,13 @@ struct AdvancedColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   bool paranoid_file_checks = false;
 
+  // Bitmask enum for output verification option.
+  //
+  // Default: 0 (kVerifyNone)
+  //
+  // Dynamically changeable (as a uint32_t) through SetOptions() API.
+  VerifyOutputFlags verify_output_flags = VerifyOutputFlags::kVerifyNone;
+
   // In debug mode, RocksDB runs consistency checks on the LSM every time the
   // LSM changes (Flush, Compaction, AddFile). When this option is true, these
   // checks are also enabled in release mode. These checks were historically
@@ -719,6 +847,17 @@ struct AdvancedColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   bool report_bg_io_stats = false;
 
+  // Setting this option to true disallows ordinary writes to the column family
+  // and it can only be populated through import and ingestion. It is intended
+  // to protect "ingestion only" column families. This option is not currently
+  // supported on the default column family because of error handling challenges
+  // analogous to https://github.com/facebook/rocksdb/issues/13429
+  //
+  // This option is not mutable with SetOptions(). It can be changed between
+  // DB::Open() calls, but open will fail if recovering WAL writes to a CF with
+  // this option set.
+  bool disallow_memtable_writes = false;
+
   // This option has different meanings for different compaction styles:
   //
   // Leveled: Non-bottom-level files with all keys older than TTL will go
@@ -846,7 +985,7 @@ struct AdvancedColumnFamilyOptions {
   //
   // Default: 0 (disable the feature)
   //
-  // Not dynamically changeable, change it requires db restart.
+  // Dynamically changeable through the SetOptions() API
   uint64_t preclude_last_level_data_seconds = 0;
 
   // EXPERIMENTAL
@@ -869,7 +1008,7 @@ struct AdvancedColumnFamilyOptions {
   //
   // Default: 0 (disable the feature)
   //
-  // Not dynamically changeable, change it requires db restart.
+  // Dynamically changeable through the SetOptions() API
   uint64_t preserve_internal_time_seconds = 0;
 
   // When set, large values (blobs) are written to separate blob files, and
@@ -1088,12 +1227,84 @@ struct AdvancedColumnFamilyOptions {
   uint32_t bottommost_file_compaction_delay = 0;
 
   // Enables additional integrity checks during reads/scans.
-  // Specifically, for skiplist-based memtables, we verify that keys visited
-  // are in order. This is helpful to detect corrupted memtable keys during
-  // reads. Enabling this feature incurs a performance overhead due to an
-  // additional key comparison during memtable lookup.
+  // Specifically, for skiplist-based memtables, key ordering validation could
+  // be enabled optionally. This is helpful to detect corrupted memtable keys
+  // during reads. Enabling this feature incurs a performance overhead due to
+  // additional comparison during memtable lookup.
   bool paranoid_memory_checks = false;
 
+  // Enables additional integrity checks during seek.
+  // Specifically, for skiplist-based memtables, key checksum validation could
+  // be enabled during seek optionally. This is helpful to detect corrupted
+  // memtable keys during reads. Enabling this feature incurs a performance
+  // overhead due to additional key checksum validation during memtable seek
+  // operation.
+  // This option depends on memtable_protection_bytes_per_key to be non zero.
+  // If memtable_protection_bytes_per_key is zero, no validation is performed.
+  bool memtable_veirfy_per_key_checksum_on_seek = false;
+
+  // When an iterator scans this number of invisible entries (tombstones or
+  // hidden puts) from the active memtable during a single iterator operation,
+  // we will attempt to flush the memtable. Currently only forward scans are
+  // supported (SeekToFirst(), Seek() and Next()).
+  // This option helps to reduce the overhead of scanning through a
+  // large number of entries in memtable.
+  // Users should consider enable deletion-triggered-compaction (see
+  // CompactOnDeletionCollectorFactory) together with this option to compact
+  // away tombstones after the memtable is flushed.
+  //
+  // Note that this option has no effect on tailing iterators yet.
+  //
+  // Default: 0 (disabled)
+  // Dynamically changeable through the SetOptions() API.
+  uint32_t memtable_op_scan_flush_trigger = 0;
+
+  // Similar to `memtable_op_scan_flush_trigger`, but this option applies to
+  // Next() calls between Seeks or until iterator destruction. If the average
+  // of the number of invisible entries scanned from the active memtable, the
+  // memtable will be marked for flush.
+  // Note that to avoid the case where the window between Seeks is too small,
+  // the option only takes effect if the total number of hidden entries scanned
+  // within a window is at least `memtable_op_scan_flush_trigger`. So this
+  // option is only effective when `memtable_op_scan_flush_trigger` is set.
+  //
+  // This option should be set to a lower value than
+  // `memtable_op_scan_flush_trigger`. It covers the case where an iterator
+  // scans through an expensive key range with many invisible entries from the
+  // active memtable, but the number of invisible entries per operation does not
+  // exceed `memtable_op_scan_flush_trigger`.
+  //
+  // Default: 0 (disabled)
+  // Dynamically changeable through the SetOptions() API.
+  uint32_t memtable_avg_op_scan_flush_trigger = 0;
+
+  // If either DBOptions::allow_ingest_behind or this option is set to true,
+  // this column family will prepare for ingesting files to the last level
+  // (IngestExternalFiles() with ingest_behind=true). Users should set only
+  // this option since DBOptions::allow_ingest_behind is deprecated.
+  //
+  // Specifically, preparing a column family for ingesting files to the last
+  // level has the following effects:
+  // 1) Disables some internal optimizations around SST file compression.
+  // 2) Reserves the last level for ingested files only.
+  // 3) Compaction will not include any file from the last level.
+  // 4) Compaction will preserve necessary tombstones that can apply on
+  // top of ingested files.
+  //
+  // Note that only Universal Compaction supports cf_allow_ingest_behind.
+  // `num_levels` should be >= 3 if this option is turned on.
+  //
+  // Note that this option needs to be set to true before any write to the CF.
+  // It's recommended to set the option to true since CF creation. Otherwise,
+  // ingestion with ingest_behind = true might fail. Once file ingestions are
+  // done, the option should be flipped to false. Flipping this option to false
+  // allows the CF to disable the behavior changes detailed above and resume
+  // more efficient operation.
+  //
+  // Default: false
+  // Immutable.
+  bool cf_allow_ingest_behind = false;
+
   // Create ColumnFamilyOptions with default values for all fields
   AdvancedColumnFamilyOptions();
   // Create ColumnFamilyOptions from Options
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index f2616ea3e7f8..3ab0c8551d34 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -69,6 +69,7 @@ extern "C" {
 /* Exported types */
 
 typedef struct rocksdb_t rocksdb_t;
+typedef struct rocksdb_status_ptr_t rocksdb_status_ptr_t;
 typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t;
 typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t;
 typedef struct rocksdb_backup_engine_options_t rocksdb_backup_engine_options_t;
@@ -79,11 +80,18 @@ typedef struct rocksdb_hyper_clock_cache_options_t
     rocksdb_hyper_clock_cache_options_t;
 typedef struct rocksdb_cache_t rocksdb_cache_t;
 typedef struct rocksdb_write_buffer_manager_t rocksdb_write_buffer_manager_t;
+typedef struct rocksdb_sst_file_manager_t rocksdb_sst_file_manager_t;
 typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t;
 typedef struct rocksdb_compactionfiltercontext_t
     rocksdb_compactionfiltercontext_t;
 typedef struct rocksdb_compactionfilterfactory_t
     rocksdb_compactionfilterfactory_t;
+typedef struct rocksdb_file_checksum_gen_factory_t
+    rocksdb_file_checksum_gen_factory_t;
+typedef struct rocksdb_sst_partitioner_factory_t
+    rocksdb_sst_partitioner_factory_t;
+typedef struct rocksdb_table_properties_collector_factory_t
+    rocksdb_table_properties_collector_factory_t;
 typedef struct rocksdb_comparator_t rocksdb_comparator_t;
 typedef struct rocksdb_dbpath_t rocksdb_dbpath_t;
 typedef struct rocksdb_env_t rocksdb_env_t;
@@ -111,10 +119,15 @@ typedef struct rocksdb_writebatch_wi_t rocksdb_writebatch_wi_t;
 typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t;
 typedef struct rocksdb_universal_compaction_options_t
     rocksdb_universal_compaction_options_t;
+typedef struct rocksdb_livefile_t rocksdb_livefile_t;
 typedef struct rocksdb_livefiles_t rocksdb_livefiles_t;
 typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t;
 typedef struct rocksdb_column_family_metadata_t
     rocksdb_column_family_metadata_t;
+typedef struct rocksdb_import_column_family_options_t
+    rocksdb_import_column_family_options_t;
+typedef struct rocksdb_export_import_files_metadata_t
+    rocksdb_export_import_files_metadata_t;
 typedef struct rocksdb_level_metadata_t rocksdb_level_metadata_t;
 typedef struct rocksdb_sst_file_metadata_t rocksdb_sst_file_metadata_t;
 typedef struct rocksdb_envoptions_t rocksdb_envoptions_t;
@@ -142,6 +155,48 @@ typedef struct rocksdb_statistics_histogram_data_t
 typedef struct rocksdb_wait_for_compact_options_t
     rocksdb_wait_for_compact_options_t;
 
+/* rocksdb_slice_t: Optimized slice type for high-performance C API operations
+ * This struct is ABI-compatible with rocksdb::Slice for zero-copy interop.
+ * Used by slice iterator functions and batched operations. */
+typedef struct rocksdb_slice_t {
+  const char* data;
+  size_t size;
+} rocksdb_slice_t;
+typedef struct rocksdb_flushjobinfo_t rocksdb_flushjobinfo_t;
+typedef struct rocksdb_compactionjobinfo_t rocksdb_compactionjobinfo_t;
+typedef struct rocksdb_subcompactionjobinfo_t rocksdb_subcompactionjobinfo_t;
+typedef struct rocksdb_externalfileingestioninfo_t
+    rocksdb_externalfileingestioninfo_t;
+typedef struct rocksdb_eventlistener_t rocksdb_eventlistener_t;
+typedef struct rocksdb_writestallinfo_t rocksdb_writestallinfo_t;
+typedef struct rocksdb_writestallcondition_t rocksdb_writestallcondition_t;
+typedef struct rocksdb_memtableinfo_t rocksdb_memtableinfo_t;
+
+// Remote Compaction typedef
+typedef struct rocksdb_compactionservice_scheduleresponse_t
+    rocksdb_compactionservice_scheduleresponse_t;
+typedef struct rocksdb_compactionservice_jobinfo_t
+    rocksdb_compactionservice_jobinfo_t;
+typedef struct rocksdb_compactionservice_t rocksdb_compactionservice_t;
+typedef struct rocksdb_compaction_service_options_override_t
+    rocksdb_compaction_service_options_override_t;
+typedef struct rocksdb_open_and_compact_options_t
+    rocksdb_open_and_compact_options_t;
+typedef rocksdb_compactionservice_scheduleresponse_t* (
+    *rocksdb_compaction_service_schedule_cb)(
+    void* state, const rocksdb_compactionservice_jobinfo_t* info,
+    const char* compaction_service_input, size_t input_len);
+
+typedef int (*rocksdb_compaction_service_wait_cb)(void* state,
+                                                  const char* scheduled_job_id,
+                                                  char** result,
+                                                  size_t* result_len);
+
+typedef void (*rocksdb_compaction_service_cancel_awaiting_jobs_cb)(void* state);
+
+typedef void (*rocksdb_compaction_service_on_installation_cb)(
+    void* state, const char* scheduled_job_id, int status);
+
 /* DB operations */
 
 extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open(
@@ -366,6 +421,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_create(
     rocksdb_checkpoint_t* checkpoint, const char* checkpoint_dir,
     uint64_t log_size_for_flush, char** errptr);
 
+extern ROCKSDB_LIBRARY_API rocksdb_export_import_files_metadata_t*
+rocksdb_checkpoint_export_column_family(
+    rocksdb_checkpoint_t* checkpoint,
+    rocksdb_column_family_handle_t* column_family, const char* export_dir,
+    char** errptr);
+
 extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_object_destroy(
     rocksdb_checkpoint_t* checkpoint);
 
@@ -426,6 +487,13 @@ rocksdb_create_column_families(rocksdb_t* db,
 extern ROCKSDB_LIBRARY_API void rocksdb_create_column_families_destroy(
     rocksdb_column_family_handle_t** list);
 
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_create_column_family_with_import(
+    rocksdb_t* db, rocksdb_options_t* column_family_options,
+    const char* column_family_name,
+    rocksdb_import_column_family_options_t* import_options,
+    rocksdb_export_import_files_metadata_t* metadata, char** errptr);
+
 extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
 rocksdb_create_column_family_with_ttl(
     rocksdb_t* db, const rocksdb_options_t* column_family_options,
@@ -581,6 +649,16 @@ extern ROCKSDB_LIBRARY_API void rocksdb_batched_multi_get_cf(
     const char* const* keys_list, const size_t* keys_list_sizes,
     rocksdb_pinnableslice_t** values, char** errs, const bool sorted_input);
 
+/* Batched MultiGet with slice array: Takes rocksdb_slice_t array directly,
+ * avoiding key conversion. faster than rocksdb_batched_multi_get_cf for
+ * operations with many keys. Eliminates overhead of converting keys from
+ * separate pointer+size arrays to Slice objects. */
+extern ROCKSDB_LIBRARY_API void rocksdb_batched_multi_get_cf_slice(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, size_t num_keys,
+    const rocksdb_slice_t* keys_list, rocksdb_pinnableslice_t** values,
+    char** errs, const bool sorted_input);
+
 // The value is only allocated (using malloc) and returned if it is found and
 // value_found isn't NULL. In that case the user is responsible for freeing it.
 extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist(
@@ -747,6 +825,18 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_timestamp(
     const rocksdb_iterator_t*, size_t* tslen);
 extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error(
     const rocksdb_iterator_t*, char** errptr);
+
+/* Slice iterator functions: Return rocksdb_slice_t directly for better
+ * performance. These functions avoid the overhead of passing output parameters
+ * and provide zero-copy access to key/value/timestamp data. faster than
+ * traditional rocksdb_iter_key/value/timestamp functions. */
+extern ROCKSDB_LIBRARY_API rocksdb_slice_t
+rocksdb_iter_key_slice(const rocksdb_iterator_t* iter);
+extern ROCKSDB_LIBRARY_API rocksdb_slice_t
+rocksdb_iter_value_slice(const rocksdb_iterator_t* iter);
+extern ROCKSDB_LIBRARY_API rocksdb_slice_t
+rocksdb_iter_timestamp_slice(const rocksdb_iterator_t* iter);
+
 extern ROCKSDB_LIBRARY_API void rocksdb_iter_refresh(
     const rocksdb_iterator_t* iter, char** errptr);
 
@@ -860,6 +950,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate(
     rocksdb_writebatch_t*, void* state,
     void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
     void (*deleted)(void*, const char* k, size_t klen));
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_ld(
+    rocksdb_writebatch_t*, void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen),
+    void (*log_data)(void*, const char* blob, size_t blob_len));
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_cf(
     rocksdb_writebatch_t*, void* state,
     void (*put_cf)(void*, uint32_t cfid, const char* k, size_t klen,
@@ -867,6 +962,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_cf(
     void (*deleted_cf)(void*, uint32_t cfid, const char* k, size_t klen),
     void (*merge_cf)(void*, uint32_t cfid, const char* k, size_t klen,
                      const char* v, size_t vlen));
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_cf_ld(
+    rocksdb_writebatch_t*, void* state,
+    void (*put_cf)(void*, uint32_t cfid, const char* k, size_t klen,
+                   const char* v, size_t vlen),
+    void (*deleted_cf)(void*, uint32_t cfid, const char* k, size_t klen),
+    void (*merge_cf)(void*, uint32_t cfid, const char* k, size_t klen,
+                     const char* v, size_t vlen),
+    void (*log_data)(void*, const char* blob, size_t blob_len));
 extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_data(
     rocksdb_writebatch_t*, size_t* size);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_set_save_point(
@@ -986,11 +1089,22 @@ extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
     const rocksdb_readoptions_t* options, const char* key, size_t keylen,
     size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_writebatch_wi_get_pinned_from_batch_and_db(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+    char** errptr);
 extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
     const rocksdb_readoptions_t* options,
     rocksdb_column_family_handle_t* column_family, const char* key,
     size_t keylen, size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_writebatch_wi_get_pinned_from_batch_and_db_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
 extern ROCKSDB_LIBRARY_API void rocksdb_write_writebatch_wi(
     rocksdb_t* db, const rocksdb_writeoptions_t* options,
     rocksdb_writebatch_wi_t* wbwi, char** errptr);
@@ -998,13 +1112,20 @@ extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
 rocksdb_writebatch_wi_create_iterator_with_base(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator);
 extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base_readopts(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    const rocksdb_readoptions_t* options);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
 rocksdb_writebatch_wi_create_iterator_with_base_cf(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
     rocksdb_column_family_handle_t* cf);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base_cf_readopts(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    rocksdb_column_family_handle_t* cf, const rocksdb_readoptions_t* options);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_update_timestamps(
     rocksdb_writebatch_wi_t* wbwi, const char* ts, size_t tslen, void* state,
     size_t (*get_ts_size)(void*, uint32_t), char** errptr);
-
 /* Options utils */
 
 // Load the latest rocksdb options from the specified db_path.
@@ -1088,6 +1209,13 @@ enum {
 extern ROCKSDB_LIBRARY_API void
 rocksdb_block_based_options_set_data_block_index_type(
     rocksdb_block_based_table_options_t*, int);  // uses one of the above enums
+enum {
+  rocksdb_block_based_table_index_block_search_type_binary = 0,
+  rocksdb_block_based_table_index_block_search_type_interpolation = 1,
+};
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_index_block_search_type(
+    rocksdb_block_based_table_options_t*, int);  // uses one of the above enums
 extern ROCKSDB_LIBRARY_API void
 rocksdb_block_based_options_set_data_block_hash_ratio(
     rocksdb_block_based_table_options_t* options, double v);
@@ -1123,8 +1251,150 @@ rocksdb_block_based_options_set_partition_pinning_tier(
 extern ROCKSDB_LIBRARY_API void
 rocksdb_block_based_options_set_unpartitioned_pinning_tier(
     rocksdb_block_based_table_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_align(
+    rocksdb_block_based_table_options_t*, unsigned char);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_manager(
     rocksdb_options_t* opt, rocksdb_write_buffer_manager_t* wbm);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_sst_file_manager(
+    rocksdb_options_t* opt, rocksdb_sst_file_manager_t* sfm);
+
+/* Flush job info */
+
+extern ROCKSDB_LIBRARY_API const char* rocksdb_flushjobinfo_cf_name(
+    const rocksdb_flushjobinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_flushjobinfo_file_path(
+    const rocksdb_flushjobinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_flushjobinfo_triggered_writes_slowdown(const rocksdb_flushjobinfo_t*);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_flushjobinfo_triggered_writes_stop(const rocksdb_flushjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_flushjobinfo_largest_seqno(const rocksdb_flushjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_flushjobinfo_smallest_seqno(const rocksdb_flushjobinfo_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_reset_status(
+    rocksdb_status_ptr_t* status_ptr);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_flushjobinfo_flush_reason(const rocksdb_flushjobinfo_t* info);
+extern ROCKSDB_LIBRARY_API void rocksdb_status_ptr_get_error(
+    rocksdb_status_ptr_t* status, char** errptr);
+
+/* Compaction job info */
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionjobinfo_status(
+    const rocksdb_compactionjobinfo_t* info, char** errptr);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_compactionjobinfo_cf_name(
+    const rocksdb_compactionjobinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_compactionjobinfo_input_files_count(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_compactionjobinfo_input_file_at(
+    const rocksdb_compactionjobinfo_t*, size_t pos, size_t*);
+extern ROCKSDB_LIBRARY_API size_t rocksdb_compactionjobinfo_output_files_count(
+    const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_compactionjobinfo_output_file_at(
+    const rocksdb_compactionjobinfo_t*, size_t pos, size_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_elapsed_micros(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_num_corrupt_keys(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_compactionjobinfo_base_input_level(
+    const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_compactionjobinfo_output_level(
+    const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_input_records(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_output_records(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_total_input_bytes(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_total_output_bytes(
+    const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint32_t rocksdb_compactionjobinfo_compaction_reason(
+    const rocksdb_compactionjobinfo_t* info);
+extern ROCKSDB_LIBRARY_API size_t rocksdb_compactionjobinfo_num_input_files(
+    const rocksdb_compactionjobinfo_t* info);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_compactionjobinfo_num_input_files_at_output_level(
+    const rocksdb_compactionjobinfo_t* info);
+
+/* Subcompaction job info */
+extern ROCKSDB_LIBRARY_API void rocksdb_subcompactionjobinfo_status(
+    const rocksdb_subcompactionjobinfo_t*, char**);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_subcompactionjobinfo_cf_name(
+    const rocksdb_subcompactionjobinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_subcompactionjobinfo_thread_id(const rocksdb_subcompactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_subcompactionjobinfo_base_input_level(
+    const rocksdb_subcompactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_subcompactionjobinfo_output_level(
+    const rocksdb_subcompactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_subcompactionjobinfo_compaction_reason(
+    const rocksdb_subcompactionjobinfo_t* info);
+
+/* External file ingestion info */
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_externalfileingestioninfo_cf_name(
+    const rocksdb_externalfileingestioninfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_externalfileingestioninfo_internal_file_path(
+    const rocksdb_externalfileingestioninfo_t*, size_t*);
+
+/* External write stall info */
+extern ROCKSDB_LIBRARY_API const char* rocksdb_writestallinfo_cf_name(
+    const rocksdb_writestallinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API const rocksdb_writestallcondition_t*
+rocksdb_writestallinfo_cur(const rocksdb_writestallinfo_t*);
+extern ROCKSDB_LIBRARY_API const rocksdb_writestallcondition_t*
+rocksdb_writestallinfo_prev(const rocksdb_writestallinfo_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_memtableinfo_cf_name(
+    const rocksdb_memtableinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_memtableinfo_first_seqno(const rocksdb_memtableinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_memtableinfo_earliest_seqno(const rocksdb_memtableinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_memtableinfo_num_entries(const rocksdb_memtableinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_memtableinfo_num_deletes(const rocksdb_memtableinfo_t*);
+
+/* Event listener */
+
+typedef void (*on_flush_begin_cb)(void*, rocksdb_t*,
+                                  const rocksdb_flushjobinfo_t*);
+typedef void (*on_flush_completed_cb)(void*, rocksdb_t*,
+                                      const rocksdb_flushjobinfo_t*);
+typedef void (*on_compaction_begin_cb)(void*, rocksdb_t*,
+                                       const rocksdb_compactionjobinfo_t*);
+typedef void (*on_compaction_completed_cb)(void*, rocksdb_t*,
+                                           const rocksdb_compactionjobinfo_t*);
+typedef void (*on_subcompaction_begin_cb)(
+    void*, const rocksdb_subcompactionjobinfo_t*);
+typedef void (*on_subcompaction_completed_cb)(
+    void*, const rocksdb_subcompactionjobinfo_t*);
+typedef void (*on_external_file_ingested_cb)(
+    void*, rocksdb_t*, const rocksdb_externalfileingestioninfo_t*);
+typedef void (*on_background_error_cb)(void*, uint32_t, rocksdb_status_ptr_t*);
+typedef void (*on_stall_conditions_changed_cb)(void*,
+                                               const rocksdb_writestallinfo_t*);
+typedef void (*rocksdb_logger_logv_cb)(void*, uint32_t log_level, const char*);
+typedef void (*on_memtable_sealed_cb)(void*, const rocksdb_memtableinfo_t*);
+extern ROCKSDB_LIBRARY_API rocksdb_eventlistener_t*
+rocksdb_eventlistener_create(
+    void* state_, void (*destructor_)(void*), on_flush_begin_cb on_flush_begin,
+    on_flush_completed_cb on_flush_completed,
+    on_compaction_begin_cb on_compaction_begin,
+    on_compaction_completed_cb on_compaction_completed,
+    on_subcompaction_begin_cb on_subcompaction_begin,
+    on_subcompaction_completed_cb on_subcompaction_completed,
+    on_external_file_ingested_cb on_external_file_ingested,
+    on_background_error_cb on_background_error,
+    on_stall_conditions_changed_cb on_stall_conditions_changed,
+    on_memtable_sealed_cb on_memtable_sealed);
+extern ROCKSDB_LIBRARY_API void rocksdb_eventlistener_destroy(
+    rocksdb_eventlistener_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_add_eventlistener(
+    rocksdb_options_t*, rocksdb_eventlistener_t*);
 
 /* Cuckoo table options */
 
@@ -1229,6 +1499,31 @@ rocksdb_logger_create_callback_logger(int log_level,
                                       void* priv);
 extern ROCKSDB_LIBRARY_API void rocksdb_logger_destroy(
     rocksdb_logger_t* logger);
+
+/* File Checksum Gen Factory */
+extern ROCKSDB_LIBRARY_API rocksdb_file_checksum_gen_factory_t*
+rocksdb_file_checksum_gen_crc32c_factory_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_file_checksum_gen_factory_destroy(
+    rocksdb_file_checksum_gen_factory_t* factory);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_file_checksum_gen_factory(
+    rocksdb_options_t*, rocksdb_file_checksum_gen_factory_t*);
+
+/* SST Partitioner Factory */
+extern ROCKSDB_LIBRARY_API rocksdb_sst_partitioner_factory_t*
+rocksdb_sst_partitioner_fixed_prefix_factory_create(size_t prefix_len);
+extern ROCKSDB_LIBRARY_API void rocksdb_sst_partitioner_factory_destroy(
+    rocksdb_sst_partitioner_factory_t* factory);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_sst_partitioner_factory(
+    rocksdb_options_t*, rocksdb_sst_partitioner_factory_t*);
+
+/* Table Properties Collector Factory */
+extern ROCKSDB_LIBRARY_API void
+rocksdb_table_properties_collector_factory_destroy(
+    rocksdb_table_properties_collector_factory_t* factory);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_add_table_properties_collector_factory(
+    rocksdb_options_t*, rocksdb_table_properties_collector_factory_t*);
+
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size(
     rocksdb_options_t*, size_t);
 extern ROCKSDB_LIBRARY_API size_t
@@ -1341,6 +1636,17 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_periodic_compaction_seconds(
     rocksdb_options_t*, uint64_t);
 extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_options_get_periodic_compaction_seconds(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_memtable_op_scan_flush_trigger(rocksdb_options_t*,
+                                                   uint32_t);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_options_get_memtable_op_scan_flush_trigger(rocksdb_options_t*);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_memtable_avg_op_scan_flush_trigger(rocksdb_options_t*,
+                                                       uint32_t);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_options_get_memtable_avg_op_scan_flush_trigger(rocksdb_options_t*);
 
 enum {
   rocksdb_statistics_level_disable_all = 0,
@@ -1362,13 +1668,6 @@ rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
                                                  unsigned char val);
 extern ROCKSDB_LIBRARY_API unsigned char
 rocksdb_options_get_skip_stats_update_on_db_open(rocksdb_options_t* opt);
-extern ROCKSDB_LIBRARY_API void
-rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
-    rocksdb_options_t* opt, unsigned char val);
-extern ROCKSDB_LIBRARY_API unsigned char
-rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
-    rocksdb_options_t* opt);
-
 /* Blob Options Settings */
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files(
     rocksdb_options_t* opt, unsigned char val);
@@ -1448,11 +1747,6 @@ rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
 extern ROCKSDB_LIBRARY_API int
 rocksdb_options_get_min_write_buffer_number_to_merge(rocksdb_options_t*);
 extern ROCKSDB_LIBRARY_API void
-rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*,
-                                                        int);
-extern ROCKSDB_LIBRARY_API int
-rocksdb_options_get_max_write_buffer_number_to_maintain(rocksdb_options_t*);
-extern ROCKSDB_LIBRARY_API void
 rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*,
                                                       int64_t);
 extern ROCKSDB_LIBRARY_API int64_t
@@ -1752,6 +2046,10 @@ extern ROCKSDB_LIBRARY_API void
 rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio(
     rocksdb_options_t*, size_t window_size, size_t num_dels_trigger,
     double deletion_ratio);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_add_compact_on_deletion_collector_factory_min_file_size(
+    rocksdb_options_t*, size_t window_size, size_t num_dels_trigger,
+    double deletion_ratio, uint64_t min_file_size);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush(
     rocksdb_options_t* opt, unsigned char);
 extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush(
@@ -1875,7 +2173,8 @@ enum {
   rocksdb_blob_decompress_time,
   rocksdb_internal_range_del_reseek_count,
   rocksdb_block_read_cpu_time,
-  rocksdb_total_metric_count = 79
+  rocksdb_internal_merge_point_lookup_count,
+  rocksdb_total_metric_count = 80
 };
 
 extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int);
@@ -2012,9 +2311,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing(
     rocksdb_readoptions_t*, unsigned char);
 extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_tailing(
     rocksdb_readoptions_t*);
-// The functionality that this option controlled has been removed.
-extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed(
-    rocksdb_readoptions_t*, unsigned char);
 extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size(
     rocksdb_readoptions_t*, size_t);
 extern ROCKSDB_LIBRARY_API size_t
@@ -2225,6 +2521,51 @@ extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_buffer_size(
 extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall(
     rocksdb_write_buffer_manager_t* wbm, bool new_allow_stall);
 
+/* SstFileManager */
+
+extern ROCKSDB_LIBRARY_API rocksdb_sst_file_manager_t*
+rocksdb_sst_file_manager_create(rocksdb_env_t* env);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_sst_file_manager_destroy(
+    rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_sst_file_manager_set_max_allowed_space_usage(
+    rocksdb_sst_file_manager_t* sfm, uint64_t max_allowed_space);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_sst_file_manager_set_compaction_buffer_size(
+    rocksdb_sst_file_manager_t* sfm, uint64_t compaction_buffer_size);
+
+extern ROCKSDB_LIBRARY_API bool
+rocksdb_sst_file_manager_is_max_allowed_space_reached(
+    rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API bool
+rocksdb_sst_file_manager_is_max_allowed_space_reached_including_compactions(
+    rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_sst_file_manager_get_total_size(rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API int64_t
+rocksdb_sst_file_manager_get_delete_rate_bytes_per_second(
+    rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_sst_file_manager_set_delete_rate_bytes_per_second(
+    rocksdb_sst_file_manager_t* sfm, int64_t delete_rate);
+
+extern ROCKSDB_LIBRARY_API double
+rocksdb_sst_file_manager_get_max_trash_db_ratio(
+    rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_sst_file_manager_set_max_trash_db_ratio(
+    rocksdb_sst_file_manager_t* sfm, double ratio);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_sst_file_manager_get_total_trash_size(rocksdb_sst_file_manager_t* sfm);
+
 /* HyperClockCache */
 
 extern ROCKSDB_LIBRARY_API rocksdb_hyper_clock_cache_options_t*
@@ -2381,10 +2722,9 @@ rocksdb_slicetransform_create(
     char* (*transform)(void*, const char* key, size_t length,
                        size_t* dst_length),
     unsigned char (*in_domain)(void*, const char* key, size_t length),
-    unsigned char (*in_range)(void*, const char* key, size_t length),
     const char* (*name)(void*));
 extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
-    rocksdb_slicetransform_create_fixed_prefix(size_t);
+rocksdb_slicetransform_create_fixed_prefix(size_t);
 extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
 rocksdb_slicetransform_create_noop(void);
 extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy(
@@ -2453,15 +2793,32 @@ rocksdb_fifo_compaction_options_set_max_table_files_size(
 extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_fifo_compaction_options_get_max_table_files_size(
     rocksdb_fifo_compaction_options_t* fifo_opts);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_fifo_compaction_options_set_max_data_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_fifo_compaction_options_get_max_data_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_fifo_compaction_options_set_use_kv_ratio_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts,
+    unsigned char use_kv_ratio_compaction);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_fifo_compaction_options_get_use_kv_ratio_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts);
 extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy(
     rocksdb_fifo_compaction_options_t* fifo_opts);
 
+extern ROCKSDB_LIBRARY_API rocksdb_livefiles_t* rocksdb_livefiles_create(void);
+
 extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count(
     const rocksdb_livefiles_t*);
 extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_column_family_name(
     const rocksdb_livefiles_t*, int index);
 extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name(
     const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_directory(
+    const rocksdb_livefiles_t*, int index);
 extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level(
     const rocksdb_livefiles_t*, int index);
 extern ROCKSDB_LIBRARY_API size_t
@@ -2471,12 +2828,44 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey(
 extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey(
     const rocksdb_livefiles_t*, int index, size_t* size);
 extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_livefiles_smallest_seqno(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_livefiles_largest_seqno(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_livefiles_entries(const rocksdb_livefiles_t*, int index);
 extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_livefiles_deletions(const rocksdb_livefiles_t*, int index);
 extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy(
     const rocksdb_livefiles_t*);
 
+extern ROCKSDB_LIBRARY_API rocksdb_livefile_t* rocksdb_livefile_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_column_family_name(
+    rocksdb_livefile_t*, const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_level(rocksdb_livefile_t*,
+                                                           int);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_name(rocksdb_livefile_t*,
+                                                          const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_directory(
+    rocksdb_livefile_t*, const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_size(rocksdb_livefile_t*,
+                                                          size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_smallest_key(
+    rocksdb_livefile_t*, const char*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_largest_key(
+    rocksdb_livefile_t*, const char*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_smallest_seqno(
+    rocksdb_livefile_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_largest_seqno(
+    rocksdb_livefile_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_num_entries(
+    rocksdb_livefile_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_num_deletions(
+    rocksdb_livefile_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_destroy(rocksdb_livefile_t*);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_add(rocksdb_livefiles_t*,
+                                                      rocksdb_livefile_t*);
+
 /* Utility Helpers */
 
 extern ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string(
@@ -2497,6 +2886,37 @@ extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf(
 extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t*
 rocksdb_get_column_family_metadata(rocksdb_t* db);
 
+extern ROCKSDB_LIBRARY_API rocksdb_import_column_family_options_t*
+rocksdb_import_column_family_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_import_column_family_options_set_move_files(
+    rocksdb_import_column_family_options_t*, unsigned char);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_import_column_family_options_destroy(
+    rocksdb_import_column_family_options_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_export_import_files_metadata_t*
+rocksdb_export_import_files_metadata_create(void);
+
+extern ROCKSDB_LIBRARY_API char*
+rocksdb_export_import_files_metadata_get_db_comparator_name(
+    rocksdb_export_import_files_metadata_t*);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_export_import_files_metadata_set_db_comparator_name(
+    rocksdb_export_import_files_metadata_t*, const char*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_livefiles_t*
+rocksdb_export_import_files_metadata_get_files(
+    rocksdb_export_import_files_metadata_t*);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_export_import_files_metadata_set_files(
+    rocksdb_export_import_files_metadata_t*, rocksdb_livefiles_t*);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_export_import_files_metadata_destroy(
+    rocksdb_export_import_files_metadata_t*);
+
 /**
  * Returns the rocksdb_column_family_metadata_t of the specified
  * column family.
@@ -3130,6 +3550,266 @@ extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_wait_for_compact_options_get_timeout(
     rocksdb_wait_for_compact_options_t* opt);
 
+/* High-performance zero-copy Get variants
+   These functions avoid unnecessary memory allocations and copies.
+   The returned buffer is valid until the handle is destroyed.
+   Bindings should migrate to these for better performance. */
+
+/* Zero-copy get that returns a handle to pinned data.
+   The data remains valid until rocksdb_pinnable_handle_destroy is called.
+   Returns NULL on error or not found. Check errptr to distinguish. */
+typedef struct rocksdb_pinnable_handle_t rocksdb_pinnable_handle_t;
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnable_handle_t* rocksdb_get_pinned_v2(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnable_handle_t* rocksdb_get_pinned_cf_v2(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
+
+/* Get the data pointer and size from a pinnable handle.
+   The data pointer is valid until the handle is destroyed. */
+extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnable_handle_get_value(
+    const rocksdb_pinnable_handle_t* handle, size_t* vallen);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_pinnable_handle_destroy(
+    rocksdb_pinnable_handle_t* handle);
+
+/* Direct get into caller-provided buffer.
+   Returns 1 if value fits in buffer, 0 if buffer too small.
+   Sets *vallen to actual value size.
+   If buffer is too small, no data is copied but *vallen is set. */
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_get_into_buffer(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, char* buffer, size_t buffer_size, size_t* vallen,
+    unsigned char* found, char** errptr);
+
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_get_into_buffer_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char* buffer, size_t buffer_size, size_t* vallen,
+    unsigned char* found, char** errptr);
+
+// Remote compaction
+enum {
+  rocksdb_compactionservice_jobstatus_success = 0,
+  rocksdb_compactionservice_jobstatus_failure = 1,
+  rocksdb_compactionservice_jobstatus_aborted = 2,
+  rocksdb_compactionservice_jobstatus_use_local = 3,
+};
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionservice_scheduleresponse_t*
+rocksdb_compactionservice_scheduleresponse_create(const char* scheduled_job_id,
+                                                  int status, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionservice_scheduleresponse_t*
+rocksdb_compactionservice_scheduleresponse_create_with_status(int status,
+                                                              char** errptr);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_compactionservice_scheduleresponse_getstatus(
+    const rocksdb_compactionservice_scheduleresponse_t* response);
+
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_compactionservice_scheduleresponse_get_scheduled_job_id(
+    const rocksdb_compactionservice_scheduleresponse_t* response, size_t* len);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compactionservice_scheduleresponse_t_destroy(
+    rocksdb_compactionservice_scheduleresponse_t* response);
+
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_compactionservice_jobinfo_t_get_db_name(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len);
+
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_compactionservice_jobinfo_t_get_db_id(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len);
+
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_compactionservice_jobinfo_t_get_db_session_id(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len);
+
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_compactionservice_jobinfo_t_get_cf_name(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len);
+
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_compactionservice_jobinfo_t_get_cf_id(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionservice_jobinfo_t_get_job_id(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_compactionservice_jobinfo_t_get_priority(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_compactionservice_jobinfo_t_get_compaction_reason(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_compactionservice_jobinfo_t_get_base_input_level(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_compactionservice_jobinfo_t_get_output_level(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionservice_jobinfo_t_is_full_compaction(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionservice_jobinfo_t_is_manual_compaction(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionservice_jobinfo_t_is_bottommost_level(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionservice_t*
+rocksdb_compactionservice_create(
+    void* state, void (*destructor)(void*),
+    rocksdb_compaction_service_schedule_cb schedule, const char* name,
+    rocksdb_compaction_service_wait_cb wait,
+    rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs,
+    rocksdb_compaction_service_on_installation_cb on_installation);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_service(
+    rocksdb_options_t* options, rocksdb_compactionservice_t* service);
+
+// CompactionServiceOptionsOverride
+extern ROCKSDB_LIBRARY_API rocksdb_compaction_service_options_override_t*
+rocksdb_compaction_service_options_override_create(void);
+
+extern ROCKSDB_LIBRARY_API rocksdb_compaction_service_options_override_t*
+rocksdb_compaction_service_options_override_create_from_options(
+    rocksdb_options_t* option);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_destroy(
+    rocksdb_compaction_service_options_override_t* override_options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_env(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_env_t* env);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_comparator(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_comparator_t* comparator);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_merge_operator(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_mergeoperator_t* merge_operator);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_compaction_filter(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_compactionfilter_t* compaction_filter);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_compaction_filter_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_compactionfilterfactory_t* compaction_filter_factory);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_prefix_extractor(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_slicetransform_t* prefix_extractor);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_block_based_table_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_block_based_table_options_t* table_options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_cuckoo_table_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_cuckoo_table_options_t* table_options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_add_event_listener(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_eventlistener_t* event_listener);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_statistics(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_info_log(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_logger_t* logger);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_option(
+    rocksdb_compaction_service_options_override_t* override_options,
+    const char* key, const char* value);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_file_checksum_gen_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_file_checksum_gen_factory_t* factory);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_sst_partitioner_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_sst_partitioner_factory_t* factory);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_add_table_properties_collector_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_table_properties_collector_factory_t* factory);
+
+// Atomic bool management for cancellation
+// Creates an atomic bool that can be used for cancellation.
+// User must call rocksdb_open_and_compact_canceled_destroy() to free it.
+extern ROCKSDB_LIBRARY_API unsigned char*
+rocksdb_open_and_compact_canceled_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_canceled_destroy(
+    unsigned char* canceled);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_canceled_set(
+    unsigned char* canceled, unsigned char value);
+
+// OpenAndCompactOptions
+extern ROCKSDB_LIBRARY_API rocksdb_open_and_compact_options_t*
+rocksdb_open_and_compact_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_options_destroy(
+    rocksdb_open_and_compact_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_options_set_canceled(
+    rocksdb_open_and_compact_options_t* options, unsigned char* canceled);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_open_and_compact_options_set_allow_resumption(
+    rocksdb_open_and_compact_options_t* options,
+    unsigned char allow_resumption);
+
+// OpenAndCompact - main functions
+extern ROCKSDB_LIBRARY_API char* rocksdb_open_and_compact(
+    const char* db_path, const char* output_directory, const char* input,
+    size_t input_len, size_t* output_len,
+    const rocksdb_compaction_service_options_override_t* override_options,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_open_and_compact_with_options(
+    const rocksdb_open_and_compact_options_t* options, const char* db_path,
+    const char* output_directory, const char* input, size_t input_len,
+    size_t* output_len,
+    const rocksdb_compaction_service_options_override_t* override_options,
+    char** errptr);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
index 54e9e88aacba..f52d5246bbfe 100644
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@@ -210,7 +210,15 @@ struct ShardedCacheOptions {
 // shard has its own LRU list for evictions. Each shard also has a mutex for
 // exclusive access during operations; even read operations need exclusive
 // access in order to update the LRU list. Mutex contention is usually low
-// with enough shards.
+// with enough shards. However,
+// * For a single hot block, there will be mutex contention even for reads
+// regardless of the number of shards.
+// * LRUCaches in the size of MBs instead of GBs can have shards small enough
+// that there is a random probability of some modest number of large blocks
+// (especially non-partitioned filters) thrashing a single cache shard.
+//
+// HYPERCLOCKCACHE IS NOW GENERALLY RECOMMENDED OVER LRUCACHE. See
+// HyperClockCacheOptions below.
 struct LRUCacheOptions : public ShardedCacheOptions {
   // Ratio of cache reserved for high-priority and low-priority entries,
   // respectively. (See Cache::Priority below more information on the levels.)
@@ -298,13 +306,6 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions {
   // Options specific to the compression algorithm
   CompressionOptions compression_opts;
 
-  // compress_format_version can have two values:
-  // compress_format_version == 1 -- decompressed size is not included in the
-  // block header.
-  // compress_format_version == 2 -- decompressed size is included in the block
-  // header in varint32 format.
-  uint32_t compress_format_version = 2;
-
   // Enable the custom split and merge feature, which split the compressed value
   // into chunks so that they may better fit jemalloc bins.
   bool enable_custom_split_merge = false;
@@ -322,7 +323,6 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions {
       CacheMetadataChargePolicy _metadata_charge_policy =
           kDefaultCacheMetadataChargePolicy,
       CompressionType _compression_type = CompressionType::kLZ4Compression,
-      uint32_t _compress_format_version = 2,
       bool _enable_custom_split_merge = false,
       const CacheEntryRoleSet& _do_not_compress_roles =
           {CacheEntryRole::kFilterBlock})
@@ -331,7 +331,6 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions {
                         _use_adaptive_mutex, _metadata_charge_policy,
                         _low_pri_pool_ratio),
         compression_type(_compression_type),
-        compress_format_version(_compress_format_version),
         enable_custom_split_merge(_enable_custom_split_merge),
         do_not_compress_roles(_do_not_compress_roles) {}
 
@@ -352,7 +351,6 @@ inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
     CacheMetadataChargePolicy metadata_charge_policy =
         kDefaultCacheMetadataChargePolicy,
     CompressionType compression_type = CompressionType::kLZ4Compression,
-    uint32_t compress_format_version = 2,
     bool enable_custom_split_merge = false,
     const CacheEntryRoleSet& _do_not_compress_roles = {
         CacheEntryRole::kFilterBlock}) {
@@ -360,8 +358,7 @@ inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
              capacity, num_shard_bits, strict_capacity_limit,
              high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator,
              use_adaptive_mutex, metadata_charge_policy, compression_type,
-             compress_format_version, enable_custom_split_merge,
-             _do_not_compress_roles)
+             enable_custom_split_merge, _do_not_compress_roles)
       .MakeSharedSecondaryCache();
 }
 
@@ -371,64 +368,50 @@ inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
   return opts.MakeSharedSecondaryCache();
 }
 
-// HyperClockCache - A lock-free Cache alternative for RocksDB block cache
-// that offers much improved CPU efficiency vs. LRUCache under high parallel
-// load or high contention, with some caveats:
+// HyperClockCache (also known as HCC) - A lock-free Cache alternative for
+// RocksDB block cache that offers much improved CPU efficiency vs. LRUCache
+// under high parallel load or high contention. Additionally, HCC only uses
+// sharding for a modest performance boost, so can use much larger cache shards
+// than LRUCache, dramatically reducing the risk of thrashing in configurations
+// or work loads with some large blocks.
+//
+// HYPERCLOCKCACHE IS NOW GENERALLY RECOMMENDED OVER LRUCACHE
+//
+// Some caveats:
 // * Not a general Cache implementation: can only be used for
 // BlockBasedTableOptions::block_cache, which RocksDB uses in a way that is
 // compatible with HyperClockCache.
-// * Requires an extra tuning parameter: see estimated_entry_charge below.
-// Similarly, substantially changing the capacity with SetCapacity could
-// harm efficiency. -> EXPERIMENTAL: the tuning parameter can be set to 0
-// to find the appropriate balance automatically.
 // * Cache priorities are less aggressively enforced, which could cause
 // cache dilution from long range scans (unless they use fill_cache=false).
+// * In some configurations, depends on anonymous mmap support, available in
+// Linux, Windows and more.
+// * May have slightly lower (or slightly higher) cache hit rate vs. LRUCache,
+// because of the bounded counting-CLOCK eviction algorithm.
 //
 // See internal cache/clock_cache.h for full description.
 struct HyperClockCacheOptions : public ShardedCacheOptions {
-  // The estimated average `charge` associated with cache entries.
-  //
-  // EXPERIMENTAL: the field can be set to 0 to size the table dynamically
-  // and automatically. See also min_avg_entry_charge. This feature requires
-  // platform support for lazy anonymous memory mappings (incl Linux, Windows).
-  // Performance is very similar to choosing the best configuration parameter.
-  //
-  // PRODUCTION-TESTED: This is a critical configuration parameter for good
-  // performance, because having a table size that is fixed at creation time
-  // greatly reduces the required synchronization between threads.
-  // * If the estimate is substantially too low (e.g. less than half the true
-  // average) then metadata space overhead with be substantially higher (e.g.
-  // 200 bytes per entry rather than 100). With kFullChargeCacheMetadata, this
-  // can slightly reduce cache hit rates, and slightly reduce access times due
-  // to the larger working memory size.
-  // * If the estimate is substantially too high (e.g. 25% higher than the true
-  // average) then there might not be sufficient slots in the hash table for
-  // both efficient operation and capacity utilization (hit rate). The hyper
-  // cache will evict entries to prevent load factors that could dramatically
-  // affect lookup times, instead letting the hit rate suffer by not utilizing
-  // the full capacity.
+  // OPTIONAL: The estimated average `charge` associated with cache entries.
   //
-  // A reasonable choice is the larger of block_size and metadata_block_size.
-  // When WriteBufferManager (and similar) charge memory usage to the block
-  // cache, this can lead to the same effect as estimate being too low, which
-  // is better than the opposite. Therefore, the general recommendation is to
-  // assume that other memory charged to block cache could be negligible, and
-  // ignore it in making the estimate.
+  // When not provided (== 0, recommended and default), an HCC variant with a
+  // dynamically-growing table and generally good performance is used. This
+  // variant depends on anonymous mmaps so might not be available on all
+  // platforms.
   //
-  // The best parameter choice based on a cache in use is given by
-  // GetUsage() / GetOccupancyCount(), ignoring metadata overheads such as
-  // with kDontChargeCacheMetadata. More precisely with
-  // kFullChargeCacheMetadata is (GetUsage() - 64 * GetTableAddressCount()) /
-  // GetOccupancyCount(). However, when the average value size might vary
-  // (e.g. balance between metadata and data blocks in cache), it is better
-  // to estimate toward the lower side than the higher side.
+  // If the average "charge" (uncompressed block size) of block cache entries
+  // is reasonably predicted and provided here, the most efficient variant of
+  // HCC is used. Performance is degraded if the prediction is inaccurate.
+  // Prediction could be difficult or impossible with cache-charging features
+  // such as WriteBufferManager. The best parameter choice based on a cache
+  // in use is roughly given by GetUsage() / GetOccupancyCount(), though it is
+  // better to estimate toward the lower side than the higher side when the
+  // ratio might vary.
   size_t estimated_entry_charge;
 
-  // EXPERIMENTAL: When estimated_entry_charge == 0, this parameter establishes
-  // a promised lower bound on the average charge of all entries in the table,
-  // which is roughly the average uncompressed SST block size of block cache
-  // entries, typically > 4KB. The default should generally suffice with almost
-  // no cost. (This option is ignored for estimated_entry_charge > 0.)
+  // When estimated_entry_charge == 0, this parameter establishes a promised
+  // lower bound on the average charge of all entries in the table, which is
+  // roughly the average uncompressed SST block size of block cache entries,
+  // typically > 4KB. The default should generally suffice with almost no cost.
+  // (This option is ignored for estimated_entry_charge > 0.)
   //
   // More detail: The table for indexing cache entries will grow automatically
   // as needed, but a hard upper bound on that size is needed at creation time.
@@ -478,8 +461,8 @@ struct HyperClockCacheOptions : public ShardedCacheOptions {
   // keep operations very fast.
   int eviction_effort_cap = 30;
 
-  HyperClockCacheOptions(
-      size_t _capacity, size_t _estimated_entry_charge,
+  explicit HyperClockCacheOptions(
+      size_t _capacity, size_t _estimated_entry_charge = 0,
       int _num_shard_bits = -1, bool _strict_capacity_limit = false,
       std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
       CacheMetadataChargePolicy _metadata_charge_policy =
diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h
index 66f2f390e7d1..68a7116de9bd 100644
--- a/include/rocksdb/compaction_filter.h
+++ b/include/rocksdb/compaction_filter.h
@@ -284,9 +284,7 @@ class CompactionFilter : public Customizable {
       std::string* new_value,
       std::vector<std::pair<std::string, std::string>>* /* new_columns */,
       std::string* skip_until) const {
-#ifdef NDEBUG
     (void)existing_columns;
-#endif
 
     assert(!existing_value || !existing_columns);
     assert(value_type == ValueType::kWideColumnEntity || existing_value);
diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h
index 91709795a176..c9476d70a78d 100644
--- a/include/rocksdb/compaction_job_stats.h
+++ b/include/rocksdb/compaction_job_stats.h
@@ -24,15 +24,18 @@ struct CompactionJobStats {
   // the elapsed CPU time of this compaction in microseconds.
   uint64_t cpu_micros = 0;
 
-  // Used internally indicating whether a subcompaction's
-  // `num_input_records` is accurate.
-  bool has_num_input_records = false;
+  // True if `num_input_records` is accurate across all subcompactions.
+  // See CompactionIterator::must_count_input_entries for some implementation
+  // details why `num_input_records` may not be accurate.
+  bool has_accurate_num_input_records = true;
   // the number of compaction input records.
   uint64_t num_input_records = 0;
   // the number of blobs read from blob files
   uint64_t num_blobs_read = 0;
   // the number of compaction input files (table files)
   size_t num_input_files = 0;
+  // The number of input files that get trivially moved.
+  size_t num_input_files_trivially_moved = 0;
   // the number of compaction input files at the output level (table files)
   size_t num_input_files_at_output_level = 0;
   // the number of compaction input files that are filtered out by compaction
@@ -118,6 +121,6 @@ struct CompactionJobStats {
   // number of single-deletes which meet something other than a put
   uint64_t num_single_del_mismatch = 0;
 
-  // TODO: Add output_to_penultimate_level output information
+  // TODO: Add output_to_proximal_level output information
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/compression_type.h b/include/rocksdb/compression_type.h
index 2ca742aa3853..2261a44439b9 100644
--- a/include/rocksdb/compression_type.h
+++ b/include/rocksdb/compression_type.h
@@ -18,14 +18,148 @@ namespace ROCKSDB_NAMESPACE {
 enum CompressionType : unsigned char {
   // NOTE: do not change the values of existing entries, as these are
   // part of the persistent format on disk.
-  kNoCompression = 0x0,
-  kSnappyCompression = 0x1,
-  kZlibCompression = 0x2,
-  kBZip2Compression = 0x3,
-  kLZ4Compression = 0x4,
-  kLZ4HCCompression = 0x5,
-  kXpressCompression = 0x6,
-  kZSTD = 0x7,
+  kNoCompression = 0x00,
+  kSnappyCompression = 0x01,
+  kZlibCompression = 0x02,
+  kBZip2Compression = 0x03,
+  kLZ4Compression = 0x04,
+  kLZ4HCCompression = 0x05,
+  kXpressCompression = 0x06,
+  kZSTD = 0x07,
+  kLastBuiltinCompression = kZSTD,
+
+  // Reserved for future use: up to 0x7F
+
+  // For use by user custom CompressionManagers
+  kCustomCompression80 = 0x80,
+  kFirstCustomCompression = kCustomCompression80,
+  kCustomCompression81 = 0x81,
+  kCustomCompression82 = 0x82,
+  kCustomCompression83 = 0x83,
+  kCustomCompression84 = 0x84,
+  kCustomCompression85 = 0x85,
+  kCustomCompression86 = 0x86,
+  kCustomCompression87 = 0x87,
+  kCustomCompression88 = 0x88,
+  kCustomCompression89 = 0x89,
+  kCustomCompression8A = 0x8A,
+  kCustomCompression8B = 0x8B,
+  kCustomCompression8C = 0x8C,
+  kCustomCompression8D = 0x8D,
+  kCustomCompression8E = 0x8E,
+  kCustomCompression8F = 0x8F,
+  kCustomCompression90 = 0x90,
+  kCustomCompression91 = 0x91,
+  kCustomCompression92 = 0x92,
+  kCustomCompression93 = 0x93,
+  kCustomCompression94 = 0x94,
+  kCustomCompression95 = 0x95,
+  kCustomCompression96 = 0x96,
+  kCustomCompression97 = 0x97,
+  kCustomCompression98 = 0x98,
+  kCustomCompression99 = 0x99,
+  kCustomCompression9A = 0x9A,
+  kCustomCompression9B = 0x9B,
+  kCustomCompression9C = 0x9C,
+  kCustomCompression9D = 0x9D,
+  kCustomCompression9E = 0x9E,
+  kCustomCompression9F = 0x9F,
+  kCustomCompressionA0 = 0xA0,
+  kCustomCompressionA1 = 0xA1,
+  kCustomCompressionA2 = 0xA2,
+  kCustomCompressionA3 = 0xA3,
+  kCustomCompressionA4 = 0xA4,
+  kCustomCompressionA5 = 0xA5,
+  kCustomCompressionA6 = 0xA6,
+  kCustomCompressionA7 = 0xA7,
+  kCustomCompressionA8 = 0xA8,
+  kCustomCompressionA9 = 0xA9,
+  kCustomCompressionAA = 0xAA,
+  kCustomCompressionAB = 0xAB,
+  kCustomCompressionAC = 0xAC,
+  kCustomCompressionAD = 0xAD,
+  kCustomCompressionAE = 0xAE,
+  kCustomCompressionAF = 0xAF,
+  kCustomCompressionB0 = 0xB0,
+  kCustomCompressionB1 = 0xB1,
+  kCustomCompressionB2 = 0xB2,
+  kCustomCompressionB3 = 0xB3,
+  kCustomCompressionB4 = 0xB4,
+  kCustomCompressionB5 = 0xB5,
+  kCustomCompressionB6 = 0xB6,
+  kCustomCompressionB7 = 0xB7,
+  kCustomCompressionB8 = 0xB8,
+  kCustomCompressionB9 = 0xB9,
+  kCustomCompressionBA = 0xBA,
+  kCustomCompressionBB = 0xBB,
+  kCustomCompressionBC = 0xBC,
+  kCustomCompressionBD = 0xBD,
+  kCustomCompressionBE = 0xBE,
+  kCustomCompressionBF = 0xBF,
+  kCustomCompressionC0 = 0xC0,
+  kCustomCompressionC1 = 0xC1,
+  kCustomCompressionC2 = 0xC2,
+  kCustomCompressionC3 = 0xC3,
+  kCustomCompressionC4 = 0xC4,
+  kCustomCompressionC5 = 0xC5,
+  kCustomCompressionC6 = 0xC6,
+  kCustomCompressionC7 = 0xC7,
+  kCustomCompressionC8 = 0xC8,
+  kCustomCompressionC9 = 0xC9,
+  kCustomCompressionCA = 0xCA,
+  kCustomCompressionCB = 0xCB,
+  kCustomCompressionCC = 0xCC,
+  kCustomCompressionCD = 0xCD,
+  kCustomCompressionCE = 0xCE,
+  kCustomCompressionCF = 0xCF,
+  kCustomCompressionD0 = 0xD0,
+  kCustomCompressionD1 = 0xD1,
+  kCustomCompressionD2 = 0xD2,
+  kCustomCompressionD3 = 0xD3,
+  kCustomCompressionD4 = 0xD4,
+  kCustomCompressionD5 = 0xD5,
+  kCustomCompressionD6 = 0xD6,
+  kCustomCompressionD7 = 0xD7,
+  kCustomCompressionD8 = 0xD8,
+  kCustomCompressionD9 = 0xD9,
+  kCustomCompressionDA = 0xDA,
+  kCustomCompressionDB = 0xDB,
+  kCustomCompressionDC = 0xDC,
+  kCustomCompressionDD = 0xDD,
+  kCustomCompressionDE = 0xDE,
+  kCustomCompressionDF = 0xDF,
+  kCustomCompressionE0 = 0xE0,
+  kCustomCompressionE1 = 0xE1,
+  kCustomCompressionE2 = 0xE2,
+  kCustomCompressionE3 = 0xE3,
+  kCustomCompressionE4 = 0xE4,
+  kCustomCompressionE5 = 0xE5,
+  kCustomCompressionE6 = 0xE6,
+  kCustomCompressionE7 = 0xE7,
+  kCustomCompressionE8 = 0xE8,
+  kCustomCompressionE9 = 0xE9,
+  kCustomCompressionEA = 0xEA,
+  kCustomCompressionEB = 0xEB,
+  kCustomCompressionEC = 0xEC,
+  kCustomCompressionED = 0xED,
+  kCustomCompressionEE = 0xEE,
+  kCustomCompressionEF = 0xEF,
+  kCustomCompressionF0 = 0xF0,
+  kCustomCompressionF1 = 0xF1,
+  kCustomCompressionF2 = 0xF2,
+  kCustomCompressionF3 = 0xF3,
+  kCustomCompressionF4 = 0xF4,
+  kCustomCompressionF5 = 0xF5,
+  kCustomCompressionF6 = 0xF6,
+  kCustomCompressionF7 = 0xF7,
+  kCustomCompressionF8 = 0xF8,
+  kCustomCompressionF9 = 0xF9,
+  kCustomCompressionFA = 0xFA,
+  kCustomCompressionFB = 0xFB,
+  kCustomCompressionFC = 0xFC,
+  kCustomCompressionFD = 0xFD,
+  kCustomCompressionFE = 0xFE,
+  kLastCustomCompression = kCustomCompressionFE,
 
   // kDisableCompressionOption is used to disable some compression options.
   kDisableCompressionOption = 0xff,
@@ -92,11 +226,15 @@ struct CompressionOptions {
   // The training data will be used to generate a dictionary of max_dict_bytes.
   uint32_t zstd_max_train_bytes = 0;
 
-  // Number of threads for parallel compression.
-  // Parallel compression is enabled only if threads > 1.
-  // THE FEATURE IS STILL EXPERIMENTAL
+  // Number of threads for parallel compression for each running flush or
+  // compaction job. Parallel compression is enabled only if threads > 1. Not
+  // recommended for lightweight compression algorithms such as Snappy, LZ4, and
+  // obviously kNoCompression because there is unlikely to be a throughput gain.
   //
-  // This option is valid only when BlockBasedTable is used.
+  // This option is valid only when BlockBasedTable is used and is disabled
+  // (sanitized to 1) with any of these:
+  // * User-defined index (UserDefinedIndexFactory)
+  // * partition_filters == true && decouple_partitioned_filters == false
   //
   // When parallel compression is enabled, SST size file sizes might be
   // more inflated compared to the target size, because more data of unknown
@@ -175,9 +313,10 @@ struct CompressionOptions {
     max_compressed_bytes_per_kb = static_cast<int>(1024.0 / min_ratio + 0.5);
   }
 
-#if __cplusplus >= 202002L
   bool operator==(const CompressionOptions& rhs) const = default;
-#endif
 };
 
+// See advanced_compression.h
+class CompressionManager;
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h
index 27127fbebfbf..95bfe2c692b6 100644
--- a/include/rocksdb/convenience.h
+++ b/include/rocksdb/convenience.h
@@ -450,6 +450,22 @@ Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
 // Delete files in multiple ranges at once
 // Delete files in a lot of ranges one at a time can be slow, use this API for
 // better performance in that case.
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+                           const RangeOpt* ranges, size_t n,
+                           bool include_end = true);
+
+// DEPRECATED
+struct RangePtr {
+  // In case of user_defined timestamp, if enabled, `start` and `limit` should
+  // point to key without timestamp part.
+  const Slice* start;
+  const Slice* limit;
+
+  RangePtr() : start(nullptr), limit(nullptr) {}
+  RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
+};
+
+// DEPRECATED
 Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
                            const RangePtr* ranges, size_t n,
                            bool include_end = true);
diff --git a/include/rocksdb/data_structure.h b/include/rocksdb/data_structure.h
index ffab82c514a5..7563a83abfcf 100644
--- a/include/rocksdb/data_structure.h
+++ b/include/rocksdb/data_structure.h
@@ -7,34 +7,48 @@
 
 #include <assert.h>
 
+#include <array>
 #include <cstddef>
 #include <cstdint>
-#include <vector>
+#include <functional>
+#include <set>
+#include <variant>
 
+#include "rocksdb/comparator.h"
 #include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
 
 namespace ROCKSDB_NAMESPACE {
 
 namespace detail {
 int CountTrailingZeroBitsForSmallEnumSet(uint64_t);
+int BitsSetToOneForSmallEnumSet(uint64_t);
 }  // namespace detail
 
-// Represents a set of values of some enum type with a small number of
-// possible enumerators. For now, it supports enums where no enumerator
-// exceeds 63 when converted to int.
+// Represents a set of values of some enum type with a small number of possible
+// enumerators. Assumes that any combination of enumerators with values 0
+// through MAX_ENUMERATOR (inclusive) might be part of the set. NOTE: would like
+// to use std::bitset, but it doesn't support constexpr (in C++17) operations
+// and doesn't support efficient iteration over sparse "set to true" entries.
 template <typename ENUM_TYPE, ENUM_TYPE MAX_ENUMERATOR>
 class SmallEnumSet {
  private:
-  using StateT = uint64_t;
-  static constexpr int kStateBits = sizeof(StateT) * 8;
-  static constexpr int kMaxMax = kStateBits - 1;
   static constexpr int kMaxValue = static_cast<int>(MAX_ENUMERATOR);
   static_assert(kMaxValue >= 0);
-  static_assert(kMaxValue <= kMaxMax);
+  static_assert(kMaxValue < 1024, "MAX_ENUMERATOR is suspiciously large");
+  using PieceT = uint64_t;
+  static constexpr int kPieceBits = 64;
+  static constexpr int kPieceMask = 63;
+  static constexpr int kPieceShift = 6;
+  static constexpr int kPieceCount = kMaxValue / kPieceBits + 1;
+  using StateT = std::array<PieceT, kPieceCount>;
+  static constexpr int kStateBits = kPieceBits * kPieceCount;
+  static_assert(kStateBits == sizeof(StateT) * 8);
+  static_assert(kMaxValue <= kStateBits - 1);
 
  public:
-  // construct / create
-  SmallEnumSet() : state_(0) {}
+  // construct / create empty set
+  SmallEnumSet() : state_{} {}
 
   template <class... TRest>
   /*implicit*/ constexpr SmallEnumSet(const ENUM_TYPE e, TRest... rest) {
@@ -44,8 +58,16 @@ class SmallEnumSet {
   // Return the set that includes all valid values, assuming the enum
   // is "dense" (includes all values converting to 0 through kMaxValue)
   static constexpr SmallEnumSet All() {
-    StateT tmp = StateT{1} << kMaxValue;
-    return SmallEnumSet(RawStateMarker(), tmp | (tmp - 1));
+    StateT tmp;
+    for (int i = 0; i < kPieceCount - 1; ++i) {
+      tmp[i] = ~PieceT{0};
+    }
+    if constexpr (((kMaxValue + 1) & kPieceMask) != 0) {
+      tmp[kPieceCount - 1] = (PieceT{1} << ((kMaxValue + 1) & kPieceMask)) - 1;
+    } else {
+      tmp[kPieceCount - 1] = ~PieceT{0};
+    }
+    return SmallEnumSet(RawStateMarker(), tmp);
   }
 
   // equality
@@ -60,11 +82,17 @@ class SmallEnumSet {
   bool Contains(const ENUM_TYPE e) const {
     int value = static_cast<int>(e);
     assert(value >= 0 && value <= kMaxValue);
-    StateT tmp = 1;
-    return state_ & (tmp << value);
+    return GetPiece(value) & (PieceT{1} << (value & kPieceMask));
   }
 
-  bool empty() const { return state_ == 0; }
+  bool empty() const {
+    for (int i = 0; i < kPieceCount; ++i) {
+      if (state_[i] != 0) {
+        return false;
+      }
+    }
+    return true;
+  }
 
   // iterator
   class const_iterator {
@@ -92,7 +120,7 @@ class SmallEnumSet {
       if (pos_ < kMaxValue) {
         pos_ = set_->SkipUnset(pos_ + 1);
       } else {
-        pos_ = kStateBits;
+        pos_ = kMaxValue + 1;
       }
       return *this;
     }
@@ -118,7 +146,15 @@ class SmallEnumSet {
 
   const_iterator begin() const { return const_iterator(this, SkipUnset(0)); }
 
-  const_iterator end() const { return const_iterator(this, kStateBits); }
+  const_iterator end() const { return const_iterator(this, kMaxValue + 1); }
+
+  size_t count() const {
+    size_t rv = 0;
+    for (int i = 0; i < kPieceCount; ++i) {
+      rv += static_cast<size_t>(detail::BitsSetToOneForSmallEnumSet(state_[i]));
+    }
+    return rv;
+  }
 
   // mutable ops
 
@@ -127,9 +163,10 @@ class SmallEnumSet {
   bool Add(const ENUM_TYPE e) {
     int value = static_cast<int>(e);
     assert(value >= 0 && value <= kMaxValue);
-    StateT old_state = state_;
-    state_ |= (StateT{1} << value);
-    return old_state != state_;
+    PieceT& piece_ref = RefPiece(value);
+    PieceT old_piece = piece_ref;
+    piece_ref |= (PieceT{1} << (value & kPieceMask));
+    return old_piece != piece_ref;
   }
 
   // Modifies the set (if needed) not to include the given value. Returns true
@@ -137,18 +174,20 @@ class SmallEnumSet {
   bool Remove(const ENUM_TYPE e) {
     int value = static_cast<int>(e);
     assert(value >= 0 && value <= kMaxValue);
-    StateT old_state = state_;
-    state_ &= ~(StateT{1} << value);
-    return old_state != state_;
+    PieceT& piece_ref = RefPiece(value);
+    PieceT old_piece = piece_ref;
+    piece_ref &= ~(PieceT{1} << (value & kPieceMask));
+    return old_piece != piece_ref;
   }
 
   // applicative ops
 
   // Return a new set based on this one with the additional value(s) inserted
   constexpr SmallEnumSet With(const ENUM_TYPE e) const {
-    int value = static_cast<int>(e);
-    assert(value >= 0 && value <= kMaxValue);
-    return SmallEnumSet(RawStateMarker(), state_ | (StateT{1} << value));
+    assert(static_cast<int>(e) >= 0 && static_cast<int>(e) <= kMaxValue);
+    SmallEnumSet rv(*this);
+    rv.Add(e);
+    return rv;
   }
   template <class... TRest>
   constexpr SmallEnumSet With(const ENUM_TYPE e1, const ENUM_TYPE e2,
@@ -158,9 +197,10 @@ class SmallEnumSet {
 
   // Return a new set based on this one excluding the given value(s)
   constexpr SmallEnumSet Without(const ENUM_TYPE e) const {
-    int value = static_cast<int>(e);
-    assert(value >= 0 && value <= kMaxValue);
-    return SmallEnumSet(RawStateMarker(), state_ & ~(StateT{1} << value));
+    assert(static_cast<int>(e) >= 0 && static_cast<int>(e) <= kMaxValue);
+    SmallEnumSet rv(*this);
+    rv.Remove(e);
+    return rv;
   }
   template <class... TRest>
   constexpr SmallEnumSet Without(const ENUM_TYPE e1, const ENUM_TYPE e2,
@@ -170,17 +210,568 @@ class SmallEnumSet {
 
  private:
   int SkipUnset(int pos) const {
-    StateT tmp = state_ >> pos;
-    if (tmp == 0) {
-      return kStateBits;
-    } else {
-      return pos + detail::CountTrailingZeroBitsForSmallEnumSet(tmp);
+    while (pos <= kMaxValue) {
+      PieceT remainder = GetPiece(pos) >> (pos & kPieceMask);
+      if (remainder != 0) {
+        return pos + detail::CountTrailingZeroBitsForSmallEnumSet(remainder);
+      }
+      pos = (pos + kPieceBits) & ~kPieceMask;
     }
+    return kMaxValue + 1;
   }
   struct RawStateMarker {};
   explicit SmallEnumSet(RawStateMarker, StateT state) : state_(state) {}
+  PieceT GetPiece(int pos) const {
+    if constexpr (kPieceCount == 1) {
+      return state_[0];
+    } else {
+      return state_[pos >> kPieceShift];
+    }
+  }
+  PieceT& RefPiece(int pos) {
+    if constexpr (kPieceCount == 1) {
+      return state_[0];
+    } else {
+      return state_[pos >> kPieceShift];
+    }
+  }
 
   StateT state_;
 };
 
+// A smart pointer that tracks an object and an owner, using a statically
+// determined function on those to reclaim the object, if both object and owner
+// are non-null
+template <typename T, class Owner, auto Fn>
+class ManagedPtr {
+ public:
+  ManagedPtr() = default;
+  ManagedPtr(T* ptr, Owner* owner) : ptr_(ptr), owner_(owner) {}
+  ~ManagedPtr() { Free(); }
+  // No copies
+  ManagedPtr(const ManagedPtr&) = delete;
+  ManagedPtr& operator=(const ManagedPtr&) = delete;
+  // Moves
+  ManagedPtr(ManagedPtr&& other) noexcept {
+    ptr_ = other.ptr_;
+    owner_ = other.owner_;
+    other.ptr_ = nullptr;
+    other.owner_ = nullptr;
+  }
+  ManagedPtr& operator=(ManagedPtr&& other) noexcept {
+    if (this == &other) {
+      return *this;
+    }
+    Free();
+    ptr_ = other.ptr_;
+    owner_ = other.owner_;
+    other.ptr_ = nullptr;
+    other.owner_ = nullptr;
+    return *this;
+  }
+
+  T* get() const { return ptr_; }
+  T* operator->() const { return ptr_; }
+  T& operator*() const { return *ptr_; }
+  operator bool() const { return ptr_ != nullptr; }
+
+  Owner* owner() const { return owner_; }
+
+ private:
+  T* ptr_ = nullptr;
+  Owner* owner_ = nullptr;
+
+  void Free() {
+    if (ptr_ && owner_) {
+      if constexpr (std::is_member_function_pointer_v<decltype(Fn)>) {
+        (owner_->*Fn)(ptr_);
+      } else {
+        Fn(owner_, ptr_);
+      }
+    }
+  }
+};
+
+template <typename T, typename comp>
+class Interval;
+
+// The Interval Class is a generic class for holding a range, for example [2,
+// 4]. It can be used within the IntervalSet class, which is able to keep an
+// ordered, non-intersecting set of intervals within it.  Intervals can have
+// open-ended end points, (i.e., to infinity) for example [2,).
+template <typename T, typename comp = std::less<T>>
+class Interval {
+ public:
+  enum class End { INF };
+  struct CompareVariant {
+    comp comparator;
+    bool operator()(const std::variant<T, End>& a,
+                    const std::variant<T, End>& b) const {
+      if (std::holds_alternative<T>(a) && std::holds_alternative<T>(b)) {
+        return comparator(std::get<T>(a), std::get<T>(b));
+      }
+      if (std::holds_alternative<End>(a) && std::holds_alternative<End>(b)) {
+        return false;
+      }
+      if (std::holds_alternative<T>(a) && std::holds_alternative<End>(b)) {
+        return false;
+      }
+      return true;  // std::holds_alternative<End>(a) &&
+                    // std::holds_alternative<T>(b)
+    }
+  };
+
+  /* implicit */ Interval(const T& start, const T& end)
+      : start_(start), end_(end) {}
+  /* implicit */ Interval(const T& start) : start_(start), end_(End::INF) {}
+
+  // Add constructor that takes a pair
+  /* implicit */ Interval(const std::pair<T, T>& p)
+      : start_(p.first), end_(p.second) {}
+
+  T& start() { return start_; }
+
+  const T& start() const { return start_; }
+
+  bool has_end() const { return std::holds_alternative<T>(end_); }
+
+  T& end() { return std::get<T>(end_); }
+
+  const T& end() const { return std::get<T>(end_); }
+
+  // Support comparison with std::pair
+  bool operator==(const std::pair<T, T>& p) const {
+    return start_ == p.first && has_end() && end() == p.second;
+  }
+
+  // Support comparison with another Interval
+  bool operator==(const Interval& other) const {
+    if (start_ != other.start_) {
+      return false;
+    }
+
+    // Both have infinite end
+    if (!has_end() && !other.has_end()) {
+      return true;
+    }
+
+    // One has infinite end, the other doesn't
+    if (has_end() != other.has_end()) {
+      return false;
+    }
+
+    // Both have finite end
+    return end() == other.end();
+  }
+
+  // Support comparison with another Interval
+  bool operator<(const Interval& other) const {
+    return comparator(start_, other.start_);
+  }
+
+  bool Compare(const Interval& other) const {
+    return comparator(start_, other.start_);
+  }
+
+ private:
+  T start_;
+  std::variant<T, End> end_;
+  comp comparator;
+};
+
+// Specialized version of Interval for Slice
+template <>
+class Interval<Slice, Comparator> {
+ public:
+  enum class End { INF };
+
+  // Constructors that take a Comparator
+  /* implicit */ Interval(const Comparator* c, const Slice& start,
+                          const Slice& end)
+      : start_(start), end_(end), comparator_(c) {}
+
+  /* implicit */ Interval(const Comparator* c, const Slice& start)
+      : start_(start), end_(End::INF), comparator_(c) {}
+
+  // Constructor that takes a pair
+  /* implicit */ Interval(const Comparator* c, const std::pair<Slice, Slice>& p)
+      : start_(p.first), end_(p.second), comparator_(c) {}
+
+  Slice& start() { return start_; }
+
+  const Slice& start() const { return start_; }
+
+  bool has_end() const { return std::holds_alternative<Slice>(end_); }
+
+  Slice& end() { return std::get<Slice>(end_); }
+
+  const Slice& end() const { return std::get<Slice>(end_); }
+
+  // Support comparison with std::pair
+  bool operator==(const std::pair<Slice, Slice>& p) const {
+    return start_ == p.first && has_end() && end() == p.second;
+  }
+
+  // Support comparison with another Interval
+  bool operator==(const Interval& other) const {
+    if (comparator_->Compare(start_, other.start_) != 0) {
+      return false;
+    }
+
+    // Both have infinite end
+    if (!has_end() && !other.has_end()) {
+      return true;
+    }
+
+    // One has infinite end, the other doesn't
+    if (has_end() != other.has_end()) {
+      return false;
+    }
+
+    // Both have finite end
+    return comparator_->Compare(end(), other.end()) == 0;
+  }
+
+  // Support comparison with another Interval
+  bool operator<(const Interval& other) const {
+    return comparator_->Compare(start_, other.start_) < 0;
+  }
+
+  bool Compare(const Interval& other) const {
+    return comparator_->Compare(start_, other.start_) < 0;
+  }
+
+  const Comparator* GetComparator() const { return comparator_; }
+
+ private:
+  Slice start_;
+  std::variant<Slice, End> end_;
+  const Comparator* comparator_;
+
+  std::unordered_map<std::string, std::string> property_bag;
+};
+
+template <typename T, typename Compare = std::less<T>>
+struct CompareInterval {
+  bool operator()(const Interval<T, Compare>& a,
+                  const Interval<T, Compare>& b) const {
+    return a.Compare(b);
+  }
+};
+
+// IntervalSet will be used to represent a set of intervals (including unbounded
+// ones). The intervals are unique and disjoint. Intervals that are inserted
+// will merge with any range they intersect with.
+template <typename T, typename Compare = typename Interval<T>::CompareVariant>
+class IntervalSet {
+ public:
+  IntervalSet(Compare c = Compare()) : comp_(c) {}
+
+  void insert(Interval<T>&& i) { insertImpl(i); }
+
+  void insert(const T& start, const T& end) {
+    insertImpl(Interval<T>(start, end));
+  }
+
+  void insert(const T& start) { insertImpl(Interval<T>(start)); }
+
+  bool empty() const { return intervals_.empty(); }
+  void clear() { intervals_.clear(); }
+
+  auto begin() { return intervals_.begin(); }
+  auto end() { return intervals_.end(); }
+
+  auto cbegin() const { return intervals_.cbegin(); }
+  auto cend() const { return intervals_.cend(); }
+
+  size_t size() const { return intervals_.size(); }
+
+ private:
+  void insertImpl(const Interval<T>& i) {
+    // Skip empty intervals
+    if (i.has_end() && !comp_(i.start(), i.end()) &&
+        !comp_(i.end(), i.start())) {
+      return;
+    }
+
+    // First, check if there's any infinite interval that would contain this one
+    for (auto it = intervals_.begin(); it != intervals_.end(); ++it) {
+      if (!it->has_end() && !comp_(i.start(), it->start())) {
+        // This interval starts at or after an infinite interval
+        return;
+      }
+    }
+
+    // Find the position where the interval should be inserted
+    auto it = intervals_.begin();
+    while (it != intervals_.end() && comp_(it->start(), i.start())) {
+      ++it;
+    }
+
+    // Check if we need to consider the previous interval
+    if (it != intervals_.begin()) {
+      --it;
+      if (it->has_end() && comp_(it->end(), i.start())) {
+        ++it;
+      }
+    }
+
+    T new_start = i.start();
+    T new_end;
+    bool inf_end = false;
+    if (i.has_end()) {
+      new_end = i.end();
+    } else {
+      // For infinite end intervals, we need to merge all intervals that start
+      // after new_start
+      std::vector<decltype(it)> to_erase;
+      while (it != intervals_.end()) {
+        new_start = comp_(it->start(), new_start) ? it->start() : new_start;
+        to_erase.push_back(it++);
+      }
+
+      for (auto& eit : to_erase) {
+        intervals_.erase(eit);
+      }
+
+      // Insert the new interval with infinite end
+      intervals_.insert(Interval<T>(new_start));
+      return;
+    }
+
+    // For finite end intervals, proceed as before
+    std::vector<decltype(it)> to_erase;
+    while (it != intervals_.end() && !comp_(new_end, it->start())) {
+      if (it->has_end() && comp_(it->end(), new_start)) {
+        ++it;
+        continue;
+      }
+      new_start = comp_(it->start(), new_start) ? it->start() : new_start;
+      if (it->has_end()) {
+        new_end = comp_(new_end, it->end()) ? it->end() : new_end;
+      } else {
+        // If we encounter an interval with infinite end, our new interval also
+        // becomes infinite
+        inf_end = true;
+        break;
+      }
+      to_erase.push_back(it++);
+    }
+
+    // Check for any infinite intervals that start after this one
+    auto check_it = it;
+    while (check_it != intervals_.end()) {
+      if (!check_it->has_end()) {
+        inf_end = true;
+        to_erase.push_back(check_it);
+      }
+      ++check_it;
+    }
+
+    for (auto& eit : to_erase) {
+      intervals_.erase(eit);
+    }
+
+    if (inf_end) {
+      intervals_.insert(Interval<T>(new_start));
+    } else {
+      intervals_.insert(Interval<T>(new_start, new_end));
+    }
+  }
+
+  std::set<Interval<T>, CompareInterval<T>> intervals_;
+  Compare comp_;
+};
+
+// Specialization of IntervalSet for Slices.
+// Slice based intervals can have properties attached to them. This is used to
+// push down properties in the MultiScan API.  We accept two modes with
+// IntervalSet, fail_on_intersect, which imposes a restriction that inserted
+// ranges will be disjoint, this is needed when using properties. Insert will
+// fail if a range is found to not be disjoint. When fail_on_instersect is
+// false, the ranges will be merged.
+template <>
+class IntervalSet<Slice, Comparator> {
+ public:
+  explicit IntervalSet(const Comparator* c, bool fail_on_intersect = false)
+      : comp_(c), prop_(fail_on_intersect) {}
+
+  // Insert returns true if the interval was inserted. False indicates that the
+  // interval was not inserted, this could be do to an empty range OR that the
+  // IntervalSet is in with_properties mode and the interval overlaps with an
+  // existing interval.
+  bool insert(const Slice& start, const Slice& end) {
+    return insertImpl(Interval<Slice, Comparator>(comp_, start, end));
+  }
+
+  // Insert returns true if the interval was inserted. False indicates that the
+  // interval was not inserted, this could be do to an empty range OR that the
+  // IntervalSet is in with_properties mode and the interval overlaps with an
+  // existing interval.
+  bool insert(const Slice& start) {
+    // Create an interval with infinite end
+    Interval<Slice, Comparator> interval(comp_, start);
+    return insertImpl(interval);
+  }
+
+  bool insert(Interval<Slice, Comparator>&& i) { return insertImpl(i); }
+
+  bool empty() const { return intervals_.empty(); }
+  void clear() { intervals_.clear(); }
+
+  auto begin() { return intervals_.begin(); }
+  auto end() { return intervals_.end(); }
+
+  auto cbegin() const { return intervals_.cbegin(); }
+  auto cend() const { return intervals_.cend(); }
+
+  size_t size() const { return intervals_.size(); }
+
+ private:
+  // Custom comparator for finding intervals in the vector
+  struct IntervalComparator {
+    explicit IntervalComparator(const Comparator* comp) : comp_(comp) {}
+
+    bool operator()(const Interval<Slice, Comparator>& a,
+                    const Interval<Slice, Comparator>& b) const {
+      return comp_->Compare(a.start(), b.start()) < 0;
+    }
+
+    const Comparator* comp_;
+  };
+
+  typename std::vector<Interval<Slice, Comparator>>::iterator findPosition(
+      const Interval<Slice, Comparator>& interval) {
+    // Find the position where the new interval should be inserted
+    for (auto it = intervals_.begin(); it != intervals_.end(); ++it) {
+      if (comp_->Compare(it->start(), interval.start()) >= 0) {
+        return it;
+      }
+    }
+    return intervals_.end();
+  }
+
+  bool insertImpl(const Interval<Slice, Comparator>& i) {
+    // Skip empty intervals
+    if (i.has_end() && comp_->Compare(i.start(), i.end()) >= 0) {
+      return false;
+    }
+
+    // Find the position where this interval would be inserted
+    // This also checks if the interval is completely contained within an
+    // existing one
+    auto it = findPosition(i);
+
+    // Check if we need to merge with previous interval
+    if (it != intervals_.begin()) {
+      auto prev = it - 1;
+      if (prev->has_end() && comp_->Compare(prev->end(), i.start()) < 0) {
+        // No overlap with previous interval
+      } else {
+        // There is overlap, adjust iterator to include previous interval
+        if (prop_) {
+          return false;
+        }
+        it = prev;
+      }
+    }
+
+    Slice new_start = i.start();
+    Slice new_end;
+    bool inf_end = false;
+
+    if (i.has_end()) {
+      new_end = i.end();
+    } else {
+      // For infinite end intervals, we need to merge all intervals that start
+      // after new_start
+      auto erase_start = it;
+      while (it != intervals_.end()) {
+        if (comp_->Compare(it->start(), new_start) < 0) {
+          if (prop_) {
+            return false;
+          }
+          new_start = it->start();
+        }
+        ++it;
+      }
+
+      // Erase all intervals from erase_start to end
+      if (erase_start != intervals_.end()) {
+        if (prop_) {
+          return false;
+        }
+        intervals_.erase(erase_start, intervals_.end());
+      }
+
+      // Insert the new interval with infinite end
+      Interval<Slice, Comparator> new_interval(comp_, new_start);
+      auto pos = findPosition(new_interval);
+      intervals_.insert(pos, new_interval);
+      return true;
+    }
+
+    // For finite end intervals, find all overlapping intervals
+    auto erase_start = it;
+    auto erase_end = it;
+
+    while (it != intervals_.end() &&
+           comp_->Compare(new_end, it->start()) >= 0) {
+      if (it->has_end() && comp_->Compare(it->end(), new_start) < 0) {
+        // No overlap
+        ++it;
+        erase_end = it;
+        continue;
+      }
+
+      if (comp_->Compare(it->start(), new_start) < 0) {
+        new_start = it->start();
+      }
+
+      if (it->has_end()) {
+        if (comp_->Compare(new_end, it->end()) < 0) {
+          new_end = it->end();
+        }
+      } else {
+        // If we encounter an interval with infinite end, our new interval also
+        // becomes infinite
+        inf_end = true;
+        erase_end = intervals_.end();
+        break;
+      }
+
+      ++it;
+      erase_end = it;
+    }
+
+    // Check for any infinite intervals that start after this one
+    while (it != intervals_.end()) {
+      if (!it->has_end()) {
+        inf_end = true;
+        erase_end = intervals_.end();
+        break;
+      }
+      ++it;
+    }
+
+    // Erase all merged intervals
+    if (erase_start != erase_end) {
+      intervals_.erase(erase_start, erase_end);
+    }
+
+    // Insert the new merged interval
+    Interval<Slice, Comparator> new_interval =
+        inf_end ? Interval<Slice, Comparator>(comp_, new_start)
+                : Interval<Slice, Comparator>(comp_, new_start, new_end);
+
+    auto pos = findPosition(new_interval);
+    intervals_.insert(pos, new_interval);
+    return true;
+  }
+
+  const Comparator* comp_;
+  std::vector<Interval<Slice, Comparator>> intervals_;
+  bool prop_;
+};
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 300af520ee9e..d31660de4ae4 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -22,6 +22,7 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/listener.h"
 #include "rocksdb/metadata.h"
+#include "rocksdb/multi_scan.h"
 #include "rocksdb/options.h"
 #include "rocksdb/snapshot.h"
 #include "rocksdb/sst_file_writer.h"
@@ -30,15 +31,10 @@
 #include "rocksdb/types.h"
 #include "rocksdb/user_write_callback.h"
 #include "rocksdb/utilities/table_properties_collectors.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
 #include "rocksdb/version.h"
 #include "rocksdb/wide_columns.h"
 
-#if defined(__GNUC__) || defined(__clang__)
-#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
-#elif _WIN32
-#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
-#endif
-
 namespace ROCKSDB_NAMESPACE {
 
 struct ColumnFamilyOptions;
@@ -47,6 +43,7 @@ struct CompactRangeOptions;
 struct DBOptions;
 struct ExternalSstFileInfo;
 struct FlushOptions;
+struct FlushWALOptions;
 struct Options;
 struct ReadOptions;
 struct TableProperties;
@@ -55,6 +52,7 @@ struct WaitForCompactOptions;
 class Env;
 class EventListener;
 class FileSystem;
+class MultiScan;
 class Replayer;
 class StatsHistoryIterator;
 class TraceReader;
@@ -93,45 +91,8 @@ class ColumnFamilyHandle {
   virtual const Comparator* GetComparator() const = 0;
 };
 
-static const int kMajorVersion = __ROCKSDB_MAJOR__;
-static const int kMinorVersion = __ROCKSDB_MINOR__;
-
-// A range of keys
-struct Range {
-  // In case of user_defined timestamp, if enabled, `start` and `limit` should
-  // point to key without timestamp part.
-  Slice start;
-  Slice limit;
-
-  Range() {}
-  Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
-};
-
-struct RangePtr {
-  // In case of user_defined timestamp, if enabled, `start` and `limit` should
-  // point to key without timestamp part.
-  const Slice* start;
-  const Slice* limit;
-
-  RangePtr() : start(nullptr), limit(nullptr) {}
-  RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
-};
-
-// It is valid that files_checksums and files_checksum_func_names are both
-// empty (no checksum information is provided for ingestion). Otherwise,
-// their sizes should be the same as external_files. The file order should
-// be the same in three vectors and guaranteed by the caller.
-// Note that, we assume the temperatures of this batch of files to be
-// ingested are the same.
-struct IngestExternalFileArg {
-  ColumnFamilyHandle* column_family = nullptr;
-  std::vector<std::string> external_files;
-  IngestExternalFileOptions options;
-  std::vector<std::string> files_checksums;
-  std::vector<std::string> files_checksum_func_names;
-  // A hint as to the temperature for *reading* the files to be ingested.
-  Temperature file_temperature = Temperature::kUnknown;
-};
+static const int kMajorVersion = ROCKSDB_MAJOR;
+static const int kMinorVersion = ROCKSDB_MINOR;
 
 struct GetMergeOperandsOptions {
   using ContinueCallback = std::function<bool(Slice)>;
@@ -170,24 +131,13 @@ using TablePropertiesCollection =
 class DB {
  public:
   // Open the database with the specified "name" for reads and writes.
-  // Stores a pointer to a heap-allocated database in *dbptr and returns
-  // OK on success.
-  // Stores nullptr in *dbptr and returns a non-OK status on error, including
+  // On success, stores the database in *dbptr and returns OK.
+  // On error, resets *dbptr and returns a non-OK status, including
   // if the DB is already open (read-write) by another DB object. (This
   // guarantee depends on options.env->LockFile(), which might not provide
   // this guarantee in a custom Env implementation.)
-  //
-  // Caller must delete *dbptr when it is no longer needed.
   static Status Open(const Options& options, const std::string& name,
                      std::unique_ptr<DB>* dbptr);
-  // DEPRECATED: raw pointer variant
-  static Status Open(const Options& options, const std::string& name,
-                     DB** dbptr) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = Open(options, name, &smart_ptr);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // Open DB with column families.
   // db_options specify database specific options
@@ -201,21 +151,12 @@ class DB {
   // If everything is OK, handles will on return be the same size
   // as column_families --- handles[i] will be a handle that you
   // will use to operate on column family column_family[i].
-  // Before delete DB, you have to close All column families by calling
+  // Before destroying the DB, you have to close all column families by calling
   // DestroyColumnFamilyHandle() with all the handles.
   static Status Open(const DBOptions& db_options, const std::string& name,
                      const std::vector<ColumnFamilyDescriptor>& column_families,
                      std::vector<ColumnFamilyHandle*>* handles,
                      std::unique_ptr<DB>* dbptr);
-  // DEPRECATED: raw pointer variant
-  static Status Open(const DBOptions& db_options, const std::string& name,
-                     const std::vector<ColumnFamilyDescriptor>& column_families,
-                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = Open(db_options, name, column_families, handles, &smart_ptr);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // OpenForReadOnly() creates a Read-only instance that supports reads alone.
   //
@@ -234,16 +175,6 @@ class DB {
   static Status OpenForReadOnly(const Options& options, const std::string& name,
                                 std::unique_ptr<DB>* dbptr,
                                 bool error_if_wal_file_exists = false);
-  // DEPRECATED: raw pointer variant
-  static Status OpenForReadOnly(const Options& options, const std::string& name,
-                                DB** dbptr,
-                                bool error_if_wal_file_exists = false) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s =
-        OpenForReadOnly(options, name, &smart_ptr, error_if_wal_file_exists);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // Open the database for read only with column families.
   //
@@ -257,18 +188,6 @@ class DB {
       const std::vector<ColumnFamilyDescriptor>& column_families,
       std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr,
       bool error_if_wal_file_exists = false);
-  // DEPRECATED: raw pointer variant
-  static Status OpenForReadOnly(
-      const DBOptions& db_options, const std::string& name,
-      const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
-      bool error_if_wal_file_exists = false) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = OpenForReadOnly(db_options, name, column_families, handles,
-                               &smart_ptr, error_if_wal_file_exists);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // OpenAsSecondary() creates a secondary instance that supports read-only
   // operations and supports dynamic catch up with the primary (through a
@@ -290,8 +209,6 @@ class DB {
   // The secondary_path argument points to a directory where the secondary
   // instance stores its info log.
   // The dbptr is an out-arg corresponding to the opened secondary instance.
-  // The pointer points to a heap-allocated database, and the caller should
-  // delete it after use.
   //
   // Return OK on success, non-OK on failures.
   //
@@ -304,14 +221,6 @@ class DB {
   static Status OpenAsSecondary(const Options& options, const std::string& name,
                                 const std::string& secondary_path,
                                 std::unique_ptr<DB>* dbptr);
-  // DEPRECATED: raw pointer variant
-  static Status OpenAsSecondary(const Options& options, const std::string& name,
-                                const std::string& secondary_path, DB** dbptr) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = OpenAsSecondary(options, name, secondary_path, &smart_ptr);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // Open DB as secondary instance with specified column families
   //
@@ -340,9 +249,8 @@ class DB {
   // The handles is an out-arg corresponding to the opened database column
   // family handles.
   // The dbptr is an out-arg corresponding to the opened secondary instance.
-  // The pointer points to a heap-allocated database, and the caller should
-  // delete it after use. Before deleting the dbptr, the user should also
-  // delete the pointers stored in handles vector.
+  // Before destroying the DB, the user should call
+  // DestroyColumnFamilyHandle() on all the handles.
   //
   // Return OK on success, non-OK on failures.
   static Status OpenAsSecondary(
@@ -350,18 +258,6 @@ class DB {
       const std::string& secondary_path,
       const std::vector<ColumnFamilyDescriptor>& column_families,
       std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr);
-  // DEPRECATED: raw pointer variant
-  static Status OpenAsSecondary(
-      const DBOptions& db_options, const std::string& name,
-      const std::string& secondary_path,
-      const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = OpenAsSecondary(db_options, name, secondary_path,
-                               column_families, handles, &smart_ptr);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // EXPERIMENTAL
 
@@ -386,16 +282,30 @@ class DB {
       std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr);
   // End EXPERIMENTAL
 
-  // Open DB and run the compaction.
-  // It's a read-only operation, the result won't be installed to the DB, it
-  // will be output to the `output_directory`. The API should only be used with
-  // `options.CompactionService` to run compaction triggered by
-  // `CompactionService`.
   static Status OpenAndCompact(
       const std::string& name, const std::string& output_directory,
       const std::string& input, std::string* output,
       const CompactionServiceOptionsOverride& override_options);
 
+  // Opens a database and runs compaction without modifying the original DB.
+  //
+  // This read-only operation outputs compaction results to `output_directory`
+  // instead of installing them back to the source database. Designed primarily
+  // for use with `CompactionService` to process remote compaction jobs.
+  //
+  // Parameters:
+  // - `options`: Additional controls
+  //   * When `allow_resumption = false`: The `output_directory` MUST be empty
+  //     before calling this function. Any existing files (including resume
+  //     state or output files from previous runs) in the directory may
+  //     cause correctness errors as the compaction will start from scratch.
+  // - `name`: Source database path
+  // - `output_directory`: Where compaction output files are written
+  // - `input`: Serialized compaction input information
+  // - `output`: Serialized compaction result
+  // - `override_options`: Configuration overrides for the operation
+  //
+  // Returns: Status of the compaction operation
   static Status OpenAndCompact(
       const OpenAndCompactOptions& options, const std::string& name,
       const std::string& output_directory, const std::string& input,
@@ -414,18 +324,6 @@ class DB {
       const std::vector<ColumnFamilyDescriptor>& column_families,
       std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr,
       std::string trim_ts);
-  // DEPRECATED: raw pointer variant
-  static Status OpenAndTrimHistory(
-      const DBOptions& db_options, const std::string& dbname,
-      const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
-      std::string trim_ts) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = OpenAndTrimHistory(db_options, dbname, column_families, handles,
-                                  &smart_ptr, trim_ts);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // Manually, synchronously attempt to resume DB writes after a write failure
   // to the underlying filesystem. See
@@ -653,7 +551,7 @@ class DB {
                        const Slice& /*key*/, const Slice& /*ts*/,
                        const Slice& /*value*/);
 
-  // Apply the specified updates to the database.
+  // Apply the specified updates atomically to the database.
   // If `updates` contains no update, WAL will still be synced if
   // options.sync=true.
   // Returns OK on success, non-OK on failure.
@@ -669,6 +567,21 @@ class DB {
         "WriteWithCallback not implemented for this interface.");
   }
 
+  // EXPERIMENTAL, subject to change
+  // Ingest a WriteBatchWithIndex into DB, bypassing memtable writes for better
+  // write performance. Useful when there is a large number of updates
+  // in the write batch.
+  // The WriteBatchWithIndex must be created with overwrite_key=true.
+  // Currently this requires WriteOptions::disableWAL=true.
+  // The following options are currently not supported:
+  // - unordered_write
+  // - enable_pipelined_write
+  virtual Status IngestWriteBatchWithIndex(
+      const WriteOptions& /*options*/,
+      std::shared_ptr<WriteBatchWithIndex> /*wbwi*/) {
+    return Status::NotSupported("IngestWriteBatchWithIndex not implemented.");
+  }
+
   // If the column family specified by "column_family" contains an entry for
   // "key", return the corresponding value in "*value". If the entry is a plain
   // key-value, return the value as-is; if it is a wide-column entity, return
@@ -1073,7 +986,7 @@ class DB {
   // call one of the Seek methods on the iterator before using it).
   //
   // Caller should delete the iterator when it is no longer needed.
-  // The returned iterator should be deleted before this db is deleted.
+  // The returned iterator should be deleted before this db is destroyed.
   virtual Iterator* NewIterator(const ReadOptions& options,
                                 ColumnFamilyHandle* column_family) = 0;
   virtual Iterator* NewIterator(const ReadOptions& options) {
@@ -1081,7 +994,7 @@ class DB {
   }
   // Returns iterators from a consistent database state across multiple
   // column families. Iterators are heap allocated and need to be deleted
-  // before the db is deleted
+  // before the db is destroyed
   virtual Status NewIterators(
       const ReadOptions& options,
       const std::vector<ColumnFamilyHandle*>& column_families,
@@ -1110,6 +1023,44 @@ class DB {
       const ReadOptions& options,
       const std::vector<ColumnFamilyHandle*>& column_families) = 0;
 
+  // Get an iterator that scans multiple key ranges. The scan ranges should
+  // be in increasing order of start key. See multi_scan_iterator.h for more
+  // details. For optimal performance, ensure that either all entries in
+  // scan_opts specify the range limit, or none of them do.
+  //
+  // NOTE: NOT YET SUPPORTED in DBs using user timestamp (see
+  // Comparator::timestamp_size())
+  //
+  // NOTE: iterate_upper_bound in ReadOptions will
+  // be ignored. Instead, the range.limit in ScanOptions is consulted to
+  // determine the upper bound key, if specified.
+  //
+  // Example usage -
+  //  std::vector<ScanOptions> scans{{.start = Slice("bar")},
+  //                              {.start = Slice("foo")}};
+  //  std::unique_ptr<MultiScan> iter.reset(
+  //                                      db->NewMultiScan());
+  //  try {
+  //    for (auto scan : *iter) {
+  //      for (auto it : scan) {
+  //        // Do something with key - it.first
+  //        // Do something with value - it.second
+  //      }
+  //    }
+  //  } catch (MultiScanException& ex) {
+  //    // Check ex.status()
+  //  } catch (std::logic_error& ex) {
+  //    // Check ex.what()
+  //  }
+  virtual std::unique_ptr<MultiScan> NewMultiScan(
+      const ReadOptions& /*options*/, ColumnFamilyHandle* column_family,
+      const MultiScanArgs& /*scan_opts*/) {
+    std::unique_ptr<Iterator> iter(NewErrorIterator(Status::NotSupported()));
+    std::unique_ptr<MultiScan> ms_iter = std::make_unique<MultiScan>(
+        column_family->GetComparator(), std::move(iter));
+    return ms_iter;
+  }
+
   // Return a handle to the current DB state.  Iterators created with
   // this handle will all observe a stable snapshot of the current DB
   // state.  The caller must call ReleaseSnapshot(result) when the
@@ -1225,6 +1176,10 @@ class DB {
     //  sorted runs being processed by currently running compactions.
     static const std::string kNumRunningCompactionSortedRuns;
 
+    //  "rocksdb.compaction-abort-count" - returns the current value of the
+    //      compaction abort counter.
+    static const std::string kCompactionAbortCount;
+
     //  "rocksdb.background-errors" - returns accumulated number of background
     //      errors.
     static const std::string kBackgroundErrors;
@@ -1600,15 +1555,39 @@ class DB {
   //  s = db->SetOptions(cfh, {{"block_based_table_factory",
   //                            "{prepopulate_block_cache=kDisable;}"}});
   virtual Status SetOptions(
-      ColumnFamilyHandle* /*column_family*/,
-      const std::unordered_map<std::string, std::string>& /*opts_map*/) {
-    return Status::NotSupported("Not implemented");
+      ColumnFamilyHandle* column_family,
+      const std::unordered_map<std::string, std::string>& opts_map) {
+    return SetOptions(std::vector<ColumnFamilyHandle*>{column_family},
+                      opts_map);
   }
   // Shortcut for SetOptions on the default column family handle.
   virtual Status SetOptions(
       const std::unordered_map<std::string, std::string>& new_options) {
     return SetOptions(DefaultColumnFamily(), new_options);
   }
+  // Shortcut where you want to apply the same options to multiple column
+  // families. Beneficial for avoiding reserialization of OPTIONS file.
+  virtual Status SetOptions(
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      const std::unordered_map<std::string, std::string>& opts_map) {
+    std::unordered_map<ColumnFamilyHandle*,
+                       std::unordered_map<std::string, std::string>>
+        column_families_opts_map;
+    column_families_opts_map.reserve(column_families.size());
+    for (auto* cf : column_families) {
+      column_families_opts_map[cf] = opts_map;
+    }
+    return SetOptions(column_families_opts_map);
+  }
+  // SetOptions with potentially different options per column family. It is
+  // typically better to batch all option changes together as the OPTIONS file
+  // is written to once per SetOptions call.
+  virtual Status SetOptions(
+      const std::unordered_map<ColumnFamilyHandle*,
+                               std::unordered_map<std::string, std::string>>&
+      /*column_families_opts_map*/) {
+    return Status::NotSupported("Not implemented");
+  }
 
   // Like SetOptions but for DBOptions, including the same caveats for
   // usability, reliability, and performance. See GetDBOptionsFromMap() (and
@@ -1679,6 +1658,46 @@ class DB {
   // DisableManualCompaction() has been called.
   virtual void EnableManualCompaction() = 0;
 
+  // Abort all compaction work/jobs. This function will signal all
+  // running compactions (both automatic and manual, background and foreground)
+  // to abort and will wait for them to finish or abort before returning. After
+  // this function returns, new compaction work will be aborted immediately
+  // until ResumeAllCompactions() is called.
+  //
+  // The compaction abort is checked periodically (every 1000 keys processed),
+  // so ongoing compactions should abort as well within a reasonable time.
+  // This function blocks until all compactions have completed or aborted.
+  //
+  // Any output files from aborted compactions are automatically cleaned up,
+  // ensuring no partial compaction results are installed, except for resumable
+  // compaction.
+  //
+  // This function supports concurrent abort requests from multiple callers
+  // without coordination between them. The call count is tracked, and
+  // compactions only resume after the number of ResumeAllCompactions() calls
+  // matches number of AbortAllCompactions() calls.
+  //
+  // Differences with other compaction control APIs:
+  // - DisableManualCompaction(): Only pauses manual compactions, waits for
+  //   them to finish naturally. AbortAllCompactions() actively cancels both
+  //   automatic and manual compactions.
+  // - PauseBackgroundWork(): Pauses all background work (flush + compaction),
+  //   waits for work to finish naturally. AbortAllCompactions() only affects
+  //   compactions and actively cancels them.
+  //
+  // Note: Compaction service (remote compaction) is not currently supported.
+  // Aborted compactions return Status::Incomplete with subcode
+  // kCompactionAborted.
+  virtual void AbortAllCompactions() = 0;
+
+  // Resume all compactions that were aborted by AbortAllCompactions().
+  // This function must be called as many times as AbortAllCompactions()
+  // has been called in order to resume compactions. This reference-counting
+  // behavior ensures that if multiple callers independently request an
+  // abort, compactions will not resume until all of them have called
+  // ResumeAllCompactions().
+  virtual void ResumeAllCompactions() = 0;
+
   // Wait for all flush and compactions jobs to finish. Jobs to wait include the
   // unscheduled (queued, but not scheduled yet). If the db is shutting down,
   // Status::ShutdownInProgress will be returned.
@@ -1695,13 +1714,6 @@ class DB {
   virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
   virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
 
-  // Maximum level to which a new compacted memtable is pushed if it
-  // does not create overlap.
-  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
-  virtual int MaxMemCompactionLevel() {
-    return MaxMemCompactionLevel(DefaultColumnFamily());
-  }
-
   // Number of files in level-0 that would stop writes.
   virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
   virtual int Level0StopWriteTrigger() {
@@ -1758,6 +1770,10 @@ class DB {
     return Status::NotSupported("FlushWAL not implemented");
   }
 
+  virtual Status FlushWAL(const FlushWALOptions& /*options*/) {
+    return Status::NotSupported("FlushWAL not implemented");
+  }
+
   // Ensure all WAL writes have been synced to storage, so that (assuming OS
   // and hardware support) data will survive power loss. This function does
   // not imply FlushWAL, so `FlushWAL(true)` is recommended if using
@@ -1803,6 +1819,25 @@ class DB {
   virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
                                      std::string* ts_low) = 0;
 
+  // EXPERIMENTAL
+  // Get the newest timestamp of the column family. This is only for when the
+  // column family enables user defined timestamp and when timestamps are not
+  // persisted in SST files, a.k.a `persist_user_defined_timestamps=false`.
+  // This checks the mutable memtable, the immutable memtable and the SST files,
+  // and returns the first newest user defined timestamp found.
+  // When user defined timestamp is not persisted in SST files, metadata in
+  // MANIFEST tracks the most recently seen timestamp for SST files, so the
+  // newest timestamp in SST files can be found.
+  // OK status is returned if finding the newest timestamp succeeds, if
+  // `newest_timestamp` is empty, it means the column family hasn't seen any
+  // timestamp. The returned timestamp is encoded, util method `DecodeU64Ts` can
+  // be used to decode it into uint64_t.
+  // User-defined timestamp is required to be increasing per key, the return
+  // value of this API would be most useful if the user-defined timestamp is
+  // monotonically increasing across keys.
+  virtual Status GetNewestUserDefinedTimestamp(
+      ColumnFamilyHandle* column_family, std::string* newest_timestamp) = 0;
+
   // Suspend deleting obsolete files. Compactions will continue to occur,
   // but no obsolete files will be deleted. To resume file deletions, each
   // call to DisableFileDeletions() must be matched by a subsequent call to
@@ -1878,11 +1913,24 @@ class DB {
   virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
                                        ColumnFamilyMetaData* /*metadata*/) {}
 
+  // Obtains the LSM-tree meta data of the specified column family of the DB
+  // with optional filtering by key range and level.
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* /*column_family*/,
+      const GetColumnFamilyMetaDataOptions& /*options*/,
+      ColumnFamilyMetaData* /*metadata*/) {}
+
   // Get the metadata of the default column family.
   void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) {
     GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
   }
 
+  // Get the metadata of the default column family with optional filtering.
+  void GetColumnFamilyMetaData(const GetColumnFamilyMetaDataOptions& options,
+                               ColumnFamilyMetaData* metadata) {
+    GetColumnFamilyMetaData(DefaultColumnFamily(), options, metadata);
+  }
+
   // Obtains the LSM-tree meta data of all column families of the DB, including
   // metadata for each live table (SST) file and each blob file in the DB.
   virtual void GetAllColumnFamilyMetaData(
@@ -1914,12 +1962,12 @@ class DB {
   // Retrieve information about the current wal file
   //
   // Note that the log might have rolled after this call in which case
-  // the current_log_file would not point to the current log file.
+  // the current_wal_file would not point to the current log file.
   //
-  // Additionally, for the sake of optimization current_log_file->StartSequence
+  // Additionally, for the sake of optimization current_wal_file->StartSequence
   // would always be set to 0
   virtual Status GetCurrentWalFile(
-      std::unique_ptr<WalFile>* current_log_file) = 0;
+      std::unique_ptr<WalFile>* current_wal_file) = 0;
 
   // IngestExternalFile() will load a list of external SST files (1) into the DB
   // Two primary modes are supported:
@@ -1928,7 +1976,9 @@ class DB {
   // In the first mode we will try to find the lowest possible level that
   // the file can fit in, and ingest the file into this level (2). A file that
   // have a key range that overlap with the memtable key range will require us
-  // to Flush the memtable first before ingesting the file.
+  // to Flush the memtable first before ingesting the file. If ingested files
+  // have any overlap with each other, level and sequence number assignment
+  // ensure later files overwrite earlier files.
   // In the second mode we will always ingest in the bottom most level (see
   // docs to IngestExternalFileOptions::ingest_behind).
   // For a column family that enables user-defined timestamps, ingesting
@@ -1946,7 +1996,7 @@ class DB {
   //     even if the file compression doesn't match the level compression
   // (3) If IngestExternalFileOptions->ingest_behind is set to true,
   //     we always ingest at the bottommost level, which should be reserved
-  //     for this purpose (see DBOPtions::allow_ingest_behind flag).
+  //     for this purpose (see ColumnFamilyOptions::cf_allow_ingest_behind).
   // (4) If IngestExternalFileOptions->fail_if_not_bottommost_level is set to
   //     true, then this method can return Status:TryAgain() indicating that
   //     the files cannot be ingested to the bottommost level, and it is the
@@ -2081,14 +2131,11 @@ class DB {
       ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
       TablePropertiesCollection* props) = 0;
 
-  // Get the table properties of files per level.
-  virtual Status GetPropertiesOfTablesForLevels(
-      ColumnFamilyHandle* /* column_family */,
-      std::vector<
-          std::unique_ptr<TablePropertiesCollection>>* /* levels_props */) {
-    return Status::NotSupported(
-        "GetPropertiesOfTablesForLevels() is not implemented.");
-  }
+  // Get the table properties of files by level.
+  virtual Status GetPropertiesOfTablesByLevel(
+      ColumnFamilyHandle* column_family,
+      std::vector<std::unique_ptr<TablePropertiesCollection>>*
+          props_by_level) = 0;
 
   virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/,
                                      const Slice* /*begin*/,
@@ -2210,12 +2257,9 @@ inline Status DB::GetApproximateSizes(ColumnFamilyHandle* column_family,
                                       uint64_t* sizes,
                                       SizeApproximationFlags include_flags) {
   SizeApproximationOptions options;
-  options.include_memtables =
-      ((include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) !=
-       SizeApproximationFlags::NONE);
-  options.include_files =
-      ((include_flags & SizeApproximationFlags::INCLUDE_FILES) !=
-       SizeApproximationFlags::NONE);
+  using enum SizeApproximationFlags;  // Require C++20 support
+  options.include_memtables = ((include_flags & INCLUDE_MEMTABLES) != NONE);
+  options.include_files = ((include_flags & INCLUDE_FILES) != NONE);
   return GetApproximateSizes(options, column_family, ranges, n, sizes);
 }
 
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 0d5f24b52683..6dbfa7537bac 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -39,7 +39,7 @@
 #undef LoadLibrary
 #endif
 
-#if defined(__GNUC__) || defined(__clang__)
+#if defined(__GNUC__) || defined(__clang__)  // ODR-SAFE (essentially)
 #define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \
   __attribute__((__format__(__printf__, format_param, dots_param)))
 #else
@@ -455,10 +455,146 @@ class Env : public Customizable {
     kVerifyFileChecksums = 7,
     kGetEntity = 8,
     kMultiGetEntity = 9,
-    kReadManifest = 10,
-    kUnknown,  // Keep last for easy array of non-unknowns
+    kGetFileChecksumsFromCurrentManifest = 10,
+    // Enums after this, up to 0x7F, are reserved for future use for the public
+    // RocksDB API (i.e. they should be "non-custom" IO activities). Make sure
+    // to also update IOActivityToString when adding new values.
+
+    kCustomIOActivity80 = 0x80,
+    kFirstCustomIOActivity = kCustomIOActivity80,
+    kCustomIOActivity81 = 0x81,
+    kCustomIOActivity82 = 0x82,
+    kCustomIOActivity83 = 0x83,
+    kCustomIOActivity84 = 0x84,
+    kCustomIOActivity85 = 0x85,
+    kCustomIOActivity86 = 0x86,
+    kCustomIOActivity87 = 0x87,
+    kCustomIOActivity88 = 0x88,
+    kCustomIOActivity89 = 0x89,
+    kCustomIOActivity8A = 0x8A,
+    kCustomIOActivity8B = 0x8B,
+    kCustomIOActivity8C = 0x8C,
+    kCustomIOActivity8D = 0x8D,
+    kCustomIOActivity8E = 0x8E,
+    kCustomIOActivity8F = 0x8F,
+    kCustomIOActivity90 = 0x90,
+    kCustomIOActivity91 = 0x91,
+    kCustomIOActivity92 = 0x92,
+    kCustomIOActivity93 = 0x93,
+    kCustomIOActivity94 = 0x94,
+    kCustomIOActivity95 = 0x95,
+    kCustomIOActivity96 = 0x96,
+    kCustomIOActivity97 = 0x97,
+    kCustomIOActivity98 = 0x98,
+    kCustomIOActivity99 = 0x99,
+    kCustomIOActivity9A = 0x9A,
+    kCustomIOActivity9B = 0x9B,
+    kCustomIOActivity9C = 0x9C,
+    kCustomIOActivity9D = 0x9D,
+    kCustomIOActivity9E = 0x9E,
+    kCustomIOActivity9F = 0x9F,
+    kCustomIOActivityA0 = 0xA0,
+    kCustomIOActivityA1 = 0xA1,
+    kCustomIOActivityA2 = 0xA2,
+    kCustomIOActivityA3 = 0xA3,
+    kCustomIOActivityA4 = 0xA4,
+    kCustomIOActivityA5 = 0xA5,
+    kCustomIOActivityA6 = 0xA6,
+    kCustomIOActivityA7 = 0xA7,
+    kCustomIOActivityA8 = 0xA8,
+    kCustomIOActivityA9 = 0xA9,
+    kCustomIOActivityAA = 0xAA,
+    kCustomIOActivityAB = 0xAB,
+    kCustomIOActivityAC = 0xAC,
+    kCustomIOActivityAD = 0xAD,
+    kCustomIOActivityAE = 0xAE,
+    kCustomIOActivityAF = 0xAF,
+    kCustomIOActivityB0 = 0xB0,
+    kCustomIOActivityB1 = 0xB1,
+    kCustomIOActivityB2 = 0xB2,
+    kCustomIOActivityB3 = 0xB3,
+    kCustomIOActivityB4 = 0xB4,
+    kCustomIOActivityB5 = 0xB5,
+    kCustomIOActivityB6 = 0xB6,
+    kCustomIOActivityB7 = 0xB7,
+    kCustomIOActivityB8 = 0xB8,
+    kCustomIOActivityB9 = 0xB9,
+    kCustomIOActivityBA = 0xBA,
+    kCustomIOActivityBB = 0xBB,
+    kCustomIOActivityBC = 0xBC,
+    kCustomIOActivityBD = 0xBD,
+    kCustomIOActivityBE = 0xBE,
+    kCustomIOActivityBF = 0xBF,
+    kCustomIOActivityC0 = 0xC0,
+    kCustomIOActivityC1 = 0xC1,
+    kCustomIOActivityC2 = 0xC2,
+    kCustomIOActivityC3 = 0xC3,
+    kCustomIOActivityC4 = 0xC4,
+    kCustomIOActivityC5 = 0xC5,
+    kCustomIOActivityC6 = 0xC6,
+    kCustomIOActivityC7 = 0xC7,
+    kCustomIOActivityC8 = 0xC8,
+    kCustomIOActivityC9 = 0xC9,
+    kCustomIOActivityCA = 0xCA,
+    kCustomIOActivityCB = 0xCB,
+    kCustomIOActivityCC = 0xCC,
+    kCustomIOActivityCD = 0xCD,
+    kCustomIOActivityCE = 0xCE,
+    kCustomIOActivityCF = 0xCF,
+    kCustomIOActivityD0 = 0xD0,
+    kCustomIOActivityD1 = 0xD1,
+    kCustomIOActivityD2 = 0xD2,
+    kCustomIOActivityD3 = 0xD3,
+    kCustomIOActivityD4 = 0xD4,
+    kCustomIOActivityD5 = 0xD5,
+    kCustomIOActivityD6 = 0xD6,
+    kCustomIOActivityD7 = 0xD7,
+    kCustomIOActivityD8 = 0xD8,
+    kCustomIOActivityD9 = 0xD9,
+    kCustomIOActivityDA = 0xDA,
+    kCustomIOActivityDB = 0xDB,
+    kCustomIOActivityDC = 0xDC,
+    kCustomIOActivityDD = 0xDD,
+    kCustomIOActivityDE = 0xDE,
+    kCustomIOActivityDF = 0xDF,
+    kCustomIOActivityE0 = 0xE0,
+    kCustomIOActivityE1 = 0xE1,
+    kCustomIOActivityE2 = 0xE2,
+    kCustomIOActivityE3 = 0xE3,
+    kCustomIOActivityE4 = 0xE4,
+    kCustomIOActivityE5 = 0xE5,
+    kCustomIOActivityE6 = 0xE6,
+    kCustomIOActivityE7 = 0xE7,
+    kCustomIOActivityE8 = 0xE8,
+    kCustomIOActivityE9 = 0xE9,
+    kCustomIOActivityEA = 0xEA,
+    kCustomIOActivityEB = 0xEB,
+    kCustomIOActivityEC = 0xEC,
+    kCustomIOActivityED = 0xED,
+    kCustomIOActivityEE = 0xEE,
+    kCustomIOActivityEF = 0xEF,
+    kCustomIOActivityF0 = 0xF0,
+    kCustomIOActivityF1 = 0xF1,
+    kCustomIOActivityF2 = 0xF2,
+    kCustomIOActivityF3 = 0xF3,
+    kCustomIOActivityF4 = 0xF4,
+    kCustomIOActivityF5 = 0xF5,
+    kCustomIOActivityF6 = 0xF6,
+    kCustomIOActivityF7 = 0xF7,
+    kCustomIOActivityF8 = 0xF8,
+    kCustomIOActivityF9 = 0xF9,
+    kCustomIOActivityFA = 0xFA,
+    kCustomIOActivityFB = 0xFB,
+    kCustomIOActivityFC = 0xFC,
+    kCustomIOActivityFD = 0xFD,
+    kCustomIOActivityFE = 0xFE,
+    kLastCustomIOActivity = kCustomIOActivityFE,
+
+    kUnknown = 0xFF,  // Keep last as unknown
   };
 
+  static std::string IOActivityToString(IOActivity activity);
+
   // Arrange to run "(*function)(arg)" once in a background thread, in
   // the thread pool specified by pri. By default, jobs go to the 'LOW'
   // priority thread pool.
@@ -864,6 +1000,13 @@ class RandomAccessFile {
         "RandomAccessFile::InvalidateCache not supported.");
   }
 
+  // The default implementation returns "not supported" so that user
+  // implementations of FSRandomAccessFile do not need to immediately implement
+  // this function.
+  virtual Status GetFileSize(uint64_t* /*result*/) {
+    return Status::NotSupported("RandomAccessFile::GetFileSize not supported.");
+  }
+
   // If you're adding methods here, remember to add them to
   // RandomAccessFileWrapper too.
 };
@@ -1748,6 +1891,9 @@ class RandomAccessFileWrapper : public RandomAccessFile {
   Status InvalidateCache(size_t offset, size_t length) override {
     return target_->InvalidateCache(offset, length);
   }
+  Status GetFileSize(uint64_t* file_size) override {
+    return target_->GetFileSize(file_size);
+  }
 
  private:
   RandomAccessFile* target_;
diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h
index 6b4a13e039b6..118e8a052231 100644
--- a/include/rocksdb/env_encryption.h
+++ b/include/rocksdb/env_encryption.h
@@ -240,6 +240,15 @@ class EncryptedRandomAccessFile : public FSRandomAccessFile {
   size_t GetRequiredBufferAlignment() const override;
 
   IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  // Intentionally leave GetFileSize not overridden here, so that it inherits
+  // the default implementation from its parent class, which is Not Supported.
+  //
+  // As GetFileSize API is not required to be implemented yet, we use encrypted
+  // file system in unit test to validate the rest of the system could continue
+  // working with the Not Supported behavior.
+  //
+  // IOStatus GetFileSize(uint64_t* /*result*/) override;
 };
 
 class EncryptedWritableFile : public FSWritableFile {
diff --git a/include/rocksdb/experimental.h b/include/rocksdb/experimental.h
index 349d05f9b403..42b40cfa4754 100644
--- a/include/rocksdb/experimental.h
+++ b/include/rocksdb/experimental.h
@@ -21,6 +21,11 @@ Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
                            const Slice* begin, const Slice* end);
 Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end);
 
+// DEPRECATED: this API may be removed in a future release.
+// This operation can be done through CompactRange() by setting
+// CompactRangeOptions::bottommost_level_compaction set to
+// BottommostLevelCompaction::kSkip and setting target level.
+//
 // Move all L0 files to target_level skipping compaction.
 // This operation succeeds only if the files in L0 have disjoint ranges; this
 // is guaranteed to happen, for instance, if keys are inserted in sorted
@@ -81,7 +86,7 @@ Status UpdateManifestForFilesState(
 // keys in a category to return an empty sequence of segments.
 //
 // To eliminate a confusing distinction between a segment that is empty vs.
-// "not present" for a particular key, each key is logically assiciated with
+// "not present" for a particular key, each key is logically associated with
 // an infinite sequence of segments, including some infinite tail of 0-length
 // segments. In practice, we only represent a finite sequence that (at least)
 // covers the non-trivial segments.
@@ -215,7 +220,7 @@ Status UpdateManifestForFilesState(
 // whole key.
 // * Range query - Whether there {definitely isn't, might be} any entries
 // within a lower and upper key bound, in an SST file (or partition, etc.).
-//    NOTE: For this disucssion, we ignore the detail of inclusive vs.
+//    NOTE: For this discussion, we ignore the detail of inclusive vs.
 //    exclusive bounds by assuming a generalized notion of "bound" (vs. key)
 //    that conveniently represents spaces between keys. For details, see
 //    https://github.com/facebook/rocksdb/pull/11434
@@ -295,7 +300,7 @@ Status UpdateManifestForFilesState(
 //     * Keys x and z are in categories in category set s, and
 //     * Key y is ordered x < y < z according to the CF comparator,
 // then both
-//     * The common segment prefix property is satisifed through ordinal i-1
+//     * The common segment prefix property is satisfied through ordinal i-1
 //     and with category set s
 //     * x_i..j <= y_i..j <= z_i..j according to segment comparator c, where
 //     x_i..j is the concatenation of segments i through j of key x (etc.).
diff --git a/include/rocksdb/external_table.h b/include/rocksdb/external_table.h
new file mode 100644
index 000000000000..844ba9d96b85
--- /dev/null
+++ b/include/rocksdb/external_table.h
@@ -0,0 +1,275 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/advanced_iterator.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/iterator_base.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ExternalTableFactory;
+
+// EXPERIMENTAL
+// The interface defined in this file is subject to change at any time without
+// warning!!
+
+// This file defines an interface for plugging in an external table
+// into RocksDB. The external table reader will be used instead of the
+// BlockBasedTable to load and query sst files.
+// The external table files can be created using an SstFileWriter. Eventually
+// external tables will be allowed to be ingested into a RocksDB instance
+// using the IngestExternalFIle() API.
+//
+// Initial support is for writing and querying the files using an
+// SstFileWriter and SstFileReader. We will add support for ingestion of an
+// external table into a limited RocksDB instance that only supports ingestion
+// and not live writes in the near future. It'll be followed by support for
+// replacing the column family by ingesting a new set of files. In all cases,
+// the external table files will only be allowed in the bottommost level.
+//
+// The external table can support one or both of the following layouts -
+// 1. Total order seek - All the keys in the files are in sorted order, and a
+//    user can seek to the first, last, or any key in between and iterate
+//    forwards or backwards till the end of the range. To support this mode,
+//    the implementation needs to use the comparator passed in
+//    ExternalTableOptions to enforce the key ordering. The prefix_extractor
+//    in ExternalTableOptions and the ExternalTableReader interfaces can be
+//    ignored.
+// 2. Prefix seek - In this mode, the prefix_extractor is used to extract the
+//    prefix from a key. All the keys sharing the same prefix are ordered in
+//    ascending order according to the comparator. However, no specific
+//    ordering is required across prefixes. Users can scan keys by seeking
+//    to a specific key inside a prefix, and iterate forwards or backwards
+//    within the prefix. The prefix_same_as_start flag in ReadOptions will
+//    be true.
+// 3. Both - If supporting both of the above, a user can seek inside a prefix
+//    and iterate beyond the prefix. The prefix_same_as_start in ReadOptions
+//    will be false. Additionally, the total_order_seek flag can be set to
+//    true to seek to the first non-empty prefix (as determined by the key
+//    order) if the seek prefix is empty.
+//
+// Many of the options in ReadOptions and WriteOptions may not be relevant to
+// the external table implementation.
+// TODO: Specify which options are relevant
+
+class ExternalTableIterator : public IteratorBase {
+ public:
+  virtual ~ExternalTableIterator() {}
+
+  // This can optionally be called to prepare the iterator for a series
+  // of scans. The scan_opts parameter specifies the order of scans to
+  // follow, as well as the limits for those scans. After calling this,
+  // the caller will Seek() the iterator to successive start keys in scan_opts.
+  //
+  // If Prepare() is called again with a different scan_opts pointer, it
+  // means the iterator will be reused for a new multi scan. If scan_opts
+  // is null, then the previous Prepare() can be discarded.
+  //
+  // The caller guarantees the lifetime of scan_opts until its either cleared
+  // or replaced by another Prepare().
+  // TODO: Update the contract to trim the scan_opts range to only include
+  // scans that potentially intersect the file key range.
+  //
+  // If the sequence of Seeks is interrupted by seeking to some other target
+  // key, then the iterator is free to discard anything done during Prepare.
+  virtual void Prepare(const ScanOptions scan_opts[], size_t num_opts) = 0;
+
+  // Similar to Next(), except it also fills the result and returns whether
+  // the iterator is on a valid key or not
+  virtual bool NextAndGetResult(IterateResult* result) = 0;
+
+  // Prepares the value if its lazily materialized. The implementation can
+  // request that this be called by setting value_prepared to false in
+  // IterateResult. Next() should always implicitly materialize the
+  // value.
+  bool PrepareValue() override = 0;
+
+  // Return the current key's value
+  virtual Slice value() const = 0;
+
+  // Return the current position bounds check result - kInbound if the
+  // position is a valid key, kOutOfBound if the key is out of bound (i.e
+  // scan has terminated), or kUnknown if end of file.
+  virtual IterBoundCheck UpperBoundCheckResult() = 0;
+};
+
+class ExternalTableReader {
+ public:
+  virtual ~ExternalTableReader() {}
+
+  // Return an Iterator that can be used to scan the table file.
+  // The read_options can optionally contain the upper bound
+  // key (exclusive) of the scan in iterate_upper_bound.
+  virtual ExternalTableIterator* NewIterator(
+      const ReadOptions& read_options,
+      const SliceTransform* prefix_extractor) = 0;
+
+  // Point lookup the given key and return its value
+  virtual Status Get(const ReadOptions& read_options, const Slice& key,
+                     const SliceTransform* prefix_extractor,
+                     std::string* value) = 0;
+
+  // Point lookup the given vector of keys and return the values, as well
+  // as status of each individual lookup in statuses.
+  virtual void MultiGet(const ReadOptions& read_options,
+                        const std::vector<Slice>& keys,
+                        const SliceTransform* prefix_extractor,
+                        std::vector<std::string>* values,
+                        std::vector<Status>* statuses) = 0;
+
+  // Allocate and return the contents of the properties block. If the builder
+  // supports PutPropertiesBlock(), then this must be supported. The
+  // properties block should be written to the table file as is (no
+  // compression or mutation of any kind), and its offset in the file
+  // should be returned in file_offset.
+  virtual Status GetPropertiesBlock(std::unique_ptr<char[]>* /*property_block*/,
+                                    uint64_t* /*size*/,
+                                    uint64_t* /*file_offset*/) {
+    return Status::NotSupported();
+  }
+
+  // Return TableProperties for the file. At a minimum, the following
+  // properties need to be returned -
+  // comparator_name
+  // num_entries
+  // raw_key_size
+  // raw_value_size
+  virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0;
+
+  virtual Status VerifyChecksum(const ReadOptions& /*ro*/) {
+    return Status::NotSupported("VerifyChecksum() not supported");
+  }
+};
+
+// A table builder interface that can be used by SstFileWriter to allow
+// RocksDB users to write external table files. The sequence of operations
+// to write an external table is as follows -
+// 1. Add() is called one or more times to write all key-values to the table.
+//    Its called in increasing key order, as determined by the comparator.
+//    The input key is a user key, i.e sequence number and value type are
+//    stripped out.
+// 2. After every Add() operation, status() is called to check the current
+//    status.
+// 3. After the last key is added, Finish() is called to do whatever is
+//    necessary to ensure the data is persisted in the table file.
+// 4. If there is a failure midway for some reason, Abandon() is called
+//    instead of Finish().
+// 5. At the end, FileSize(), GetTableProperties(), and status() are called to
+//    get the final size of the file, the table properties, and the final
+//    status. GetFileChecksum() and GetFileChecksumFuncName() may also be
+//    called to get checksum information about the whole file, but their
+//    implementation is optional.
+class ExternalTableBuilder {
+ public:
+  virtual ~ExternalTableBuilder() {}
+
+  // Write a single KV to the table file. This is guaranteed to be called
+  // in key order, and the write may be buffered and flushed at a later time.
+  virtual void Add(const Slice& key, const Slice& value) = 0;
+
+  // Return the current Status. This could return non-ok, for example, if
+  // Add() fails for some reason.
+  virtual Status status() const = 0;
+
+  // Flush and close the table file
+  virtual Status Finish() = 0;
+
+  // Delete the partial file and release any allocated resources. Either this
+  // or Finish() will be called, but not both.
+  virtual void Abandon() = 0;
+
+  // Return the size of the table file. Will be called at the end, after
+  // Finish().
+  virtual uint64_t FileSize() const = 0;
+
+  // Write the raw properties block as is in the table file
+  virtual Status PutPropertiesBlock(const Slice& /*property_block*/) {
+    return Status::NotSupported();
+  }
+
+  //  As mentioned in earlier comments, the following table properties must be
+  //  returned at a minimum -
+  //  comparator_name
+  //  num_entries
+  //  raw_key_size
+  //  raw_value_size
+  virtual TableProperties GetTableProperties() const = 0;
+
+  virtual std::string GetFileChecksum() const { return kUnknownFileChecksum; }
+
+  virtual const char* GetFileChecksumFuncName() const {
+    return kUnknownFileChecksumFuncName;
+  }
+};
+
+struct ExternalTableOptions {
+  const std::shared_ptr<const SliceTransform>& prefix_extractor;
+  const Comparator* comparator;
+  const std::shared_ptr<FileSystem>& fs;
+  const FileOptions& file_options;
+
+  ExternalTableOptions(
+      const std::shared_ptr<const SliceTransform>& _prefix_extractor,
+      const Comparator* _comparator, const std::shared_ptr<FileSystem>& _fs,
+      const FileOptions& _file_options)
+      : prefix_extractor(_prefix_extractor),
+        comparator(_comparator),
+        fs(_fs),
+        file_options(_file_options) {}
+};
+
+struct ExternalTableBuilderOptions {
+  const ReadOptions& read_options;
+  const WriteOptions& write_options;
+  const std::shared_ptr<const SliceTransform>& prefix_extractor;
+  const Comparator* comparator;
+  const std::string& column_family_name;
+  const std::string db_id;
+  const std::string db_session_id;
+  const TableFileCreationReason reason;
+
+  ExternalTableBuilderOptions(
+      const ReadOptions& _read_options, const WriteOptions& _write_options,
+      const std::shared_ptr<const SliceTransform>& _prefix_extractor,
+      const Comparator* _comparator, const std::string& _column_family_name,
+      const TableFileCreationReason _reason)
+      : read_options(_read_options),
+        write_options(_write_options),
+        prefix_extractor(_prefix_extractor),
+        comparator(_comparator),
+        column_family_name(_column_family_name),
+        reason(_reason) {}
+};
+
+class ExternalTableFactory : public Customizable {
+ public:
+  ~ExternalTableFactory() override {}
+
+  const char* Name() const override { return "ExternalTableFactory"; }
+
+  virtual Status NewTableReader(
+      const ReadOptions& read_options, const std::string& file_path,
+      const ExternalTableOptions& table_options,
+      std::unique_ptr<ExternalTableReader>* table_reader) const = 0;
+
+  // The table builder should use the file pointer to append to the file.
+  // Do not sync or close the file after finishing. RocksDB will do that.
+  virtual ExternalTableBuilder* NewTableBuilder(
+      const ExternalTableBuilderOptions& builder_options,
+      const std::string& file_path, FSWritableFile* file) const = 0;
+};
+
+// Allocate a TableFactory that wraps around an ExternalTableFactory. Use this
+// to allocate and set in ColumnFamilyOptions::table_factory.
+std::unique_ptr<TableFactory> NewExternalTableFactory(
+    std::shared_ptr<ExternalTableFactory> inner_factory);
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/external_table_reader.h b/include/rocksdb/external_table_reader.h
deleted file mode 100644
index 9bba9f4f3eff..000000000000
--- a/include/rocksdb/external_table_reader.h
+++ /dev/null
@@ -1,124 +0,0 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-
-#pragma once
-
-#include "rocksdb/customizable.h"
-#include "rocksdb/iterator.h"
-#include "rocksdb/options.h"
-#include "rocksdb/status.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-class ExternalTableFactory;
-
-// EXPERIMENTAL
-// The interface defined in this file is subject to change at any time without
-// warning!!
-
-// This file defines an interface for plugging in an external table reader
-// into RocksDB. The external table reader will be used instead of the
-// BlockBasedTable to load and query sst files. As of now, creating the
-// external table files using RocksDB is not supported, but will be added in
-// the near future. The external table files can be created outside and
-// RocksDB and ingested into a RocksDB instance using the IngestExternalFIle()
-// API.
-//
-// Initial support is for loading and querying the files using an
-// SstFileReader. We will add support for ingestion of an external table
-// into a limited RocksDB instance that only supports ingestion and not live
-// writes in the near future. It'll be followed by support for replacing the
-// column family by ingesting a new set of files. In all cases, the external
-// table files will only be allowed in the bottommost level.
-//
-// The external table reader can support one or both of the following layouts -
-// 1. Total order seek - All the keys in the files are in sorted order, and a
-//    user can seek to the first, last, or any key in between and iterate
-//    forwards or backwards till the end of the range. To support this mode,
-//    the implementation needs to use the comparator passed in
-//    ExternalTableOptions to enforce the key ordering. The prefix_extractor
-//    in ExternalTableOptions and the ExternalTableReader interfaces can be
-//    ignored.
-// 2. Prefix seek - In this mode, the prefix_extractor is used to extract the
-//    prefix from a key. All the keys sharing the same prefix are ordered in
-//    ascending order according to the comparator. However, no specific
-//    ordering is required across prefixes. Users can scan keys by seeking
-//    to a specific key inside a prefix, and iterate forwards or backwards
-//    within the prefix. The prefix_same_as_start flag in ReadOptions will
-//    be true.
-// 3. Both - If supporting both of the above, a user can seek inside a prefix
-//    and iterate beyond the prefix. The prefix_same_as_start in ReadOptions
-//    will be false. Additionally, the total_order_seek flag can be set to
-//    true to seek to the first non-empty prefix (as determined by the key
-//    order) if the seek prefix is empty.
-//
-// Many of the options in ReadOptions may not be relevant to the external
-// table implementation.
-// TODO: Specify which options are relevant
-
-class ExternalTableReader {
- public:
-  virtual ~ExternalTableReader() {}
-
-  // Return an Iterator that can be used to scan the table file.
-  // The read_options can optionally contain the upper bound
-  // key (exclusive) of the scan in iterate_upper_bound.
-  virtual Iterator* NewIterator(const ReadOptions& read_options,
-                                const SliceTransform* prefix_extractor) = 0;
-
-  // Point lookup the given key and return its value
-  virtual Status Get(const ReadOptions& read_options, const Slice& key,
-                     const SliceTransform* prefix_extractor,
-                     std::string* value) = 0;
-
-  // Point lookup the given vector of keys and return the values, as well
-  // as status of each individual lookup in statuses.
-  virtual void MultiGet(const ReadOptions& read_options,
-                        const std::vector<Slice>& keys,
-                        const SliceTransform* prefix_extractor,
-                        std::vector<std::string>* values,
-                        std::vector<Status>* statuses) = 0;
-
-  // Return TableProperties for the file. At a minimum, the following
-  // properties need to be returned -
-  // comparator_name
-  // num_entries
-  // raw_key_size
-  // raw_value_size
-  virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0;
-
-  virtual Status VerifyChecksum(const ReadOptions& /*ro*/) {
-    return Status::NotSupported("VerifyChecksum() not supported");
-  }
-};
-
-struct ExternalTableOptions {
-  const std::shared_ptr<const SliceTransform>& prefix_extractor;
-  const Comparator* comparator;
-
-  ExternalTableOptions(
-      const std::shared_ptr<const SliceTransform>& _prefix_extractor,
-      const Comparator* _comparator)
-      : prefix_extractor(_prefix_extractor), comparator(_comparator) {}
-};
-
-class ExternalTableFactory : public Customizable {
- public:
-  ~ExternalTableFactory() override {}
-
-  const char* Name() const override { return "ExternalTableFactory"; }
-
-  virtual Status NewTableReader(
-      const ReadOptions& read_options, const std::string& file_path,
-      const ExternalTableOptions& table_options,
-      std::unique_ptr<ExternalTableReader>* table_reader) = 0;
-};
-
-// Allocate a TableFactory that wraps around an ExternalTableFactory. Use this
-// to allocate and set in ColumnFamilyOptions::table_factory.
-std::shared_ptr<TableFactory> NewExternalTableFactory(
-    std::shared_ptr<ExternalTableFactory> inner_factory);
-
-}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/file_checksum.h b/include/rocksdb/file_checksum.h
index 66024d0a1b4e..70de891f2c05 100644
--- a/include/rocksdb/file_checksum.h
+++ b/include/rocksdb/file_checksum.h
@@ -22,7 +22,12 @@ namespace ROCKSDB_NAMESPACE {
 // The unknown file checksum.
 constexpr char kUnknownFileChecksum[] = "";
 // The unknown sst file checksum function name.
+// Indicates that the file metadata says that no checksum factory was configured
+// when the file was written.
 constexpr char kUnknownFileChecksumFuncName[] = "Unknown";
+// Used when opening a file and there is no file checksum metadata to propagate
+// at all.
+constexpr char kNoFileChecksumFuncName[] = "Unavailable";
 // The standard DB file checksum function name.
 // This is the name of the checksum function returned by
 // GetFileChecksumGenCrc32cFactory();
@@ -80,7 +85,8 @@ class FileChecksumGenFactory : public Customizable {
       const ConfigOptions& options, const std::string& value,
       std::shared_ptr<FileChecksumGenFactory>* result);
 
-  // Create a new FileChecksumGenerator.
+  // Create a new FileChecksumGenerator. Recommended to return nullptr if the
+  // requested function name is not recognized.
   virtual std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
       const FileChecksumGenContext& context) = 0;
 
diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index 27e497f432b5..ea9d52bf6b30 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -18,11 +18,13 @@
 
 #include <stdint.h>
 
+#include <any>
 #include <chrono>
 #include <cstdarg>
 #include <functional>
 #include <limits>
 #include <memory>
+#include <shared_mutex>
 #include <sstream>
 #include <string>
 #include <unordered_map>
@@ -88,6 +90,7 @@ enum FSSupportedOps {
   kVerifyAndReconstructRead,  // Supports a higher level of data integrity. See
                               // the verify_and_reconstruct_read flag in
                               // IOOptions.
+  kFSPrefetch,                // Supports prefetch operations
 };
 
 // Per-request options that can be passed down to the FileSystem
@@ -192,6 +195,25 @@ struct FileOptions : EnvOptions {
   // handoff during file writes.
   ChecksumType handoff_checksum_type;
 
+  // Expose write lifetime hint on the FileOptions level to provide more
+  // flexibility in setting the hint in downstream, custom implementations
+  // that might be able to process the hint only at the time of the actual
+  // FSWritableFile object creation.
+  Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET;
+
+  // File checksum of the file being opened. Empty string if no checksum is
+  // available.
+  std::string file_checksum;
+
+  // Name of the checksum function used to compute file_checksum. Set to
+  // kUnknownFileChecksumFuncName when file was created without a checksum
+  // factory. Set to kNoFileChecksumFuncName when no checksum metadata is
+  // available.
+  // Production FileSystems will accept empty values for both
+  // file_checksum and file_checksum_func_name, but internally within RocksDB
+  // that is forbidden for checking/auditing purposes.
+  std::string file_checksum_func_name;
+
   FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {}
 
   FileOptions(const DBOptions& opts)
@@ -206,13 +228,18 @@ struct FileOptions : EnvOptions {
       : EnvOptions(opts),
         io_options(opts.io_options),
         temperature(opts.temperature),
-        handoff_checksum_type(opts.handoff_checksum_type) {}
+        handoff_checksum_type(opts.handoff_checksum_type),
+        write_hint(opts.write_hint),
+        file_checksum(opts.file_checksum),
+        file_checksum_func_name(opts.file_checksum_func_name) {}
 
   FileOptions& operator=(const FileOptions&) = default;
 };
 
 // A structure to pass back some debugging information from the FileSystem
 // implementation to RocksDB in case of an IO error
+// TODO(virajthakur): Update all calls to FS APIs for writes to pass in
+// IODebugContext
 struct IODebugContext {
   // file_path to be filled in by RocksDB in case of an error
   std::string file_path;
@@ -223,8 +250,9 @@ struct IODebugContext {
   // To be set by the FileSystem implementation
   std::string msg;
 
-  // To be set by the underlying FileSystem implementation.
-  std::string request_id;
+  // To be set by the application, to allow tracing logs/metrics from user ->
+  // RocksDB -> FS.
+  const std::string* request_id = nullptr;
 
   // In order to log required information in IO tracing for different
   // operations, Each bit in trace_data stores which corresponding info from
@@ -240,16 +268,48 @@ struct IODebugContext {
   };
   uint64_t trace_data = 0;
 
+  // Arbitrary structure containing cost information about the IO request
+  std::any cost_info;
+
+  // FileSystem implementations can use this mutex to synchronize concurrent
+  // reads/writes as needed (e.g. to update the counters or cost_info field)
+  std::shared_mutex mutex;
+
   IODebugContext() {}
 
+  // Copy constructor
+  IODebugContext(const IODebugContext& other)
+      : file_path(other.file_path),
+        counters(other.counters),
+        msg(other.msg),
+        trace_data(other.trace_data),
+        cost_info(other.cost_info),
+        _request_id(other.request_id ? *other.request_id : "") {
+    request_id = other.request_id ? &_request_id : nullptr;
+  }
+
+  // Copy assignment operator
+  IODebugContext& operator=(const IODebugContext& other) {
+    if (this != &other) {
+      file_path = other.file_path;
+      counters = other.counters;
+      msg = other.msg;
+      trace_data = other.trace_data;
+      cost_info = other.cost_info;
+      _request_id = other.request_id ? *other.request_id : "";
+      request_id = other.request_id ? &_request_id : nullptr;
+    }
+    return *this;
+  }
+
   void AddCounter(std::string& name, uint64_t value) {
     counters.emplace(name, value);
   }
 
   // Called by underlying file system to set request_id and log request_id in
   // IOTracing.
-  void SetRequestId(const std::string& _request_id) {
-    request_id = _request_id;
+  void SetRequestId(const std::string* updated_request_id) {
+    request_id = updated_request_id;
     trace_data |= (1 << TraceData::kRequestID);
   }
 
@@ -262,6 +322,12 @@ struct IODebugContext {
     ss << msg;
     return ss.str();
   }
+
+ private:
+  // Private member that allows for safe copying of IODebugContext without any
+  // memory ownership issues. After copying, request_id can point directly to
+  // this field.
+  std::string _request_id;
 };
 
 // A function pointer type for custom destruction of void pointer passed to
@@ -507,7 +573,7 @@ class FileSystem : public Customizable {
   }
 
 // This seems to clash with a macro on Windows, so #undef it here
-#ifdef DeleteFile
+#ifdef DeleteFile  // ODR-SAFE
 #undef DeleteFile
 #endif
   // Delete the named file.
@@ -668,7 +734,7 @@ class FileSystem : public Customizable {
       const ImmutableDBOptions& db_options) const;
 
 // This seems to clash with a macro on Windows, so #undef it here
-#ifdef GetFreeSpace
+#ifdef GetFreeSpace  // ODR-SAFE
 #undef GetFreeSpace
 #endif
 
@@ -699,7 +765,7 @@ class FileSystem : public Customizable {
   // Abort the read IO requests submitted asynchronously. Underlying FS is
   // required to support AbortIO API. AbortIO implementation should ensure that
   // the all the read requests related to io_handles should be aborted and
-  // it shouldn't call the callback for these io_handles.
+  // it should call the callback for these io_handles.
   virtual IOStatus AbortIO(std::vector<void*>& /*io_handles*/) {
     return IOStatus::OK();
   }
@@ -721,12 +787,13 @@ class FileSystem : public Customizable {
   //  If async_io is supported by the underlying FileSystem, then supported_ops
   //  will have corresponding bit (i.e FSSupportedOps::kAsyncIO) set to 1.
   //
-  // By default, async_io operation is set and FS should override this API and
-  // set all the operations they support provided in FSSupportedOps (including
-  // async_io).
+  // By default, async_io and prefetch operation are set and FS should override
+  // this API and set all the operations they support provided in FSSupportedOps
+  // (including async_io and prefetch).
   virtual void SupportedOps(int64_t& supported_ops) {
     supported_ops = 0;
     supported_ops |= (1 << FSSupportedOps::kAsyncIO);
+    supported_ops |= (1 << FSSupportedOps::kFSPrefetch);
   }
 
   // If you're adding methods here, remember to add them to EnvWrapper too.
@@ -1006,6 +1073,14 @@ class FSRandomAccessFile {
   // open.
   virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
 
+  // Get the file size on an open-for-reading file without re-seeking the file's
+  // path in the filesystem. The default implementation returns "not supported"
+  // so that user implementations of FSRandomAccessFile do not need to
+  // immediately implement this function.
+  virtual IOStatus GetFileSize(uint64_t* /*result*/) {
+    return IOStatus::NotSupported("GetFileSize Not Supported");
+  }
+
   // If you're adding methods here, remember to add them to
   // RandomAccessFileWrapper too.
 };
@@ -1106,8 +1181,10 @@ class FSWritableFile {
 
   // Truncate is necessary to trim the file to the correct size
   // before closing. It is not always possible to keep track of the file
-  // size due to whole pages writes. The behavior is undefined if called
-  // with other writes to follow.
+  // size due to whole pages writes. If called with other writes to follow,
+  // the behavior is file system specific. Posix will reseek to the new EOF.
+  // Other file systems may behave differently. Its the caller's
+  // responsibility to check the file system contract.
   virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/,
                             IODebugContext* /*dbg*/) {
     return IOStatus::OK();
@@ -1727,6 +1804,10 @@ class FSRandomAccessFileWrapper : public FSRandomAccessFile {
     return target_->GetTemperature();
   }
 
+  virtual IOStatus GetFileSize(uint64_t* result) override {
+    return target_->GetFileSize(result);
+  }
+
  private:
   std::unique_ptr<FSRandomAccessFile> guard_;
   FSRandomAccessFile* target_;
diff --git a/include/rocksdb/functor_wrapper.h b/include/rocksdb/functor_wrapper.h
index 17b021bf73b5..50007b85d77a 100644
--- a/include/rocksdb/functor_wrapper.h
+++ b/include/rocksdb/functor_wrapper.h
@@ -44,7 +44,7 @@ void call(Function f, Tuple t) {
 template <typename... Args>
 class FunctorWrapper {
  public:
-  explicit FunctorWrapper(std::function<void(Args...)> functor, Args &&...args)
+  explicit FunctorWrapper(std::function<void(Args...)> functor, Args&&... args)
       : functor_(std::move(functor)), args_(std::forward<Args>(args)...) {}
 
   void invoke() { detail::call(functor_, args_); }
diff --git a/include/rocksdb/io_dispatcher.h b/include/rocksdb/io_dispatcher.h
new file mode 100644
index 000000000000..6354d72ad36d
--- /dev/null
+++ b/include/rocksdb/io_dispatcher.h
@@ -0,0 +1,358 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "rocksdb/options.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileSystem;
+class Statistics;
+
+// Forward declaration for internal implementation
+struct IODispatcherImplData;
+struct PendingPrefetchRequest;
+
+// Options for configuring IODispatcher behavior
+struct IODispatcherOptions {
+  // Maximum memory (in bytes) for prefetching across all ReadSets.
+  // When this limit is reached, SubmitJob() blocks until memory is released.
+  // Set to 0 (default) for unlimited prefetch memory.
+  size_t max_prefetch_memory_bytes = 0;
+
+  // Optional statistics for tracking memory limiter metrics
+  Statistics* statistics = nullptr;
+};
+
+/*
+ * IODispatcher is a class that allows users to submit groups of IO jobs to be
+ * dispatched asynchronously (or synchronously), upon submission the
+ * IODispatcher will return a ReadSet which act as an ownership object of those
+ * IOs. Users read from their readset when they require the data, and either
+ * poll for completion of the block, or read synchronously if the block is not
+ * in cache at that point.
+ *
+ * ReadSets have RAII semantics, meaning on destruction they will cancel any on
+ * going IO, and release the underlying pinned blocks.
+ *
+ * IODispatcher main goal is to act as control plane for all readers using the
+ * dispatcher, allowing for future ratelimiting and smarter dispatching policies
+ * in the future.
+ *
+ * Example 1: Basic Usage
+ * ----------------------
+ * // Submitting an IO job and reading blocks:
+ * //
+ * // std::shared_ptr<IOJob> job = std::make_shared<IOJob>();
+ * // job->table = table_reader;  // Provided BlockBasedTable*
+ * // job->job_options.io_coalesce_threshold = 32 * 1024;
+ * // job->job_options.read_options = read_options;  // Provided ReadOptions
+ * //
+ * // // Populate the job with block handles (e.g., from an index/iterator)
+ * // job->block_handles.push_back(handle1);
+ * // job->block_handles.push_back(handle2);
+ * // job->block_handles.push_back(handle3);
+ * //
+ * // std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+ * // std::shared_ptr<ReadSet> read_set;
+ * // Status s = dispatcher->SubmitJob(job, &read_set);
+ * // if (!s.ok()) {
+ * //   // Handle submit error
+ * // }
+ * //
+ * // // Read by index
+ * // for (size_t i = 1; i < job->block_handles.size(); ++i) {
+ * //   CachableEntry<Block> block_entry;
+ * //   Status rs = read_set->ReadIndex(i, &block_entry);
+ * //   if (!rs.ok()) {
+ * //     // Handle read error
+ * //     continue;
+ * //   }
+ * //   // Use block_entry (block contents are pinned here)
+ * // }
+ * //
+ * // // Or read by byte offset
+ * // {
+ * //   size_t offset =
+ static_cast<size_t>(job->block_handles.front().offset());
+ * //   CachableEntry<Block> block_entry;
+ * //   Status rs = read_set->ReadOffset(offset, &block_entry);
+ * //   if (rs.ok()) {
+ * //     // Use block_entry
+ * //   }
+ * // }
+ * //
+ * // // Stats
+ * // uint64_t cache_hits = read_set->GetNumCacheHits();
+ * // uint64_t async_reads = read_set->GetNumAsyncReads();
+ * // uint64_t sync_reads = read_set->GetNumSyncReads();
+ *
+ * Example 2: Memory-Limited Prefetching
+ * -------------------------------------
+ * // Configure a memory budget for prefetching to prevent unbounded memory use.
+ * // When the budget is exceeded, IODispatcher uses "partial prefetch":
+ * //   - Dispatches as many blocks as fit in available memory (earlier first)
+ * //   - Queues remaining blocks for later dispatch when memory is released
+ * //   - Never blocks on SubmitJob - remaining blocks are read on-demand
+ * //
+ * // IODispatcherOptions opts;
+ * // opts.max_prefetch_memory_bytes = 64 * 1024 * 1024;  // 64MB budget
+ * // opts.statistics = db_options.statistics.get();      // Optional metrics
+ * //
+ * // std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+ * //
+ * // // Submit a job that needs more memory than available
+ * // // Partial prefetch will dispatch what fits immediately
+ * // std::shared_ptr<ReadSet> read_set;
+ * // Status s = dispatcher->SubmitJob(job, &read_set);  // Never blocks
+ * //
+ * // // Read blocks in order - earlier blocks are more likely to be prefetched
+ * // for (size_t i = 0; i < job->block_handles.size(); ++i) {
+ * //   CachableEntry<Block> block;
+ * //   Status rs = read_set->ReadIndex(i, &block);
+ * //   // Use block...
+ * //
+ * //   // Release block when done to free memory for pending prefetches
+ * //   read_set->ReleaseBlock(i);  // Triggers dispatch of queued blocks
+ * // }
+ * //
+ * // Memory limiting statistics (when statistics is configured):
+ * // - PREFETCH_MEMORY_BYTES_GRANTED: Total bytes acquired for prefetching
+ * // - PREFETCH_MEMORY_BYTES_RELEASED: Total bytes released after use
+ * // - PREFETCH_MEMORY_REQUESTS_BLOCKED: Number of blocks that couldn't be
+ * //   prefetched immediately due to memory pressure
+
+ */
+
+class BlockHandle;
+struct ReadOptions;
+struct AsyncIOState;
+
+template <typename T>
+class CachableEntry;
+class Block;
+class BlockBasedTable;
+
+struct JobOptions {
+  uint64_t io_coalesce_threshold = 16 * 1024;
+  ReadOptions read_options;
+};
+
+class IOJob {
+ public:
+  std::vector<BlockHandle> block_handles;
+
+  // Table reader for accessing block cache and index
+  BlockBasedTable* table = nullptr;
+
+  // Job execution options
+  JobOptions job_options;
+};
+
+/*
+ * ReadSet represents a set of blocks that may be in cache, being read
+ * asynchronously, or need to be read synchronously. The Read() method
+ * transparently handles all three cases.
+ */
+class ReadSet {
+ public:
+  ReadSet() = default;
+  ~ReadSet();
+
+  ReadSet(const ReadSet&) = delete;
+  ReadSet& operator=(const ReadSet&) = delete;
+  ReadSet(ReadSet&&) noexcept = delete;
+  ReadSet& operator=(ReadSet&&) noexcept = delete;
+
+  // Read a block by index
+  // - If the block is in cache, returns it immediately
+  // - If the block is being read asynchronously, polls for completion and
+  // returns it
+  // - If the block needs to be read, performs a synchronous read and returns it
+  //
+  // block_index: Index into the original IOJob's block_handles vector
+  // out: Output parameter for the pinned block entry
+  //
+  // Returns: Status::OK() on success, error status otherwise
+  Status ReadIndex(size_t block_index, CachableEntry<Block>* out);
+  // Read a block by offset
+  // - If the block is in cache, returns it immediately
+  // - If the block is being read asynchronously, polls for completion and
+  // returns it
+  // - If the block needs to be read, performs a synchronous read and returns it
+
+  // block_offset: Byte Offset into the SST file of the block.
+
+  // out: Output parameter for the pinned block entry
+  Status ReadOffset(size_t offset, CachableEntry<Block>* out);
+
+  // Release a block by index, unpinning it from cache.
+  // After this call, ReadIndex() for this block will return an error.
+  // This is useful for eager memory reclamation when blocks are no longer
+  // needed.
+  void ReleaseBlock(size_t block_index);
+
+  // Check if a block at the given index is still available (not released).
+  // Returns true if the block can be read, false otherwise.
+  bool IsBlockAvailable(size_t block_index) const;
+
+  // Statistics accessors
+  uint64_t GetNumSyncReads() const { return num_sync_reads_; }
+  uint64_t GetNumAsyncReads() const { return num_async_reads_; }
+  uint64_t GetNumCacheHits() const { return num_cache_hits_; }
+
+ private:
+  friend class IODispatcherImpl;
+
+  // Job data
+  std::shared_ptr<IOJob> job_;
+
+  // FileSystem for calling AbortIO in destructor
+  std::shared_ptr<FileSystem> fs_;
+
+  // Storage for pinned blocks (one per block handle in the job)
+  std::vector<CachableEntry<Block>> pinned_blocks_;
+
+  // Sorted index for binary search in ReadOffset.
+  // sorted_block_indices_[i] is the original index of the i-th smallest block
+  // by offset. Built once during SubmitJob for O(log n) ReadOffset lookups.
+  std::vector<size_t> sorted_block_indices_;
+
+  // Map from block index to async IO state for blocks being read
+  // asynchronously. Multiple block indices may map to the same async state when
+  // blocks are coalesced into a single IO request.
+  std::unordered_map<size_t, std::shared_ptr<AsyncIOState>> async_io_map_;
+
+  // For memory release notifications back to dispatcher (weak ref to avoid
+  // cycles)
+  std::weak_ptr<IODispatcherImplData> dispatcher_data_;
+
+  // Size of each block (parallel to pinned_blocks_) for memory accounting
+  std::vector<size_t> block_sizes_;
+
+  // Statistics counters
+  std::atomic<uint64_t> num_sync_reads_ = 0;
+  std::atomic<uint64_t> num_async_reads_ = 0;
+  std::atomic<uint64_t> num_cache_hits_ = 0;
+
+  // Poll and process a specific async IO request
+  Status PollAndProcessAsyncIO(
+      const std::shared_ptr<AsyncIOState>& async_state);
+
+  // Perform synchronous read for a specific block
+  Status SyncRead(size_t block_index);
+
+  // Remove a block from pending prefetch (called by ReadIndex/ReleaseBlock)
+  void RemoveFromPending(size_t block_index);
+
+  // Atomic flags indicating if block is pending prefetch (lock-free check)
+  std::unique_ptr<std::atomic<bool>[]> pending_prefetch_flags_;
+  size_t pending_prefetch_flags_size_ = 0;
+
+  // Reference to pending request (for removal notification)
+  std::shared_ptr<PendingPrefetchRequest> pending_request_;
+};
+
+/*
+ * IODispatcher handles IO operations synchronously or asynchronously based
+ * on JobOptions. When async is true, it uses ReadAsync; when false, it uses
+ * standard synchronous reads.
+ * */
+class IODispatcher {
+ protected:
+  IODispatcher() = default;
+
+ public:
+  virtual ~IODispatcher() {}
+
+  IODispatcher(const IODispatcher&) = delete;
+  IODispatcher& operator=(const IODispatcher&) = delete;
+  IODispatcher(IODispatcher&&) = delete;
+  IODispatcher& operator=(IODispatcher&&) = delete;
+
+  // Submit a job for IO processing
+  // job: The IO job to submit
+  // read_set: Output parameter that will be populated with the ReadSet on
+  // success Returns: Status::OK() on success, error status otherwise
+  virtual Status SubmitJob(const std::shared_ptr<IOJob>& job,
+                           std::shared_ptr<ReadSet>* read_set) = 0;
+};
+
+// Create IODispatcher with default options (no memory limit)
+IODispatcher* NewIODispatcher();
+
+// Create IODispatcher with custom options
+IODispatcher* NewIODispatcher(const IODispatcherOptions& options);
+
+// TrackingIODispatcher wraps another IODispatcher and tracks all ReadSets
+// created. This is useful for testing to verify IO statistics.
+class TrackingIODispatcher : public IODispatcher {
+ public:
+  TrackingIODispatcher() : impl_(NewIODispatcher()) {}
+  explicit TrackingIODispatcher(IODispatcher* impl) : impl_(impl) {}
+
+  Status SubmitJob(const std::shared_ptr<IOJob>& job,
+                   std::shared_ptr<ReadSet>* read_set) override {
+    Status s = impl_->SubmitJob(job, read_set);
+    if (s.ok() && read_set && *read_set) {
+      read_sets_.push_back(*read_set);
+    }
+    return s;
+  }
+
+  // Get all ReadSets created by this dispatcher
+  const std::vector<std::shared_ptr<ReadSet>>& GetReadSets() const {
+    return read_sets_;
+  }
+
+  // Get aggregated statistics from all ReadSets
+  uint64_t GetTotalSyncReads() const {
+    uint64_t total = 0;
+    for (const auto& rs : read_sets_) {
+      total += rs->GetNumSyncReads();
+    }
+    return total;
+  }
+
+  uint64_t GetTotalAsyncReads() const {
+    uint64_t total = 0;
+    for (const auto& rs : read_sets_) {
+      total += rs->GetNumAsyncReads();
+    }
+    return total;
+  }
+
+  uint64_t GetTotalCacheHits() const {
+    uint64_t total = 0;
+    for (const auto& rs : read_sets_) {
+      total += rs->GetNumCacheHits();
+    }
+    return total;
+  }
+
+  // Get total IO operations (sum of all types)
+  uint64_t GetTotalIOOperations() const {
+    return GetTotalSyncReads() + GetTotalAsyncReads() + GetTotalCacheHits();
+  }
+
+  // Clear tracked ReadSets
+  void ClearReadSets() { read_sets_.clear(); }
+
+ private:
+  std::unique_ptr<IODispatcher> impl_;
+  std::vector<std::shared_ptr<ReadSet>> read_sets_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h
index 592bc0c46709..8fce6181c0b4 100644
--- a/include/rocksdb/iostats_context.h
+++ b/include/rocksdb/iostats_context.h
@@ -32,22 +32,47 @@ struct FileIOByTemperature {
   uint64_t hot_file_bytes_read;
   // the number of bytes read to Temperature::kWarm file
   uint64_t warm_file_bytes_read;
+  // the number of bytes read to Temperature::kCool file
+  uint64_t cool_file_bytes_read;
   // the number of bytes read to Temperature::kCold file
   uint64_t cold_file_bytes_read;
+  // the number of bytes read to Temperature::kIce file
+  uint64_t ice_file_bytes_read;
+  // the number of bytes read to Temperature::kUnknown file not in last level
+  uint64_t unknown_non_last_level_bytes_read;
+  // the number of bytes read to Temperature::kUnknown file in last level
+  uint64_t unknown_last_level_bytes_read;
   // total number of reads to Temperature::kHot file
   uint64_t hot_file_read_count;
   // total number of reads to Temperature::kWarm file
   uint64_t warm_file_read_count;
+  // total number of reads to Temperature::kCool file
+  uint64_t cool_file_read_count;
   // total number of reads to Temperature::kCold file
   uint64_t cold_file_read_count;
+  // total number of reads to Temperature::kIce file
+  uint64_t ice_file_read_count;
+  // total number of reads to Temperature::kUnknown file not in last level
+  uint64_t unknown_non_last_level_read_count;
+  // total number of reads to Temperature::kUnknown file in last level
+  uint64_t unknown_last_level_read_count;
+
   // reset all the statistics to 0.
   void Reset() {
     hot_file_bytes_read = 0;
     warm_file_bytes_read = 0;
+    cool_file_bytes_read = 0;
     cold_file_bytes_read = 0;
+    ice_file_bytes_read = 0;
+    unknown_non_last_level_bytes_read = 0;
+    unknown_last_level_bytes_read = 0;
     hot_file_read_count = 0;
     warm_file_read_count = 0;
+    cool_file_read_count = 0;
     cold_file_read_count = 0;
+    ice_file_read_count = 0;
+    unknown_non_last_level_read_count = 0;
+    unknown_last_level_read_count = 0;
   }
 };
 
diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h
index 51bead99b907..b006138376aa 100644
--- a/include/rocksdb/iterator.h
+++ b/include/rocksdb/iterator.h
@@ -21,6 +21,7 @@
 #include <string>
 
 #include "rocksdb/iterator_base.h"
+#include "rocksdb/options.h"
 #include "rocksdb/wide_columns.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -93,6 +94,22 @@ class Iterator : public IteratorBase {
     assert(false);
     return Slice();
   }
+
+  // Prepare the iterator to scan the ranges specified in scan_opts. This
+  // includes prefetching relevant blocks from disk. The upper bound and
+  // other table specific limits should be specified for each
+  // scan for best results. If an upper bound is not specified, Prepare may
+  // skip prefetching as it cannot accurately determine how much to prefetch.
+  //
+  // Prepare should typically be followed by Seeks to the start keys in the
+  // order they're specified in scan_opts. If the user does a Seek to some
+  // other target key, the iterator should disregard the scan_opts from that
+  // point onwards and behave like a normal iterator. Its the user's
+  // responsibility to again call Prepare().
+  //
+  // If Prepare() is called, it overrides the iterate_upper_bound in
+  // ReadOptions
+  virtual void Prepare(const MultiScanArgs& /*scan_opts*/) {}
 };
 
 // Return an empty iterator (yields nothing).
diff --git a/include/rocksdb/ldb_tool.h b/include/rocksdb/ldb_tool.h
index 7a4c6ca11fbd..623fb1f0b918 100644
--- a/include/rocksdb/ldb_tool.h
+++ b/include/rocksdb/ldb_tool.h
@@ -32,10 +32,18 @@ struct LDBOptions {
 
 class LDBTool {
  public:
+  // DEPRECATED because this function does not return, which can result in
+  // memory leaks being reported because of the default Options() etc. not being
+  // destroyed.
   void Run(
       int argc, char** argv, Options db_options = Options(),
       const LDBOptions& ldb_options = LDBOptions(),
       const std::vector<ColumnFamilyDescriptor>* column_families = nullptr);
+
+  int RunAndReturn(
+      int argc, char** argv, const Options& db_options = Options(),
+      const LDBOptions& ldb_options = LDBOptions(),
+      const std::vector<ColumnFamilyDescriptor>* column_families = nullptr);
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h
index 019f4d40bf60..1b41ca81f3d9 100644
--- a/include/rocksdb/listener.h
+++ b/include/rocksdb/listener.h
@@ -439,6 +439,9 @@ struct CompactionJobInfo {
   // the job id, which is unique in the same thread.
   int job_id;
 
+  // the number of L0 files in the CF right before and after the compaction
+  int num_l0_files;
+
   // the smallest input level of the compaction.
   int base_input_level;
   // the output level of the compaction.
@@ -485,6 +488,9 @@ struct CompactionJobInfo {
   // Information about blob files deleted during compaction in Integrated
   // BlobDB.
   std::vector<BlobFileGarbageInfo> blob_file_garbage_infos;
+
+  // Whether this compaction was aborted via AbortAllCompactions()
+  bool aborted = false;
 };
 
 struct MemTableInfo {
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
index fd63f127f468..00d08562762b 100644
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@@ -38,6 +38,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 
+#include <functional>
 #include <memory>
 #include <stdexcept>
 #include <unordered_set>
@@ -162,6 +163,12 @@ class MemTableRep {
     return true;
   }
 
+  // Only used after concurrent memtable inserts.
+  // This function will be called by each writer after all writes are done
+  // through InsertConcurrently().
+  // This is used by VectorRep to do batched writes for concurrent inserts.
+  virtual void BatchPostProcess() {}
+
   // Returns true iff an entry that compares equal to key is in the collection.
   virtual bool Contains(const char* key) const = 0;
 
@@ -195,11 +202,12 @@ class MemTableRep {
                    bool (*callback_func)(void* arg, const char* entry));
 
   // Same as Get() but performs data integrity validation.
-  virtual Status GetAndValidate(const LookupKey& /* k */,
-                                void* /* callback_args */,
-                                bool (* /* callback_func */)(void* arg,
-                                                             const char* entry),
-                                bool /*allow_data_in_error*/) {
+  virtual Status GetAndValidate(
+      const LookupKey& /* k */, void* /* callback_args */,
+      bool (* /* callback_func */)(void* arg, const char* entry),
+      bool /* allow_data_in_error */, bool /* detect_key_out_of_order */,
+      const std::function<Status(const char*, bool)>&
+      /* key_validation_callback */) {
     return Status::NotSupported("GetAndValidate() not implemented.");
   }
 
@@ -270,9 +278,11 @@ class MemTableRep {
     // Seek and perform integrity validations on the skip list.
     // Iterator becomes invalid and Corruption is returned if a
     // corruption is found.
-    virtual Status SeekAndValidate(const Slice& /* internal_key */,
-                                   const char* /* memtable_key */,
-                                   bool /* allow_data_in_errors */) {
+    virtual Status SeekAndValidate(
+        const Slice& /* internal_key */, const char* /* memtable_key */,
+        bool /* allow_data_in_errors */, bool /* detect_key_out_of_order */,
+        const std::function<Status(const char*, bool)>&
+        /* key_validation_callback */) {
       return Status::NotSupported("SeekAndValidate() not implemented.");
     }
 
@@ -397,6 +407,11 @@ class SkipListFactory : public MemTableRepFactory {
 // the vector is sorted. This is useful for workloads where iteration is very
 // rare and writes are generally not issued after reads begin.
 //
+// Concurrent inserts are supported by buffering writes in thread-local vectors
+// for each write batch. To optimize performance for concurrent inserts, it is
+// recommended to perform batched writes, and enable unordered_write (refer to
+// the option comment for its impact on read consistency).
+//
 // Parameters:
 //   count: Passed to the constructor of the underlying std::vector of each
 //     VectorRep. On initialization, the underlying array will be at least count
@@ -418,6 +433,8 @@ class VectorRepFactory : public MemTableRepFactory {
   MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*,
                                  const SliceTransform*,
                                  Logger* logger) override;
+
+  bool IsInsertConcurrentlySupported() const override { return true; }
 };
 
 // This class contains a fixed array of buckets, each
diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h
index 4ab3842dda80..29e6b6dc575d 100644
--- a/include/rocksdb/metadata.h
+++ b/include/rocksdb/metadata.h
@@ -224,6 +224,20 @@ struct LevelMetaData {
   const std::vector<SstFileMetaData> files;
 };
 
+// Options for filtering column family metadata by key range.
+struct GetColumnFamilyMetaDataOptions {
+  RangeOpt range;
+
+  // The level to filter on. If -1, all levels are included.
+  int level = -1;
+
+  GetColumnFamilyMetaDataOptions() = default;
+
+  GetColumnFamilyMetaDataOptions(const OptSlice& _start_key,
+                                 const OptSlice& _end_key, int _level = -1)
+      : range(_start_key, _end_key), level(_level) {}
+};
+
 // The metadata that describes a column family.
 struct ColumnFamilyMetaData {
   ColumnFamilyMetaData() : size(0), file_count(0), name("") {}
@@ -239,6 +253,9 @@ struct ColumnFamilyMetaData {
   // The name of the column family.
   std::string name;
   // The metadata of all levels in this column family.
+  // levels[i] contains files in level i.
+  // For level 0, files with recent updates are ordered first.
+  // For level 1+, files are ordered by increasing key range.
   std::vector<LevelMetaData> levels;
 
   // The total size of all blob files
diff --git a/include/rocksdb/multi_scan.h b/include/rocksdb/multi_scan.h
new file mode 100644
index 000000000000..4b0917173701
--- /dev/null
+++ b/include/rocksdb/multi_scan.h
@@ -0,0 +1,248 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// EXPERIMENTAL
+//
+// An iterator that returns results from multiple scan ranges. The ranges are
+// expected to be in increasing sorted order.
+// The results are returned in nested container objects that can be iterated
+// using an std::input_iterator.
+//
+// MultiScan
+//     |
+//     ---
+//       |
+//  MultiScanIterator  <-- std::input_iterator (returns a Scan object for each
+//         |                                    scan range)
+//         ---
+//           |
+//          Scan
+//            |
+//            ---
+//              |
+//          ScanIterator <-- std::input_iterator (returns the KVs of a single
+//                                                scan range)
+//
+// The application on top of RocksDB
+// would use this as follows -
+//
+//  std::vector<ScanOptions> scans{{.start = Slice("bar")},
+//                              {.start = Slice("foo")}};
+//  std::unique_ptr<MultiScan> iter.reset(
+//                                      db->NewMultiScan());
+//  try {
+//    for (auto scan : *iter) {
+//      for (auto it : scan) {
+//        // Do something with key - it.first
+//        // Do something with value - it.second
+//      }
+//    }
+//  } catch (MultiScanException& ex) {
+//    // Check ex.status()
+//  } catch (std::logic_error& ex) {
+//    // Check ex.what()
+//  }
+
+class MultiScanException : public std::runtime_error {
+ public:
+  explicit MultiScanException(Status& s)
+      : std::runtime_error(s.ToString()), s_(s) {}
+
+  Status& status() { return s_; }
+
+ private:
+  Status s_;
+};
+
+// A container object encapsulating a single scan range. It supports an
+// std::input_iterator for a single pass iteration of the KVs in the range.
+// A Status exception is thrown if there is an error in scanning the range.
+class Scan {
+ public:
+  class ScanIterator;
+
+  explicit Scan(Iterator* db_iter) : db_iter_(db_iter) {}
+
+  void Reset(Iterator* db_iter) { db_iter_ = db_iter; }
+
+  ScanIterator begin() { return ScanIterator(db_iter_); }
+
+  std::nullptr_t end() { return nullptr; }
+
+  class ScanIterator {
+   public:
+    using self_type = ScanIterator;
+    using value_type = std::pair<Slice, Slice>;
+    using reference = std::pair<Slice, Slice>&;
+    using pointer = std::pair<Slice, Slice>*;
+    using difference_type = int;
+    using iterator_category = std::input_iterator_tag;
+
+    explicit ScanIterator(Iterator* db_iter) : db_iter_(db_iter) {
+      valid_ = db_iter_->Valid();
+      if (valid_) {
+        result_ = value_type(db_iter_->key(), db_iter_->value());
+      }
+    }
+
+    ScanIterator() : db_iter_(nullptr), valid_(false) {}
+
+    ~ScanIterator() {
+      if (!status_.ok()) {
+        fprintf(stderr, "ScanIterator status: %s\n",
+                status_.ToString().c_str());
+        assert(false);
+      }
+    }
+
+    ScanIterator& operator++() {
+      if (!valid_) {
+        throw std::logic_error("Trying to advance invalid iterator");
+      } else {
+        db_iter_->Next();
+        status_ = db_iter_->status();
+        if (!status_.ok()) {
+          throw MultiScanException(status_);
+        } else {
+          valid_ = db_iter_->Valid();
+          if (valid_) {
+            result_ = value_type(db_iter_->key(), db_iter_->value());
+          }
+        }
+      }
+      return *this;
+    }
+
+    bool operator==(std::nullptr_t /*other*/) const { return !valid_; }
+
+    bool operator!=(std::nullptr_t /*other*/) const { return valid_; }
+
+    reference operator*() {
+      if (!valid_) {
+        throw std::logic_error("Trying to deref invalid iterator");
+      }
+      return result_;
+    }
+    reference operator->() {
+      if (!valid_) {
+        throw std::logic_error("Trying to deref invalid iterator");
+      }
+      return result_;
+    }
+
+   private:
+    Iterator* db_iter_;
+    bool valid_;
+    Status status_;
+    value_type result_;
+  };
+
+ private:
+  Iterator* db_iter_;
+};
+
+// A container object encapsulating the scan ranges for a multi scan.
+// It supports an std::input_iterator for a single pass iteration of the
+// ScanOptions in scan_opts, which can be dereferenced to get the container
+// (Scan) for a single range.
+// A Status exception is thrown if there is an error.
+class MultiScan {
+ public:
+  MultiScan(const ReadOptions& read_options, const MultiScanArgs& scan_opts,
+            DB* db, ColumnFamilyHandle* cfh);
+
+  explicit MultiScan(const Comparator* comp,
+                     std::unique_ptr<Iterator>&& db_iter)
+      : scan_opts_(comp), db_iter_(std::move(db_iter)) {}
+
+  class MultiScanIterator {
+   public:
+    MultiScanIterator(const MultiScanIterator&) = delete;
+    MultiScanIterator operator=(MultiScanIterator&) = delete;
+
+    using self_type = MultiScanIterator;
+    using value_type = Scan;
+    using reference = Scan&;
+    using pointer = Scan*;
+    using difference_type = int;
+    using iterator_category = std::input_iterator_tag;
+
+    MultiScanIterator(const std::vector<ScanOptions>& scan_opts, DB* db,
+                      ColumnFamilyHandle* cfh, ReadOptions& read_options,
+                      Slice* upper_bound, std::unique_ptr<Iterator>& db_iter)
+        : scan_opts_(scan_opts),
+          db_(db),
+          cfh_(cfh),
+          read_options_(read_options),
+          upper_bound_(upper_bound),
+          idx_(0),
+          db_iter_(db_iter),
+          scan_(db_iter_.get()) {
+      if (scan_opts_.empty()) {
+        throw std::logic_error("Zero scans in multi-scan");
+      }
+      status_ = db_iter_->status();
+      if (!status_.ok()) {
+        throw MultiScanException(status_);
+      }
+      db_iter_->Seek(*scan_opts_[idx_].range.start);
+      status_ = db_iter_->status();
+      if (!status_.ok()) {
+        throw MultiScanException(status_);
+      }
+    }
+
+    ~MultiScanIterator() { assert(status_.ok()); }
+
+    MultiScanIterator& operator++();
+
+    bool operator==(std::nullptr_t /*other*/) const {
+      return idx_ >= scan_opts_.size();
+    }
+
+    bool operator!=(std::nullptr_t /*other*/) const {
+      return idx_ < scan_opts_.size();
+    }
+
+    reference operator*() { return scan_; }
+    reference operator->() { return scan_; }
+
+   private:
+    const std::vector<ScanOptions>& scan_opts_;
+    DB* db_;
+    ColumnFamilyHandle* cfh_;
+    ReadOptions& read_options_;
+    Slice* upper_bound_;
+    size_t idx_;
+    std::unique_ptr<Iterator>& db_iter_;
+    Status status_;
+    Scan scan_;
+  };
+
+  MultiScanIterator begin() {
+    return MultiScanIterator(scan_opts_.GetScanRanges(), db_, cfh_,
+                             read_options_, &upper_bound_, db_iter_);
+  }
+
+  std::nullptr_t end() { return nullptr; }
+
+ private:
+  ReadOptions read_options_;
+  const MultiScanArgs scan_opts_;
+  DB* db_;
+  ColumnFamilyHandle* cfh_;
+  Slice upper_bound_;
+  std::unique_ptr<Iterator> db_iter_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 796de1fef086..3c0898fdc82b 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -32,7 +32,7 @@
 #include "rocksdb/version.h"
 #include "rocksdb/write_buffer_manager.h"
 
-#ifdef max
+#ifdef max  // ODR-SAFE
 #undef max
 #endif
 
@@ -57,11 +57,15 @@ class Statistics;
 class InternalKeyComparator;
 class WalFilter;
 class FileSystem;
+class UserDefinedIndexFactory;
+class IODispatcher;
 
 struct Options;
 struct DbPath;
 
 using FileTypeSet = SmallEnumSet<FileType, FileType::kBlobFile>;
+using CompactionStyleSet =
+    SmallEnumSet<CompactionStyle, CompactionStyle::kCompactionStyleNone>;
 
 struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
   // The function recovers options to a previous version. Only 4.6 or later
@@ -231,6 +235,14 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
   // different options for compression algorithms
   CompressionOptions compression_opts;
 
+  // EXPERIMENTAL
+  // Customized compression through a callback interface. When non-nullptr,
+  // supersedes the above compression options, except that the above options are
+  // still processed as they historically would be and passed to
+  // CompressionManager::GetCompressorForSST as hints or suggestions. See
+  // advanced_compression.h
+  std::shared_ptr<CompressionManager> compression_manager;
+
   // Number of files to trigger level-0 compaction. A value <0 means that
   // level-0 compaction will not be triggered by number of files at all.
   //
@@ -293,9 +305,6 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   uint64_t max_bytes_for_level_base = 256 * 1048576;
 
-  // Deprecated.
-  uint64_t snap_refresh_nanos = 0;
-
   // Disable automatic compactions. Manual compactions can still
   // be issued on this column family
   //
@@ -454,6 +463,7 @@ extern const char* kHostnameForDbHostId;
 enum class CompactionServiceJobStatus : char {
   kSuccess,
   kFailure,
+  kAborted,
   kUseLocal,
 };
 
@@ -461,6 +471,12 @@ struct CompactionServiceJobInfo {
   std::string db_name;
   std::string db_id;
   std::string db_session_id;
+
+  // the id of the column family where the compaction happened.
+  uint32_t cf_id;
+  // the name of the column family where the compaction happened.
+  std::string cf_name;
+
   uint64_t job_id;  // job_id is only unique within the current DB and session,
                     // restart DB will reset the job_id. `db_id` and
                     // `db_session_id` could help you build unique id across
@@ -474,21 +490,34 @@ struct CompactionServiceJobInfo {
   bool is_manual_compaction;
   bool bottommost_level;
 
+  // the smallest input level of the compaction.
+  // (same as Compaction::start_level and CompactionJobInfo::base_input_level)
+  int base_input_level;
+  // the output level of the compaction.
+  int output_level;
+
+  CompactionServiceJobInfo() {}
   CompactionServiceJobInfo(std::string db_name_, std::string db_id_,
-                           std::string db_session_id_, uint64_t job_id_,
+                           std::string db_session_id_, uint32_t cf_id_,
+                           std::string cf_name_, uint64_t job_id_,
                            Env::Priority priority_,
                            CompactionReason compaction_reason_,
                            bool is_full_compaction_, bool is_manual_compaction_,
-                           bool bottommost_level_)
+                           bool bottommost_level_, int base_input_level_,
+                           int output_level_)
       : db_name(std::move(db_name_)),
         db_id(std::move(db_id_)),
         db_session_id(std::move(db_session_id_)),
+        cf_id(cf_id_),
+        cf_name(std::move(cf_name_)),
         job_id(job_id_),
         priority(priority_),
         compaction_reason(compaction_reason_),
         is_full_compaction(is_full_compaction_),
         is_manual_compaction(is_manual_compaction_),
-        bottommost_level(bottommost_level_) {}
+        bottommost_level(bottommost_level_),
+        base_input_level(base_input_level_),
+        output_level(output_level_) {}
 };
 
 struct CompactionServiceScheduleResponse {
@@ -579,12 +608,20 @@ struct DBOptions {
   // checksums. True also enters a read-only mode when a DB write fails;
   // see DB::Resume().
   //
+  // When set to true, the DB will fail to open if any SST files fail to open
+  // e.g. due to incorrect file size or corrupted footer.
+  //
+  // When set to false, when there are files corrupted, the DB will still be
+  // opened, and the healthy ones could still be accessed, while corrupted one
+  // will not
+  //
   // As most workloads value data correctness over availability, this option
   // is on by default. Note that the name of this old option is potentially
   // misleading, and other options and operations go further in proactive
   // checking for corruption, including
   // * paranoid_file_checks
   // * paranoid_memory_checks
+  // * memtable_veirfy_per_key_checksum_on_seek
   // * DB::VerifyChecksum()
   //
   // Default: true
@@ -593,7 +630,8 @@ struct DBOptions {
   // DEPRECATED: This option might be removed in a future release.
   //
   // If true, during memtable flush, RocksDB will validate total entries
-  // read in flush, and compare with counter inserted into it.
+  // read in flush, total entries written in the SST and compare them with
+  // counter of keys added.
   //
   // The option is here to turn the feature off in case this new validation
   // feature has a bug. The option may be removed in the future once the
@@ -812,6 +850,7 @@ struct DBOptions {
   // If it is non empty, the log files will be in the specified dir,
   // and the db data dir's absolute path will be used as the log file
   // name's prefix.
+  // NOTE: not for WALs
   std::string db_log_dir = "";
 
   // This specifies the absolute dir path for write-ahead logs (WAL).
@@ -892,21 +931,24 @@ struct DBOptions {
   // be created.
   // If max_log_file_size == 0, all logs will be written to one
   // log file.
+  // NOTE: not for WALs
   size_t max_log_file_size = 0;
 
   // Time for the info log file to roll (in seconds).
   // If specified with non-zero value, log file will be rolled
   // if it has been active longer than `log_file_time_to_roll`.
   // Default: 0 (disabled)
+  // NOTE: not for WALs
   size_t log_file_time_to_roll = 0;
 
   // Maximal info log files to be kept.
   // Default: 1000
+  // NOTE: not for WALs
   size_t keep_log_file_num = 1000;
 
-  // Recycle log files.
-  // If non-zero, we will reuse previously written log files for new
-  // logs, overwriting the old data.  The value indicates how many
+  // Recycle WAL files.
+  // If non-zero, we will reuse previously written WAL files for new
+  // WALs, overwriting the old data.  The value indicates how many
   // such files we will keep around at any point in time for later
   // use.  This is more efficient because the blocks are already
   // allocated and fdatasync does not need to update the inode after
@@ -914,12 +956,67 @@ struct DBOptions {
   // Default: 0
   size_t recycle_log_file_num = 0;
 
-  // manifest file is rolled over on reaching this limit.
-  // The older manifest file be deleted.
-  // The default value is 1GB so that the manifest file can grow, but not
-  // reach the limit of storage capacity.
+  // The manifest file is rolled over on reaching this limit AND the
+  // space amp limit described in max_manifest_space_amp_pct. More trade-off
+  // details there.
+  //
+  // NOTE: this option used to be a hard limit, but that made this a dangerous
+  // tuning parameter for optimizing manifest file size because the best
+  // size really depends on the DB size and average SST file size (and other
+  // settings). Now it is essentially a minimum for the auto-tuned max manifest
+  // file size.
+  //
+  // Until the max_manifest_space_amp_pct feature is fully validated to show a
+  // smaller default here like 1MB is appropriate, the default value is 1GB to
+  // match historical behavior (without it being a hard limit in case of giant
+  // compacted manifest size).
+  //
+  // This option is mutable with SetDBOptions(), taking effect on the next
+  // manifest write (e.g. completed DB compaction or flush).
   uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
 
+  // This option mostly replaces max_manifest_file_size to control an auto-tuned
+  // balance of manifest write amplification and space amplification. A new
+  // manifest file is created with the "compacted" contents of the old one when
+  //  current_manifest_size
+  //    >
+  //  max(max_manifest_file_size,
+  //      est_compacted_manifest_size * (1 + max_manifest_space_amp_pct/100))
+  //
+  // where est_compacted_manifest_size is an estimate of how big a new compacted
+  // version of the current manifest would be. Currently, the estimate used is
+  // the last newly-written manifest, in its "compacted" form.
+  //
+  // Space amplification in the manifest file might be less of a concern for
+  // primary storage space and more of a concern for DB recover time and size of
+  // backup files that aren't incremental between backups. To minimize manifest
+  // churn on initial DB population, setting max_manifest_file_size to something
+  // not too small, like 1MB, should suffice. Similarly, write amp on the
+  // manifest file is likely not a direct concern but completed compactions and
+  // flushes cannot (currently) be committed while the (relatively small)
+  // manifest file is being compacted. Manifest compactions should not
+  // interfere with user write latency or throughput unless the DB is
+  // chronically stalling or close to stalling writes already.
+  //
+  // For this option to have a meaningful effect, it is recommended to set
+  // max_manifest_file_size to something modest like 1MB. Then we can interpret
+  // values for this option as follows, starting with minimum space amp and
+  // maximum write amp:
+  // * 0 - Every manifest write (flush, compaction, etc.) generates a whole new
+  // manifest. Only useful for testing.
+  // * very small - Doesn't take many manifest writes to generate a whole new
+  // manifest.
+  // * 100 - In a DB with pretty consistent number of SST files, etc., achieves
+  // about 1.0 write amp (writing about 2x the theoretical minimum) and a max of
+  // about 1.0 space amp (manifest up to 2x the compacted size).
+  // * 500 - Recommended and default: 0.2 write amp and up to roughly 5.0 space
+  // amp.
+  // * 10000 - 0.01 write amp and up to 100 space amp on the manifest.
+  //
+  // This option is mutable with SetDBOptions(), taking effect on the next
+  // manifest write (e.g. completed DB compaction or flush).
+  int max_manifest_space_amp_pct = 500;
+
   // Number of shards used for table cache.
   int table_cache_numshardbits = 6;
 
@@ -1263,16 +1360,6 @@ struct DBOptions {
   // Default: false
   bool skip_stats_update_on_db_open = false;
 
-  // If true, then DB::Open() will not fetch and check sizes of all sst files.
-  // This may significantly speed up startup if there are many sst files,
-  // especially when using non-default Env with expensive GetFileSize().
-  // We'll still check that all required sst files exist.
-  // If paranoid_checks is false, this option is ignored, and sst files are
-  // not checked at all.
-  //
-  // Default: false
-  bool skip_checking_sst_file_sizes_on_db_open = false;
-
   // Recovery mode to control the consistency while replaying WAL
   // Default: kPointInTimeRecovery
   WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
@@ -1294,14 +1381,6 @@ struct DBOptions {
   // currently.
   WalFilter* wal_filter = nullptr;
 
-  // DEPRECATED: This option might be removed in a future release.
-  //
-  // If true, then DB::Open, CreateColumnFamily, DropColumnFamily, and
-  // SetOptions will fail if options file is not properly persisted.
-  //
-  // DEFAULT: true
-  bool fail_if_options_file_error = true;
-
   // If true, then print malloc stats together with rocksdb.stats
   // when printing to LOG.
   // DEFAULT: false
@@ -1325,16 +1404,11 @@ struct DBOptions {
   // Dynamically changeable through SetDBOptions() API.
   bool avoid_flush_during_shutdown = false;
 
-  // Set this option to true during creation of database if you want
-  // to be able to ingest behind (call IngestExternalFile() skipping keys
-  // that already exist, rather than overwriting matching keys).
-  // Setting this option to true has the following effects:
-  // 1) Disable some internal optimizations around SST file compression.
-  // 2) Reserve the last level for ingested files only.
-  // 3) Compaction will not include any file from the last level.
-  // Note that only Universal Compaction supports allow_ingest_behind.
-  // `num_levels` should be >= 3 if this option is turned on.
+  // DEPRECATED: use ColumnFamilyOptions::cf_allow_ingest_behind instead.
+  // This option might be removed in a future release.
   //
+  // See comment for `ColumnFamilyOptions::cf_allow_ingest_behind` for
+  // detail about the option's functionality and use cases.
   //
   // DEFAULT: false
   // Immutable.
@@ -1420,9 +1494,10 @@ struct DBOptions {
   // prefix_same_as_start=true can take advantage of prefix seek optimizations.
   bool prefix_seek_opt_in_only = false;
 
-  // The number of bytes to prefetch when reading the log. This is mostly useful
-  // for reading a remotely located log, as it can save the number of
-  // round-trips. If 0, then the prefetching is disabled.
+  // The number of bytes to prefetch when reading the DB manifest and WAL files
+  // during DB::Open (and variants). This is mostly useful for reading a
+  // remotely located log, as it can save the number of round-trips. If 0, then
+  // the prefetching is disabled.
   //
   // Default: 0
   size_t log_readahead_size = 0;
@@ -1619,6 +1694,24 @@ struct DBOptions {
   // `kUnknown`, this overrides any temperature set by OptimizeForLogWrite
   // functions.
   Temperature wal_write_temperature = Temperature::kUnknown;
+
+  // Enum set indicative of which compaction styles SST write lifetime hint
+  // calculation is allowed on. Today, RocksDB provides native support for
+  // kCompactionStyleLevel and kCompactionStyleUniversal (experimental version).
+  // Other compaction styles, even when enabled in the set, won't have any
+  // effect in the default PosixWritableFile file implementation. There are
+  // numerous benefits coming from employing the hints including reduction in
+  // write amplification caused by OS file movement during garbage collection,
+  // and reduction in wear-leveling (SSDs). However, as currently implemented,
+  // SST write lifetime hints are calculated in a static way and solely based on
+  // the level, which might not be suitable for non-uniform workloads with
+  // dynamic / high-variance lifespan of data within the same level. In those
+  // cases (or when the performance is not satisfactory), it's recommended to
+  // disable the hints by assigning the setting to the empty set (= {});
+  //
+  // Default: Enabled in kCompactionStyleLevel mode.
+  CompactionStyleSet calculate_sst_write_lifetime_hint_set = {
+      CompactionStyle::kCompactionStyleLevel};
   // End EXPERIMENTAL
 };
 
@@ -1682,6 +1775,174 @@ enum ReadTier {
   kMemtableTier = 0x3     // data in memtable. used for memtable-only iterators.
 };
 
+// A range of keys. In case of user_defined timestamp, if enabled, `start` and
+// `limit` should point to key without timestamp part.
+struct Range {
+  Slice start;
+  Slice limit;
+
+  Range() {}
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
+};
+
+// A key range with optional endpoints. In case of user_defined timestamp, if
+// enabled, `start` and `limit` should point to key without timestamp part.
+struct RangeOpt {
+  // When start.has_value() == false, refers to starting before every key
+  OptSlice start;
+  // When limit.has_value() == false, refers to ending after every key
+  OptSlice limit;
+
+  RangeOpt() {}
+  RangeOpt(const OptSlice& s, const OptSlice& l) : start(s), limit(l) {}
+};
+
+// EXPERIMENTAL
+//
+// Options for a RocksDB scan request. Only forward scans for now.
+// We may add other options such as prefix scan in the future.
+struct ScanOptions {
+  // The scan range. Mandatory for start to be set, limit is optional
+  RangeOpt range;
+
+  // A map of name,value pairs that can be passed by the user to an
+  // external table reader. This is completely opaque to RocksDB and is
+  // ignored by the natively supported table readers like block based and plain
+  // table. This is only useful for Iterator.
+  std::optional<std::unordered_map<std::string, std::string>> property_bag;
+
+  // An unbounded scan with a start key
+  explicit ScanOptions(const Slice& _start) : range(_start, OptSlice()) {}
+
+  // A bounded scan with a start key and upper bound
+  ScanOptions(const Slice& _start, const Slice& _upper_bound)
+      : range(_start, _upper_bound) {}
+};
+
+// Container for multiple scan ranges that can be used with MultiScan.
+// This replaces std::vector<ScanOptions> with a more efficient implementation
+// that can merge overlapping ranges.
+class MultiScanArgs {
+ public:
+  // Constructor that takes a comparator
+  explicit MultiScanArgs(const Comparator* comparator) : comp_(comparator) {}
+
+  // Copy Constructor
+  MultiScanArgs(const MultiScanArgs& other) {
+    comp_ = other.comp_;
+    original_ranges_ = other.original_ranges_;
+    io_coalesce_threshold = other.io_coalesce_threshold;
+    max_prefetch_size = other.max_prefetch_size;
+    use_async_io = other.use_async_io;
+    io_dispatcher = other.io_dispatcher;
+  }
+  MultiScanArgs(MultiScanArgs&& other) noexcept
+      : io_coalesce_threshold(other.io_coalesce_threshold),
+        max_prefetch_size(other.max_prefetch_size),
+        use_async_io(other.use_async_io),
+        io_dispatcher(std::move(other.io_dispatcher)),
+        comp_(other.comp_),
+        original_ranges_(std::move(other.original_ranges_)) {}
+
+  MultiScanArgs& operator=(const MultiScanArgs& other) {
+    comp_ = other.comp_;
+    original_ranges_ = other.original_ranges_;
+    io_coalesce_threshold = other.io_coalesce_threshold;
+    max_prefetch_size = other.max_prefetch_size;
+    use_async_io = other.use_async_io;
+    io_dispatcher = other.io_dispatcher;
+    return *this;
+  }
+
+  MultiScanArgs& operator=(MultiScanArgs&& other) noexcept {
+    if (this != &other) {
+      comp_ = other.comp_;
+      original_ranges_ = std::move(other.original_ranges_);
+      io_coalesce_threshold = other.io_coalesce_threshold;
+      max_prefetch_size = other.max_prefetch_size;
+      use_async_io = other.use_async_io;
+      io_dispatcher = std::move(other.io_dispatcher);
+    }
+    return *this;
+  }
+
+  void insert(const Slice& s, const Slice& b) {
+    original_ranges_.emplace_back(s, b);
+  }
+
+  void insert(const Slice& s, const Slice& b,
+              const std::optional<std::unordered_map<std::string, std::string>>&
+                  property_bag) {
+    original_ranges_.emplace_back(s, b);
+    original_ranges_.back().property_bag = property_bag;
+  }
+
+  void insert(const Slice& s) { original_ranges_.emplace_back(s); }
+
+  void insert(const Slice& s,
+              const std::optional<std::unordered_map<std::string, std::string>>&
+                  property_bag) {
+    original_ranges_.emplace_back(s);
+    original_ranges_.back().property_bag = property_bag;
+  }
+
+  size_t size() const { return original_ranges_.size(); }
+  bool empty() const { return original_ranges_.empty(); }
+
+  void reserve(size_t size) { original_ranges_.reserve(size); }
+
+  operator std::vector<ScanOptions>*() { return &original_ranges_; }
+
+  operator const std::vector<ScanOptions>*() const { return &original_ranges_; }
+
+  ~MultiScanArgs() {}
+
+  const std::vector<ScanOptions>& GetScanRanges() const {
+    return original_ranges_;
+  }
+
+  const Comparator* GetComparator() const { return comp_; }
+
+  // Copies the configurations (excluding actual scan ranges) from another
+  // MultiScanArgs.
+  void CopyConfigFrom(const MultiScanArgs& other) {
+    io_coalesce_threshold = other.io_coalesce_threshold;
+    max_prefetch_size = other.max_prefetch_size;
+    use_async_io = other.use_async_io;
+    io_dispatcher = other.io_dispatcher;
+  }
+
+  uint64_t io_coalesce_threshold = 16 << 10;  // 16KB by default
+
+  // Maximum size (in bytes) for the data blocks loaded by a MultiScan.
+  // This limits the amount of I/O and memory usage by pinned data blocks.
+  //
+  // When set to 0 (the default), there is no limit. When the limit is reached,
+  // the iterator will start returning Status::PrefetchLimitReached().
+  //
+  // Note that prefetching happens only once in Prepare(), which is different
+  // from ReadOptions::readahead_size, which applies any time the iterator does
+  // I/O.
+  // Note that this limit is per file and applies to compressed block size.
+  uint64_t max_prefetch_size = 0;
+
+  // Enable async I/O for multi-scan operations
+  // When true, BlockBasedTableIterator will use ReadAsync() for reading blocks
+  // When false, it will use synchronous MultiRead().
+  bool use_async_io = false;
+
+  // Optional IODispatcher for multi-scan operations.
+  // If nullptr (default), a new IODispatcher is created internally.
+  // Users can provide their own IODispatcher for custom IO scheduling
+  // or for testing/monitoring purposes (e.g., to check IO statistics).
+  std::shared_ptr<IODispatcher> io_dispatcher = nullptr;
+
+ private:
+  // The comparator used for ordering ranges
+  const Comparator* comp_;
+  std::vector<ScanOptions> original_ranges_;
+};
+
 // Options that control read operations
 struct ReadOptions {
   // *** BEGIN options relevant to point lookups as well as scans ***
@@ -1763,6 +2024,10 @@ struct ReadOptions {
   // block cache.
   bool fill_cache = true;
 
+  // DEPRECATED: This option might be removed in a future release.
+  // There should be no noticeable performance difference whether this option
+  // is turned on or off when a DB does not use DeleteRange().
+  //
   // If true, range tombstones handling will be skipped in key lookup paths.
   // For DB instances that don't use DeleteRange() calls, this setting can
   // be used to optimize the read performance.
@@ -1841,10 +2106,6 @@ struct ReadOptions {
   // that were inserted into the database after the creation of the iterator.
   bool tailing = false;
 
-  // This options is not used anymore. It was to turn on a functionality that
-  // has been removed. DEPRECATED
-  bool managed = false;
-
   // Enable a total order seek regardless of index format (e.g. hash index)
   // used in the table. Some table format (e.g. plain table) may not support
   // this option.
@@ -1968,6 +2229,17 @@ struct ReadOptions {
   // Default: false
   bool auto_refresh_iterator_with_snapshot = false;
 
+  // EXPERIMENTAL
+  //
+  // Specify an alternate index to use in the SST files instead of the native
+  // block based table index. The table_factory used for the column family
+  // must support building/reading this index.
+  //
+  // Currently, only forward scans are supported. For forward scans, only Seek()
+  // is supported. SeekToFirst() is not supported. If the caller wishes to scan
+  // from start to end, the native index must be used.
+  const UserDefinedIndexFactory* table_index_factory = nullptr;
+
   // *** END options only relevant to iterators or scans ***
 
   // *** BEGIN options for RocksDB internal use only ***
@@ -1975,18 +2247,21 @@ struct ReadOptions {
   // EXPERIMENTAL
   Env::IOActivity io_activity = Env::IOActivity::kUnknown;
 
-  // EXPERIMENTAL
-  // An optional weight of values to be returned by a scan. Once the
-  // weight is reached or exceeded the scan is terminated (i.e Next()
-  // invalidates the iterator). In the case of a DB with one of the built-in
-  // table formats, such as BlockBasedTable, the weight is simply the number
-  // of key-value pairs. In the case of an ExternalTableReader, the weight is
-  // passed through to the table reader and the interpretation is upto the
-  // reader implementation.
-  uint64_t weight = 0;
-
   // *** END options for RocksDB internal use only ***
 
+  // *** BEGIN per-request settings for internal team use only ***
+
+  // TODO: create a new struct for per-request options, potentially including
+  // timestamps in point lookups/scans
+
+  // request_id is a unique id assigned by the application. It is used to allow
+  // us to link file system metrics/logs to rocksDB and application logs. This
+  // request_id may not be unique to each RocksDB api call - it could refer to
+  // an application level request that results in multiple RocksDB api calls
+  const std::string* request_id = nullptr;
+
+  // *** END per-request settings for internal team use only ***
+
   ReadOptions() {}
   ReadOptions(bool _verify_checksums, bool _fill_cache);
   explicit ReadOptions(Env::IOActivity _io_activity);
@@ -2098,6 +2373,23 @@ struct FlushOptions {
   FlushOptions() : wait(true), allow_write_stall(false) {}
 };
 
+struct FlushWALOptions {
+  // If true, it calls `SyncWAL()` afterwards.
+  // Default: false
+  bool sync;
+
+  // For IO operations associated with flushing the WAL, charge the internal
+  // rate limiter (see `DBOptions::rate_limiter`) at the specified priority and
+  // pass the priority down to the file system through
+  // `IOOptions::rate_limiter_priority`. The special value `Env::IO_TOTAL`
+  // disables charging the rate limiter.
+  //
+  // Default: `Env::IO_TOTAL`
+  Env::IOPriority rate_limiter_priority;
+
+  FlushWALOptions() : sync(false), rate_limiter_priority(Env::IO_TOTAL) {}
+};
+
 // Create a Logger from provided DBOptions
 Status CreateLoggerFromOptions(const std::string& dbname,
                                const DBOptions& options,
@@ -2126,10 +2418,31 @@ struct CompactionOptions {
   // If > 0, it will replace the option in the DBOptions for this compaction.
   uint32_t max_subcompactions;
 
+  // Allows cancellation of an in-progress manual compaction.
+  //
+  // Cancellation can be delayed waiting on automatic compactions when used
+  // together with `exclusive_manual_compaction == true`.
+  std::atomic<bool>* canceled;
+  // NOTE: Calling DisableManualCompaction() will not override the
+  // canceled variable in CompactionOptions, as it does for CompactRangeOptions
+  // - this is because ManualCompactionState is not used
+
+  // Create output compaction file using this file temperature. If unset, will
+  // default to "last_level_temperature" if output level is last level otherwise
+  // "default_write_temperature"
+  Temperature output_temperature_override = Temperature::kUnknown;
+
+  // Option to optimize the manual compaction by enabling trivial move for non
+  // overlapping files.
+  // Default: false
+  bool allow_trivial_move;
+
   CompactionOptions()
       : compression(kDisableCompressionOption),
         output_file_size_limit(std::numeric_limits<uint64_t>::max()),
-        max_subcompactions(0) {}
+        max_subcompactions(0),
+        canceled(nullptr),
+        allow_trivial_move(false) {}
 };
 
 // For level based compaction, we can configure if we want to skip/force
@@ -2196,7 +2509,7 @@ struct CompactRangeOptions {
   // Cancellation can be delayed waiting on automatic compactions when used
   // together with `exclusive_manual_compaction == true`.
   std::atomic<bool>* canceled = nullptr;
-  // NOTE: Calling DisableManualCompaction() overwrites the uer-provided
+  // NOTE: Calling DisableManualCompaction() overwrites the user-provided
   // canceled variable in CompactRangeOptions.
   // Typically, when CompactRange is being called in one thread (t1) with
   // canceled = false, and DisableManualCompaction is being called in the
@@ -2218,7 +2531,47 @@ struct CompactRangeOptions {
   double blob_garbage_collection_age_cutoff = -1;
 };
 
-// IngestExternalFileOptions is used by IngestExternalFile()
+// IngestExternalFileOptions setting guide:
+//
+// The options in IngestExternalFileOptions interact in complex ways depending
+// on the source and overlap of SST files. Below is a summary of recommended
+// non-default settings for common use cases:
+//
+// 1. Ingesting only SST writer generated non-overlapping SSTs that are not
+// expected to overlap with existing data:
+//    - Optionally set fail_if_not_bottommost_level = true to enforce placement
+//    in the last level. This is better paird with SST partitioner to guarantee
+//    that there are no existing file with keys across the ingesting key range.
+//    - Set allow_blocking_flush to false: Not expecting to overlap with
+//    memtable and cause a flush.
+//    - If snapshot consistency is not expected, set snapshot_consistency to
+//    false and allow_global_seqno to false. allow_global_seqno = false will
+//    fail ingestion if any input file overlap with each other.
+//
+// 2. Ingesting SST writer generated overlapping SSTs:
+//    - order files with older updates first, newer overwrites later.
+//    - Set allow_global_seqno = true since newer files need to be assigned
+//    larger sequence numbers.
+//
+// 3. Ingesting DB generated SSTs: overlapping with target CF data is not
+// allowed. Input files are allowed to contain both DB generated files and SST
+// file writer generated files. They will all be treated as DB generated.
+//    - Set allow_db_generated_files = true.
+//    - Set snapshot_consistency = false: snapshot consistency requires
+// assigning higher sequence number to ingested files. DB generated files
+// don't support global seqno assignment yet.
+//    - Set allow_blocking_flush to false: Not expecting to overlap with
+//    memtable and cause a flush.
+//    - If the source live DB is running, set link_files = true instead of
+//    move_files.
+// 3a) SST files are non-overlapping and all keys have seqno 0: e.g., a
+// temporary RocksDB instance used to sort some data, and compacts all
+// data into the last level before ingestion.
+//    - Optionally set fail_if_not_bottommost_level = true to enforce placement
+//    in the last level.
+// 3b) SST files are overlapping, e.g. ingesting files from one CF to another.
+//    - Ensure older updates are ordered first and newer updates are ordered
+//    later. See more in option comment for allow_db_generated_files.
 struct IngestExternalFileOptions {
   // Can be set to true to move the files instead of copying them.
   // The input files will be unlinked after successful ingestion.
@@ -2235,20 +2588,33 @@ struct IngestExternalFileOptions {
   // If set to false, an ingested file keys could appear in existing snapshots
   // that where created before the file was ingested.
   bool snapshot_consistency = true;
-  // If set to false, IngestExternalFile() will fail if the file key range
+  // Enables assiging a global sequence number to each ingested file, i.e.,
+  // all keys in the ingested file will be treated as having this seqno.
+  // If set to false, we will use the sequence numbers in the ingested file
+  // as is, and IngestExternalFile() will fail if the ingested key range
   // overlaps with existing keys or tombstones or output of ongoing compaction
-  // during file ingestion in the DB (the conditions under which a global_seqno
-  // must be assigned to the ingested file).
+  // in the CF (the conditions under which a global seqno must be assigned to
+  // the ingested file).
+  // If the ingested files overlap with each other, we need to assign global
+  // sequence to the ingested files and this option needs to be enabled. One
+  // exception to this is when ingesting DB generated SST files (see option
+  // allow_db_generated_files below). DB generated files do not support
+  // global seqno assignment and can be ingested even if they overlap with
+  // each other. This option has no effect when allow_db_generated_files is
+  // enabled.
   bool allow_global_seqno = true;
-  // If set to false and the file key range overlaps with the memtable key range
-  // (memtable flush required), IngestExternalFile will fail.
+  // Normally (true), IngestExternalFile() will trigger and block for flushing
+  // memtable(s) if there is overlap between ingested files and memtable(s). If
+  // allow_blocking_flush is set to false, IngestExternalFile() will fail if the
+  // file key range overlaps with the memtable key range (memtable flush
+  // required).
   bool allow_blocking_flush = true;
   // Set to true if you would like duplicate keys in the file being ingested
   // to be skipped rather than overwriting existing data under that key.
   // Use case: back-fill of some historical data in the database without
   // over-writing existing newer version of data.
-  // This option could only be used if the DB has been running
-  // with allow_ingest_behind=true since the dawn of time.
+  // This option could only be used if the CF has been running
+  // with cf_allow_ingest_behind=true since CF creation (or before any write).
   // All files will be ingested at the bottommost level with seqno=0.
   bool ingest_behind = false;
   // DEPRECATED - Set to true if you would like to write global_seqno to
@@ -2301,18 +2667,53 @@ struct IngestExternalFileOptions {
   //
   // XXX: "bottommost" is obsolete/confusing terminology to refer to last level
   bool fail_if_not_bottommost_level = false;
-  // EXPERIMENTAL
-  // Enables ingestion of files not generated by SstFileWriter. When true:
+  // EXPERIMENTAL, SUBJECT TO CHANGE
+  //
+  // Enables special mode of ingestion that allows files generated by a live DB,
+  // instead of SstFileWriter. When true:
   // - Allows files to be ingested when their cf_id doesn't match the CF they
   //   are being ingested into.
+  // - Allows files with any sequence numbers to be ingested.
+  // - Original sequence numbers are preserved (no reassignment).
+  //
   // REQUIREMENTS:
-  // - Ingested files must not overlap with existing keys.
-  // - `write_global_seqno` must be false.
-  // - All keys in ingested files should have sequence number 0. We fail
-  // ingestion if any sequence numbers is non-zero.
-  // WARNING: If a DB contains ingested files generated by another DB/CF,
-  // RepairDB() may not recover these files correctly, potentially leading to
-  // data loss.
+  // - Ingested files must NOT overlap with any existing data in the DB. Since
+  //   no sequence number reassignment is performed on db generated files.
+  //   Ingestion will fail if any overlap is detected. However, input files
+  //   are allowed to overlap with each other when this option is enabled. This
+  //   is useful when ingesting multiple levels of files from a CF, where
+  //   levels naturally overlap with each other.
+  // - CAUTION: If input files overlap with each other, then for any given user
+  //   key appearing in multiple files, earlier files MUST have smaller sequence
+  //   numbers than later files. Later files will be placed at a higher level
+  //   (smaller level number). This is to ensure the LSM invariant where for
+  //   the same key, recent updates are in higher levels. This means that
+  //   if you are ingesting files from multiple levels of a CF, you should
+  //   put files from lower levels first, and files from higher levels later.
+  //   Example for getting files from a CF for ingestion:
+  //
+  // ColumnFamilyMetaData cf_meta;
+  // from_db->GetColumnFamilyMetaData(from_cf, &cf_meta);
+  // // iterate in reverse to start from lowest level
+  // for (auto level_meta = cf_meta.levels.rbegin();
+  //      level_meta != cf_meta.levels.rend(); ++level_meta) {
+  //   // L0 files need to be added in reverse order so we iterate in reverse
+  //   // within a level too
+  //   for (auto file_meta = level_meta->files.rbegin();
+  //        file_meta != level_meta->files.rend(); ++file_meta) {
+  //     // Add file for ingestion
+  //   }
+  // }
+  //
+  //   WARNING: Violating the sequence number ordering requirement will cause
+  //   LSM invariant violations and may lead to incorrect reads or data
+  //   corruption.
+  // - If you would like to enforce that the ingested files do not overlap
+  //   with each other, you can set `fail_if_not_bottommost_level` to true.
+  //   If ingested files overlap with each other, some file will be placed
+  //   above Lmax, failing the ingestion if the option is set.
+  // - `write_global_seqno` must be false (sequence numbers cannot be
+  //    reassigned).
   bool allow_db_generated_files = false;
 
   // Controls whether data and metadata blocks (e.g. index, filter) read during
@@ -2324,6 +2725,44 @@ struct IngestExternalFileOptions {
   bool fill_cache = true;
 };
 
+// It is valid that files_checksums and files_checksum_func_names are both
+// empty (no checksum information is provided for ingestion). Otherwise,
+// their sizes should be the same as external_files. The file order should
+// be the same in three vectors and guaranteed by the caller.
+// Note that, we assume the temperatures of this batch of files to be
+// ingested are the same.
+struct IngestExternalFileArg {
+  ColumnFamilyHandle* column_family = nullptr;
+  std::vector<std::string> external_files;
+  IngestExternalFileOptions options;
+  std::vector<std::string> files_checksums;
+  std::vector<std::string> files_checksum_func_names;
+  // A hint as to the temperature for *reading* the files to be ingested.
+  Temperature file_temperature = Temperature::kUnknown;
+  // EXPERIMENTAL: When specified, existing keys in the given range will be
+  // cleared atomically as part of the ingestion, where the ingested files are
+  // logically applied on top of the cleared key range.
+  // * If both `start` and `limit` are nullptr, the entire column family is
+  // cleared; however, setting just one bound to nullptr is not yet supported.
+  // * When a range is specified, all the external files in this batch must
+  //   be contained in that key range.
+  // * Checks for memtable overlap and possible blocking flush will apply
+  //   to this range (not just the file ranges).
+  // * Not compatible with ingest_behind=true.
+  // * When options.snapshot_consistency = false, the range is cleared
+  // similarly to DeleteFilesInRange, but fails if any files overlap the range
+  // only partially.
+  //   * It is recommended to use fail_if_not_bottommost_level=true to ensure
+  //     data in the key range is ingested to a single compacted level (the
+  //     last level). (fail_if_not_bottommost_level=false allows overlap between
+  //     the ingested files.)
+  // * options.snapshot_consistency = true is not yet supported.
+  // BUG: the upper bound of the range may be interpreted as inclusive or
+  // exclusive, so it is best not to depend on one or the other until it is
+  // sorted out.
+  std::optional<RangeOpt> atomic_replace_range;
+};
+
 enum TraceFilterType : uint64_t {
   // Trace all the operations
   kTraceFilterNone = 0x0,
@@ -2409,15 +2848,58 @@ struct CompactionServiceOptionsOverride {
   // to set it here.
   std::shared_ptr<Statistics> statistics = nullptr;
 
+  // Info Log. If not overriden, default one will be used.
+  std::shared_ptr<Logger> info_log = nullptr;
+
   // Only compaction generated SST files use this user defined table properties
   // collector.
   std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
       table_properties_collector_factories;
+
+  // All other options to override. Unknown options will be ignored.
+  std::unordered_map<std::string, std::string> options_map;
 };
 
 struct OpenAndCompactOptions {
   // Allows cancellation of an in-progress compaction.
   std::atomic<bool>* canceled = nullptr;
+
+  // EXPERIMENTAL
+  //
+  // Controls whether OpenAndCompact() should attempt to resume from previously
+  // persisted compaction progress or start fresh.
+  //
+  // When `allow_resumption = true`:
+  // - OpenAndCompact() attempts to resume from previously persisted compaction
+  //   progress stored in `output_directory`
+  // - During execution, it periodically persists new progress to the same
+  //   directory, allowing future calls to continue from where the previous
+  //   compaction left off.
+  // - Fallback behavior: If resumption cannot be fulfilled (e.g., due to
+  //   corrupted or missing resume state), the system will attempt to start a
+  //   fresh compaction as a best-effort fallback by cleaning related files in
+  //   the `output_directory` to achieve a clean state. If even the fresh
+  //   compaction cannot be started, a non-OK status will be returned.
+  // - Important: Resume attempts will be ineffective if the underlying
+  //   conditions that caused the previous OpenAndCompact() failure still
+  //   persist. The same non-OK status will likely be returned unless the root
+  //   cause has been resolved.
+  // - Progress persistence is sequential and best-effort, triggered upon
+  //   completion of each new output file. If compaction is interrupted while
+  //   creating an output file (before its completion), that partial work will
+  //   need to be redone upon resumption.
+  //
+  // When `allow_resumption = false`:
+  // - OpenAndCompact() starts a fresh compaction from scratch.
+  // - No progress will be saved during execution, so interruptions require
+  //   starting over completely.
+  // - CRITICAL REQUIREMENT: The `output_directory` associated MUST be empty
+  //   before calling OpenAndCompact(). Any existing files (including resume
+  //   state or output files from previous runs) may cause correctness errors.
+  //
+  // Limitation: Currently incompatible with paranoid_file_checks=true. The
+  // option is effectively disabled when `paranoid_file_checks` is enabled.
+  bool allow_resumption = false;
 };
 
 struct LiveFilesStorageInfoOptions {
diff --git a/include/rocksdb/point_lock_bench_tool.h b/include/rocksdb/point_lock_bench_tool.h
new file mode 100644
index 000000000000..ed6066c43128
--- /dev/null
+++ b/include/rocksdb/point_lock_bench_tool.h
@@ -0,0 +1,14 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+int point_lock_bench_tool(int argc, char** argv);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/rate_limiter.h b/include/rocksdb/rate_limiter.h
index 51383ba20adc..ede742aba6ac 100644
--- a/include/rocksdb/rate_limiter.h
+++ b/include/rocksdb/rate_limiter.h
@@ -132,7 +132,7 @@ class RateLimiter {
   }
 
  protected:
-  Mode GetMode() { return mode_; }
+  Mode GetMode() const { return mode_; }
 
  private:
   const Mode mode_;
diff --git a/include/rocksdb/secondary_cache.h b/include/rocksdb/secondary_cache.h
index e8644c45469f..c7b7b6886efb 100644
--- a/include/rocksdb/secondary_cache.h
+++ b/include/rocksdb/secondary_cache.h
@@ -33,8 +33,8 @@ namespace ROCKSDB_NAMESPACE {
 // Wait() or SecondaryCache::WaitAll() may be skipped if IsReady() happens to
 // return true, but (depending on the implementation) IsReady() might never
 // return true without Wait() or SecondaryCache::WaitAll(). After the handle
-// is known ready, calling Value() is required to avoid a memory leak in case
-// of a cache hit.
+// is known ready, calling Value() and taking ownership is required to avoid
+// a memory leak in case of a cache hit.
 class SecondaryCacheResultHandle {
  public:
   virtual ~SecondaryCacheResultHandle() = default;
diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h
index 0d7eb59499eb..dde34d709d65 100644
--- a/include/rocksdb/slice.h
+++ b/include/rocksdb/slice.h
@@ -20,10 +20,11 @@
 
 #include <cassert>
 #include <cstddef>
+#include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include <string>
-#include <string_view>  // RocksDB now requires C++17 support
+#include <string_view>
 
 #include "rocksdb/cleanable.h"
 
@@ -129,6 +130,46 @@ class Slice {
   // Intentionally copyable
 };
 
+// A likely more efficient alternative to std::optional<Slice>. For example,
+// an empty key might be distinct from "not specified" (and Slice* as an
+// optional is more troublesome to deal with).
+class OptSlice {
+ public:
+  OptSlice() : slice_(nullptr, SIZE_MAX) {}
+  /*implicit*/ OptSlice(const Slice& s) : slice_(s) {}
+  /*implicit*/ OptSlice(const std::string& s) : slice_(s) {}
+  /*implicit*/ OptSlice(const std::string_view& sv) : slice_(sv) {}
+  /*implicit*/ OptSlice(const char* c_str) : slice_(c_str) {}
+  // For easier migrating from APIs uing Slice* as an optional type.
+  // CAUTION: OptSlice{nullptr} is "no value" while Slice{nullptr} is "empty"
+  /*implicit*/ OptSlice(std::nullptr_t) : OptSlice() {}
+
+  bool has_value() const noexcept { return slice_.size() != SIZE_MAX; }
+  explicit operator bool() const noexcept { return has_value(); }
+
+  const Slice& value() const noexcept {
+    assert(has_value());
+    return slice_;
+  }
+  const Slice& operator*() const noexcept { return value(); }
+  const Slice* operator->() const noexcept { return &value(); }
+
+  const Slice* AsPtr() const noexcept {
+    return has_value() ? &slice_ : nullptr;
+  }
+  // Populate from an optional pointer. This is a very explicit conversion
+  // to minimize risk of bugs as in
+  //   Slice start, limit;
+  //   RangeOpt rng = {&start, &limit};
+  //   start = ...;  // BUG: would not affect rng
+  static OptSlice CopyFromPtr(const Slice* ptr) {
+    return ptr ? OptSlice{*ptr} : OptSlice{};
+  }
+
+ protected:
+  Slice slice_;
+};
+
 /**
  * A Slice that can be pinned with some cleanup tasks, which will be run upon
  * ::Reset() or object destruction, whichever is invoked first. This can be used
diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h
index f2515d03ffa2..f1ed46a62c50 100644
--- a/include/rocksdb/slice_transform.h
+++ b/include/rocksdb/slice_transform.h
@@ -8,9 +8,8 @@
 //
 // Class for specifying user-defined functions which perform a
 // transformation on a slice.  It is not required that every slice
-// belong to the domain and/or range of a function.  Subclasses should
-// define InDomain and InRange to determine which slices are in either
-// of these sets respectively.
+// belong to the domain of a function.  Subclasses should
+// define InDomain to determine which slices are in this set.
 
 #pragma once
 
@@ -70,10 +69,6 @@ class SliceTransform : public Customizable {
   //
   virtual bool InDomain(const Slice& key) const = 0;
 
-  // DEPRECATED: This is currently not used and remains here for backward
-  // compatibility.
-  virtual bool InRange(const Slice& /*dst*/) const { return false; }
-
   // Returns information on maximum prefix length, if there is one.
   // If Transform(x).size() == n for some keys and otherwise < n,
   // should return true and set *len = n. Returning false is safe but
diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h
index d893cb1e2afb..607782715a21 100644
--- a/include/rocksdb/sst_file_writer.h
+++ b/include/rocksdb/sst_file_writer.h
@@ -15,12 +15,6 @@
 #include "rocksdb/types.h"
 #include "rocksdb/wide_columns.h"
 
-#if defined(__GNUC__) || defined(__clang__)
-#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
-#elif _WIN32
-#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
-#endif
-
 namespace ROCKSDB_NAMESPACE {
 
 class Comparator;
@@ -88,24 +82,19 @@ class SstFileWriter {
   // hint that this file pages is not needed every time we write 1MB to the
   // file. To use the rate limiter an io_priority smaller than IO_TOTAL can be
   // passed.
-  // The `skip_filters` option is DEPRECATED and could be removed in the
-  // future. Use `BlockBasedTableOptions::filter_policy` to control filter
-  // generation.
   SstFileWriter(const EnvOptions& env_options, const Options& options,
                 ColumnFamilyHandle* column_family = nullptr,
                 bool invalidate_page_cache = true,
-                Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL,
-                bool skip_filters = false)
+                Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL)
       : SstFileWriter(env_options, options, options.comparator, column_family,
-                      invalidate_page_cache, io_priority, skip_filters) {}
+                      invalidate_page_cache, io_priority) {}
 
   // Deprecated API
   SstFileWriter(const EnvOptions& env_options, const Options& options,
                 const Comparator* user_comparator,
                 ColumnFamilyHandle* column_family = nullptr,
                 bool invalidate_page_cache = true,
-                Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL,
-                bool skip_filters = false);
+                Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL);
 
   ~SstFileWriter();
 
@@ -113,12 +102,6 @@ class SstFileWriter {
   Status Open(const std::string& file_path,
               Temperature temp = Temperature::kUnknown);
 
-  // Add a Put key with value to currently opened file (deprecated)
-  // REQUIRES: user_key is after any previously added point (Put/Merge/Delete)
-  //           key according to the comparator.
-  // REQUIRES: comparator is *not* timestamp-aware.
-  ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value);
-
   // Add a Put key with value to currently opened file
   // REQUIRES: user_key is after any previously added point (Put/Merge/Delete)
   //           key according to the comparator.
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index 00b95e8d1fd3..7cecac05f7a1 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -162,6 +162,8 @@ enum Tickers : uint32_t {
   COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
   // If a compaction was canceled in sfm to prevent ENOSPC
   COMPACTION_CANCELLED,
+  // Number of compactions aborted via AbortAllCompactions()
+  COMPACTION_ABORTED,
 
   // Number of keys written to the database via the Put and Write call's
   NUMBER_KEYS_WRITTEN,
@@ -301,7 +303,7 @@ enum Tickers : uint32_t {
   NUMBER_RATE_LIMITER_DRAINS,
 
   // BlobDB specific stats
-  // # of Put/PutTTL/PutUntil to BlobDB. Only applicable to legacy BlobDB.
+  // # of Put/PutWithTTL to BlobDB. Only applicable to legacy BlobDB.
   BLOB_DB_NUM_PUT,
   // # of Write to BlobDB. Only applicable to legacy BlobDB.
   BLOB_DB_NUM_WRITE,
@@ -326,12 +328,12 @@ enum Tickers : uint32_t {
   // # of bytes (keys + value) read from BlobDB. Only applicable to legacy
   // BlobDB.
   BLOB_DB_BYTES_READ,
-  // # of keys written by BlobDB as non-TTL inlined value. Only applicable to
-  // legacy BlobDB.
-  BLOB_DB_WRITE_INLINED,
-  // # of keys written by BlobDB as TTL inlined value. Only applicable to legacy
-  // BlobDB.
-  BLOB_DB_WRITE_INLINED_TTL,
+  // Deprecated: min_blob_size is no longer configurable. Retained to avoid
+  // shifting enum values.
+  BLOB_DB_WRITE_INLINED_DEPRECATED,
+  // Deprecated: min_blob_size is no longer configurable. Retained to avoid
+  // shifting enum values.
+  BLOB_DB_WRITE_INLINED_TTL_DEPRECATED,
   // # of keys written by BlobDB as non-TTL blob value. Only applicable to
   // legacy BlobDB.
   BLOB_DB_WRITE_BLOB,
@@ -440,13 +442,20 @@ enum Tickers : uint32_t {
   REMOTE_COMPACT_READ_BYTES,
   REMOTE_COMPACT_WRITE_BYTES,
 
+  // Bytes of output files successfully resumed during compaction
+  REMOTE_COMPACT_RESUMED_BYTES,
+
   // Tiered storage related statistics
   HOT_FILE_READ_BYTES,
   WARM_FILE_READ_BYTES,
+  COOL_FILE_READ_BYTES,
   COLD_FILE_READ_BYTES,
+  ICE_FILE_READ_BYTES,
   HOT_FILE_READ_COUNT,
   WARM_FILE_READ_COUNT,
+  COOL_FILE_READ_COUNT,
   COLD_FILE_READ_COUNT,
+  ICE_FILE_READ_COUNT,
 
   // Last level and non-last level read statistics
   LAST_LEVEL_READ_BYTES,
@@ -516,14 +525,16 @@ enum Tickers : uint32_t {
   // Number of FIFO compactions that drop files based on different reasons
   FIFO_MAX_SIZE_COMPACTIONS,
   FIFO_TTL_COMPACTIONS,
+  FIFO_CHANGE_TEMPERATURE_COMPACTIONS,
 
   // Number of bytes prefetched during user initiated scan
   PREFETCH_BYTES,
 
-  // Number of prefetched bytes that were actually useful
+  // Number of prefetched bytes that were actually useful during user initiated
+  // scan
   PREFETCH_BYTES_USEFUL,
 
-  // Number of FS reads avoided due to scan prefetching
+  // Number of FS reads avoided due to prefetching during user initiated scan
   PREFETCH_HITS,
 
   // Footer corruption detected when opening an SST file for reading
@@ -534,6 +545,44 @@ enum Tickers : uint32_t {
   FILE_READ_CORRUPTION_RETRY_COUNT,
   FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
 
+  // Counter for the number of times a WBWI is ingested into the DB. This
+  // happens when IngestWriteBatchWithIndex() is used and when large
+  // transaction optimization is enabled through
+  // TransactionOptions::large_txn_commit_optimize_threshold.
+  NUMBER_WBWI_INGEST,
+
+  // Failure to load the UDI during SST table open
+  SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT,
+
+  // MultiScan statistics
+  // # of Prepare() calls
+  MULTISCAN_PREPARE_CALLS,
+  // # of Prepare() calls that failed
+  MULTISCAN_PREPARE_ERRORS,
+  // # of data blocks prefetched from storage during MultiScan
+  MULTISCAN_BLOCKS_PREFETCHED,
+  // # of blocks found already in cache during MultiScan Prepare
+  MULTISCAN_BLOCKS_FROM_CACHE,
+  // Total bytes prefetched during MultiScan
+  MULTISCAN_PREFETCH_BYTES,
+  // # of prefetched blocks that were never accessed
+  MULTISCAN_PREFETCH_BLOCKS_WASTED,
+  // # of actual I/O requests issued during MultiScan
+  MULTISCAN_IO_REQUESTS,
+  // # of non-adjacent blocks coalesced into single I/O (within
+  // io_coalesce_threshold)
+  MULTISCAN_IO_COALESCED_NONADJACENT,
+  // # of seeks that failed validation (out of order, etc.)
+  MULTISCAN_SEEK_ERRORS,
+
+  // IODispatcher memory limiting statistics
+  // # of bytes granted to prefetch requests
+  PREFETCH_MEMORY_BYTES_GRANTED,
+  // # of bytes released from prefetch memory
+  PREFETCH_MEMORY_BYTES_RELEASED,
+  // # of prefetch requests that were blocked waiting for memory
+  PREFETCH_MEMORY_REQUESTS_BLOCKED,
+
   TICKER_ENUM_MAX
 };
 
@@ -612,8 +661,7 @@ enum Histograms : uint32_t {
   BLOB_DB_KEY_SIZE,
   // Size of values written to BlobDB. Only applicable to legacy BlobDB.
   BLOB_DB_VALUE_SIZE,
-  // BlobDB Put/PutWithTTL/PutUntil/Write latency. Only applicable to legacy
-  // BlobDB.
+  // BlobDB Put/PutWithTTL/Write latency. Only applicable to legacy BlobDB.
   BLOB_DB_WRITE_MICROS,
   // BlobDB Get latency. Only applicable to legacy BlobDB.
   BLOB_DB_GET_MICROS,
@@ -657,16 +705,31 @@ enum Histograms : uint32_t {
   ASYNC_READ_BYTES,
   POLL_WAIT_MICROS,
 
+  // Number of bytes for RocksDB's prefetching (as opposed to file
+  // system's prefetch) on SST file during compaction read
+  COMPACTION_PREFETCH_BYTES,
+
   // Number of prefetched bytes discarded by RocksDB.
   PREFETCHED_BYTES_DISCARDED,
 
   // Wait time for aborting async read in FilePrefetchBuffer destructor
   ASYNC_PREFETCH_ABORT_MICROS,
 
-  // Number of bytes read for RocksDB's prefetching contents (as opposed to file
+  // Number of bytes for RocksDB's prefetching contents (as opposed to file
   // system's prefetch) from the end of SST table during block based table open
   TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
 
+  // Number of operations per transaction.
+  NUM_OP_PER_TRANSACTION,
+
+  // MultiScan Prefill iterator Prepare cost
+  MULTISCAN_PREPARE_ITERATORS,
+
+  // Total Prepare() latency for MultiScan
+  MULTISCAN_PREPARE_MICROS,
+  // Distribution of blocks prefetched per MultiScan Prepare()
+  MULTISCAN_BLOCKS_PER_PREPARE,
+
   HISTOGRAM_ENUM_MAX
 };
 
diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h
index 82597239fff7..c3eeb082c3ed 100644
--- a/include/rocksdb/status.h
+++ b/include/rocksdb/status.h
@@ -115,6 +115,9 @@ class Status {
     kIOFenced = 14,
     kMergeOperatorFailed = 15,
     kMergeOperandThresholdExceeded = 16,
+    kPrefetchLimitReached = 17,
+    kNotExpectedCodePath = 18,
+    kCompactionAborted = 19,
     kMaxSubCode
   };
 
@@ -316,12 +319,21 @@ class Status {
     return Status(kInvalidArgument, kTxnNotPrepared, msg, msg2);
   }
 
+  static Status LockLimit() { return Status(kAborted, kLockLimit); }
+
+  static Status PrefetchLimitReached() {
+    return Status(kIncomplete, kPrefetchLimitReached);
+  }
+
   // Returns true iff the status indicates success.
   bool ok() const {
     MarkChecked();
     return code() == kOk;
   }
 
+  // Assert the status is OK in debug mode
+  void AssertOK() const { assert(ok()); }
+
   // Returns true iff the status indicates success *with* something
   // overwritten
   bool IsOkOverwritten() const {
@@ -472,6 +484,13 @@ class Status {
     return (code() == kIncomplete) && (subcode() == kManualCompactionPaused);
   }
 
+  // Returns true iff the status indicates compaction aborted. This
+  // is caused by a call to AbortAllCompactions
+  bool IsCompactionAborted() const {
+    MarkChecked();
+    return (code() == kIncomplete) && (subcode() == kCompactionAborted);
+  }
+
   // Returns true iff the status indicates a TxnNotPrepared error.
   bool IsTxnNotPrepared() const {
     MarkChecked();
@@ -484,6 +503,13 @@ class Status {
     return (code() == kIOError) && (subcode() == kIOFenced);
   }
 
+  // Returns true iff the status indicates prefetch limit reached during
+  // MultiScan.
+  bool IsPrefetchLimitReached() const {
+    MarkChecked();
+    return (code() == kIncomplete) && (subcode() == kPrefetchLimitReached);
+  }
+
   // Return a string representation of this status suitable for printing.
   // Returns the string "OK" for success.
   std::string ToString() const;
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index e1f76fcd4632..3485c41f8079 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -44,6 +44,7 @@ class TableReader;
 class WritableFileWriter;
 struct ConfigOptions;
 struct EnvOptions;
+class UserDefinedIndexFactory;
 
 // Types of checksums to use for checking integrity of logical blocks within
 // files. All checksums currently use 32 bits of checking power (1 in 4B
@@ -126,7 +127,15 @@ struct CacheUsageOptions {
 };
 
 // Configures how SST files using the block-based table format (standard)
-// are written and read.
+// are written and read. With few exceptions, each option only affects either
+// (a) how new SST files are written, or (b) how SST files are read. If an
+// option seems to affect how the SST file is constructed, e.g. format_version,
+// that option *ONLY* has an effect at construction time. Contrast this with
+// options like the various `cache` and `pin` options, that only affect
+// in-memory and IO behavior at read time. In general, any version of RocksDB
+// able to read the full key-value and indexing data in the SST file will read
+// it as written regardless of current options for writing new files. See
+// filter_policy regarding filters.
 //
 // Except as specifically noted, all options here are "mutable" using
 // SetOptions(), with the caveat that only new table builders and new table
@@ -254,6 +263,21 @@ struct BlockBasedTableOptions {
 
   IndexType index_type = kBinarySearch;
 
+  // The search algorithm used when seeking to entries in the index block.
+  enum BlockSearchType : char {
+    // Standard binary search
+    kBinary = 0x00,
+    // Interpolation search, which may be better suited for uniformly
+    // distributed keys. This will only be applicable if the comparator is the
+    // byte-wise comparator. Avoid using
+    // IndexShorteningMode::kShortenSeparatorsAndSuccessor as shortening the
+    // succesor can skew the end key and make interpolation search significantly
+    // less performant.
+    kInterpolation = 0x01,
+  };
+
+  BlockSearchType index_block_search_type = kBinary;
+
   // The index type that will be used for the data block.
   enum DataBlockIndexType : char {
     kDataBlockBinarySearch = 0,   // traditional block type
@@ -431,10 +455,13 @@ struct BlockBasedTableOptions {
   // versions of RocksDB able to read partitioned filters are able to read
   // decoupled partitioned filters.)
   //
-  // decouple_partitioned_filters = false is the original behavior, because of
-  // limitations in the initial implementation, and the new behavior
-  // decouple_partitioned_filters = true is expected to become the new default.
-  bool decouple_partitioned_filters = false;
+  // decouple_partitioned_filters = true is the new default. This option is now
+  // DEPRECATED and might be ignored and/or removed in a future release.
+  //
+  // NOTE: decouple_partitioned_filters = false with partition_filters = true
+  // disables parallel compression (CompressionOptions::parallel_threads
+  // sanitized to 1).
+  bool decouple_partitioned_filters = true;
 
   // Option to generate Bloom/Ribbon filters that minimize memory
   // internal fragmentation.
@@ -480,8 +507,29 @@ struct BlockBasedTableOptions {
   // If non-nullptr, use the specified filter policy to reduce disk reads.
   // Many applications will benefit from passing the result of
   // NewBloomFilterPolicy() here.
+  //
+  // Because filters only impact performance and are not data-critical, an
+  // SST file can be opened and used without filters if (a) the filter
+  // policy name or schema is unrecognized, or (b) filter_policy is nullptr.
+  // See filter_policy regarding filters.
   std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
 
+  // EXPERIMENTAL
+  //
+  // If non-nullptr, use the specified factory to build user-defined index.
+  // This allows users to define their own index format and build the index
+  // during table building.
+  //
+  // NOTE: UserDefinedIndexFactory currently disables parallel compression
+  // (CompressionOptions::parallel_threads sanitized to 1).
+  std::shared_ptr<UserDefinedIndexFactory> user_defined_index_factory = nullptr;
+
+  // EXPERIMENTAL
+  //
+  // Return an error Status if a user_defined_index_factory is configured,
+  // but there's no corresponding UDI block in the SST file being opened.
+  bool fail_if_no_udi_on_open = false;
+
   // If true, place whole keys in the filter (not just prefixes).
   // This must generally be true for gets to be efficient.
   bool whole_key_filtering = true;
@@ -524,13 +572,10 @@ struct BlockBasedTableOptions {
   // Default: 0 (disabled)
   uint32_t read_amp_bytes_per_bit = 0;
 
-  // We currently have these versions:
-  // 0 -- This version can be read by really old RocksDB's. Doesn't support
-  // changing checksum type (default is CRC32).
-  // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
-  // checksum, like xxHash. It is written by RocksDB when
-  // BlockBasedTableOptions::checksum is something other than kCRC32c. (version
-  // 0 is silently upconverted)
+  // We currently have these format versions:
+  // 0 - 1 -- No longer supported. Attempting to read files with these format
+  // versions will return an error. To upgrade, load the data with RocksDB
+  // >= 4.6.0 and < 11.0.0, then run a full compaction.
   // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we
   // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
   // don't plan to run RocksDB before version 3.10, you should probably use
@@ -553,6 +598,10 @@ struct BlockBasedTableOptions {
   // misplaced within or between files is as likely to fail checksum
   // verification as random corruption. Also checksum-protects SST footer.
   // Can be read by RocksDB versions >= 8.6.0.
+  // 7 -- Support for custom compression algorithms with a CompressionManager
+  // using a non-built-in CompatibilityName(). See `compression_manager` in
+  // ColumnFamilyOptions. Also changes the format of TableProperties field
+  // `compression_name`. Can be read by RocksDB versions >= 10.4.0.
   //
   // Using the default setting of format_version is strongly recommended, so
   // that available enhancements are adopted eventually and automatically. The
@@ -560,7 +609,7 @@ struct BlockBasedTableOptions {
   // validation and sufficient time and number of releases have elapsed
   // (6 months recommended) to ensure a clean downgrade/revert path for users
   // who might only upgrade a few times per year.
-  uint32_t format_version = 6;
+  uint32_t format_version = 7;
 
   // Store index blocks on disk in compressed format. Changing this option to
   // false  will avoid the overhead of decompression if index blocks are evicted
@@ -570,6 +619,30 @@ struct BlockBasedTableOptions {
   // Align data blocks on lesser of page size and block size
   bool block_align = false;
 
+  // Align data blocks on super block alignment. Avoid a data block split across
+  // super block boundaries. Works with/without compression.
+  //
+  // Here a "super block" refers to an aligned unit of underlying Filesystem
+  // storage for which there is an extra cost when a random read involves two
+  // such super blocks instead of just one. Configuring that size here suggests
+  // inserting padding in the SST file to avoid a single SST block splitting
+  // across two super blocks. Only power-of-two sizes are supported. See also
+  // super_block_alignment_space_overhead_ratio. Default to 0, which means super
+  // block alignment is disabled.
+  //
+  // Super block alignment size. Default to 0, which means super block alignment
+  // is disabled. If it is enabled, it needs to be a power of 2 and higher than
+  // block size.
+  size_t super_block_alignment_size = 0;
+
+  // This option constrols the storage space overhead of super block alignment.
+  // It is used to calculate the max padding size allowed for super block
+  // alignment. It is calculated in this way. If super_block_alignment_size is
+  // 2MB, and super_block_alignment_overhead_ratio is 128, then the max padding
+  // size allowed for super block alignment is 2MB / 128 = 16KB.
+  // Note that, when it is set to 0, super block alignment is disabled.
+  size_t super_block_alignment_space_overhead_ratio = 128;
+
   // This enum allows trading off increased index size for improved iterator
   // seek performance in some situations, particularly when block cache is
   // disabled (ReadOptions::fill_cache = false) and direct IO is
diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h
index 00e448ba7d7f..6bac922761f9 100644
--- a/include/rocksdb/table_properties.h
+++ b/include/rocksdb/table_properties.h
@@ -76,6 +76,7 @@ struct TablePropertiesNames {
   static const std::string kTailStartOffset;
   static const std::string kUserDefinedTimestampsPersisted;
   static const std::string kKeyLargestSeqno;
+  static const std::string kKeySmallestSeqno;
 };
 
 // `TablePropertiesCollector` provides the mechanism for users to collect
@@ -109,6 +110,10 @@ class TablePropertiesCollector {
   // table.
   // @params key    the user key that is inserted into the table.
   // @params value  the value that is inserted into the table.
+  // @params file_size the current file size. For BlockBasedTable, this
+  //         includes all the data blocks written so far, upto but not including
+  //         the current block being built. With parallel compression, data
+  //         blocks are written async so it depends on the compression progress.
   virtual Status AddUserKey(const Slice& key, const Slice& value,
                             EntryType /*type*/, SequenceNumber /*seq*/,
                             uint64_t /*file_size*/) {
@@ -143,7 +148,7 @@ class TablePropertiesCollector {
   // The name of the properties collector can be used for debugging purpose.
   virtual const char* Name() const = 0;
 
-  // EXPERIMENTAL Return whether the output file should be further compacted
+  // Return whether the output file should be further compacted
   virtual bool NeedCompact() const { return false; }
 
   // For internal use only.
@@ -216,6 +221,8 @@ struct TableProperties {
   uint64_t orig_file_number = 0;
   // the total size of all data blocks.
   uint64_t data_size = 0;
+  // the total uncompressed size of all data blocks (since RocksDB 10.7)
+  uint64_t uncompressed_data_size = 0;
   // the size of index block.
   uint64_t index_size = 0;
   // Total number of index partitions if kTwoLevelIndexSearch is used
@@ -303,6 +310,16 @@ struct TableProperties {
   // table is empty).
   uint64_t key_largest_seqno = UINT64_MAX;
 
+  bool HasKeyLargestSeqno() const { return key_largest_seqno != UINT64_MAX; }
+
+  // The smallest sequence number of keys in this file.
+  // UINT64_MAX means unknown.
+  // Only written to properties block if known (should be known unless the
+  // table is empty).
+  uint64_t key_smallest_seqno = UINT64_MAX;
+
+  bool HasKeySmallestSeqno() const { return key_smallest_seqno != UINT64_MAX; }
+
   // DB identity
   // db_id is an identifier generated the first time the DB is created
   // If DB identity is unset or unassigned, `db_id` will be an empty string.
@@ -344,7 +361,20 @@ struct TableProperties {
   // {collector_name[1]},{collector_name[2]},{collector_name[3]} ..
   std::string property_collectors_names;
 
-  // The compression algo used to compress the SST files.
+  // Identifies the compression algorithm or schema used in the file.
+  // Specifically:
+  // * For format_version < 7, it is one of several names for built-in
+  // compression types. Because of how some previous versions of RocksDB
+  // behave, this must be set to "ZSTD" if any blocks are compressed
+  // with zstd and must NOT be set to "NoCompression" if any blocks are
+  // compressed.
+  // * For format_version >= 7, the format is
+  //   <compatibility_name>;<hex-coded compression types>;<future use>
+  // where <compatibility_name> is the CompatibilityName() of the
+  // CompressionManager used for the file, or empty if compression was
+  // disabled; <hex-coded compression types> represents a sorted set of
+  // CompressionType values used in the file other than kNoCompression, each
+  // as 2-digit hex, e.g. 04 for LZ$, 07 for ZSTD, etc.
   std::string compression_name;
 
   // Compression options used to compress the SST files.
diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h
index 880b0bd4fa20..07c872c0e9b5 100644
--- a/include/rocksdb/thread_status.h
+++ b/include/rocksdb/thread_status.h
@@ -22,24 +22,16 @@
 
 #include "rocksdb/rocksdb_namespace.h"
 
-#if !defined(NROCKSDB_THREAD_STATUS)
-#define ROCKSDB_USING_THREAD_STATUS
-#endif
-
 namespace ROCKSDB_NAMESPACE {
 
-// TODO(yhchiang): remove this function once c++14 is available
-//                 as std::max will be able to cover this.
-// Current MS compiler does not support constexpr
-template <int A, int B>
-struct constexpr_max {
-  static const int result = (A > B) ? A : B;
-};
-
 // A structure that describes the current status of a thread.
 // The status of active threads can be fetched using
 // ROCKSDB_NAMESPACE::GetThreadList().
 struct ThreadStatus {
+  // Whether RocksDB was built with !NROCKSDB_THREAD_STATUS for
+  // ROCKSDB_NAMESPACE::GetThreadList() to be supported.
+  static const bool kEnabled;
+
   // The type of a thread.
   enum ThreadType : int {
     HIGH_PRIORITY = 0,  // RocksDB BG thread in high-pri thread pool
@@ -64,7 +56,7 @@ struct ThreadStatus {
     OP_VERIFY_FILE_CHECKSUMS,
     OP_GETENTITY,
     OP_MULTIGETENTITY,
-    OP_READ_MANIFEST,
+    OP_GET_FILE_CHECKSUMS_FROM_CURRENT_MANIFEST,
     NUM_OP_TYPES
   };
 
@@ -102,8 +94,8 @@ struct ThreadStatus {
 
   // The maximum number of properties of an operation.
   // This number should be set to the biggest NUM_XXX_PROPERTIES.
-  static const int kNumOperationProperties =
-      constexpr_max<NUM_COMPACTION_PROPERTIES, NUM_FLUSH_PROPERTIES>::result;
+  static constexpr int kNumOperationProperties =
+      std::max(int{NUM_COMPACTION_PROPERTIES}, int{NUM_FLUSH_PROPERTIES});
 
   // The type used to refer to a thread state.
   // A state describes lower-level action of a thread
diff --git a/include/rocksdb/tool_hooks.h b/include/rocksdb/tool_hooks.h
index b31780c032f8..a92abde67356 100644
--- a/include/rocksdb/tool_hooks.h
+++ b/include/rocksdb/tool_hooks.h
@@ -30,18 +30,21 @@ class ToolHooks {
   ToolHooks() = default;
   virtual ~ToolHooks() = default;
   virtual Status Open(const Options& db_options, const std::string& name,
-                      DB** dbptr) = 0;
+                      std::unique_ptr<DB>* dbptr) = 0;
   virtual Status Open(
       const DBOptions& db_options, const std::string& name,
       const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) = 0;
+      std::vector<ColumnFamilyHandle*>* handles,
+      std::unique_ptr<DB>* dbptr) = 0;
   virtual Status OpenForReadOnly(const Options& options,
-                                 const std::string& name, DB** dbptr,
+                                 const std::string& name,
+                                 std::unique_ptr<DB>* dbptr,
                                  bool error_if_wal_file_exists) = 0;
   virtual Status OpenForReadOnly(
       const Options& options, const std::string& name,
       const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) = 0;
+      std::vector<ColumnFamilyHandle*>* handles,
+      std::unique_ptr<DB>* dbptr) = 0;
   virtual Status OpenTransactionDB(const Options& db_options,
                                    const TransactionDBOptions& txn_db_options,
                                    const std::string& dbname,
@@ -62,13 +65,14 @@ class ToolHooks {
   virtual Status OpenAsSecondary(const Options& options,
                                  const std::string& name,
                                  const std::string& secondary_path,
-                                 DB** dbptr) = 0;
+                                 std::unique_ptr<DB>* dbptr) = 0;
   virtual Status OpenAsFollower(const Options& options, const std::string& name,
                                 const std::string& leader_path,
                                 std::unique_ptr<DB>* dbptr) = 0;
   virtual Status Open(const Options& options,
                       const blob_db::BlobDBOptions& bdb_options,
                       const std::string& dbname, blob_db::BlobDB** blob_db) = 0;
+  virtual void Exit(int status) = 0;
 };
 
 class DefaultHooks : public ToolHooks {
@@ -76,18 +80,21 @@ class DefaultHooks : public ToolHooks {
   DefaultHooks() = default;
   ~DefaultHooks() override = default;
   virtual Status Open(const Options& db_options, const std::string& name,
-                      DB** dbptr) override;
+                      std::unique_ptr<DB>* dbptr) override;
   virtual Status Open(
       const DBOptions& db_options, const std::string& name,
       const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) override;
+      std::vector<ColumnFamilyHandle*>* handles,
+      std::unique_ptr<DB>* dbptr) override;
   virtual Status OpenForReadOnly(const Options& options,
-                                 const std::string& name, DB** dbptr,
+                                 const std::string& name,
+                                 std::unique_ptr<DB>* dbptr,
                                  bool error_if_wal_file_exists) override;
   virtual Status OpenForReadOnly(
       const Options& options, const std::string& name,
       const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) override;
+      std::vector<ColumnFamilyHandle*>* handles,
+      std::unique_ptr<DB>* dbptr) override;
   virtual Status OpenTransactionDB(const Options& db_options,
                                    const TransactionDBOptions& txn_db_options,
                                    const std::string& dbname,
@@ -109,7 +116,7 @@ class DefaultHooks : public ToolHooks {
   virtual Status OpenAsSecondary(const Options& options,
                                  const std::string& name,
                                  const std::string& secondary_path,
-                                 DB** dbptr) override;
+                                 std::unique_ptr<DB>* dbptr) override;
   virtual Status OpenAsFollower(const Options& options, const std::string& name,
                                 const std::string& leader_path,
                                 std::unique_ptr<DB>* dbptr) override;
@@ -117,6 +124,8 @@ class DefaultHooks : public ToolHooks {
                       const blob_db::BlobDBOptions& bdb_options,
                       const std::string& dbname,
                       blob_db::BlobDB** blob_db) override;
+
+  virtual void Exit(int status) override { exit(status); }
 };
 
 extern DefaultHooks defaultHooks;
diff --git a/include/rocksdb/trace_record.h b/include/rocksdb/trace_record.h
index 8f9c3ee2f0f5..d321f538745d 100644
--- a/include/rocksdb/trace_record.h
+++ b/include/rocksdb/trace_record.h
@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <memory>
 #include <string>
 #include <vector>
diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h
index 368736cbd097..982f497fdf55 100644
--- a/include/rocksdb/types.h
+++ b/include/rocksdb/types.h
@@ -53,7 +53,8 @@ enum FileType {
   kMetaDatabase,
   kIdentityFile,
   kOptionsFile,
-  kBlobFile
+  kBlobFile,
+  kCompactionProgressFile
 };
 
 // User-oriented representation of internal key types.
@@ -118,7 +119,11 @@ enum class Temperature : uint8_t {
   kUnknown = 0,
   kHot = 0x04,
   kWarm = 0x08,
+  kCool = 0x0A,
   kCold = 0x0C,
+  kIce = 0x10,
+  // XXX: this is mis-named. It is instead an invalid temperature beyond the
+  // rest
   kLastTemperature,
 };
 
diff --git a/include/rocksdb/unique_id.h b/include/rocksdb/unique_id.h
index eb0c778266cb..3c0c0eb5b1bf 100644
--- a/include/rocksdb/unique_id.h
+++ b/include/rocksdb/unique_id.h
@@ -33,8 +33,8 @@ namespace ROCKSDB_NAMESPACE {
 // And assuming one generates many SST files in the lifetime of each process,
 // the probability of ID collisions is much "better than random"; see
 // https://github.com/pdillinger/unique_id
-Status GetUniqueIdFromTableProperties(const TableProperties &props,
-                                      std::string *out_id);
+Status GetUniqueIdFromTableProperties(const TableProperties& props,
+                                      std::string* out_id);
 
 // Computes a 192-bit (24 binary char) stable, universally unique ID
 // with an extra 64 bits of uniqueness compared to the standard ID. It is only
@@ -44,12 +44,12 @@ Status GetUniqueIdFromTableProperties(const TableProperties &props,
 // example above would expect a global file ID collision every 4 days with
 // 128-bit IDs (using some worst-case assumptions about process lifetime).
 // It's 10^17 years with 192-bit IDs.
-Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props,
-                                              std::string *out_id);
+Status GetExtendedUniqueIdFromTableProperties(const TableProperties& props,
+                                              std::string* out_id);
 
 // Converts a binary string (unique id) to hexadecimal, with each 64 bits
 // separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B
 // Also works on unique id prefix.
-std::string UniqueIdToHumanString(const std::string &id);
+std::string UniqueIdToHumanString(const std::string& id);
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h
index e40033cae44a..9a52ee539db2 100644
--- a/include/rocksdb/universal_compaction.h
+++ b/include/rocksdb/universal_compaction.h
@@ -111,6 +111,24 @@ class CompactionOptionsUniversal {
   // Default: false
   bool incremental;
 
+  // If true, auto universal compaction picking will adjust to minimize locking
+  // of input files when bottom priority compactions are waiting to run. This
+  // can increase the likelihood of existing L0s being selected for compaction,
+  // thereby improving write stall and reducing read regression. It may increase
+  // the overrall write amplification and compaction load on low priority
+  // threads.
+  //
+  // Default: true (enabled)
+  //
+  // This options does not apply to manual compactions.
+  //
+  // This option is temporary in case turning on this feature causes problems
+  // and users need to undo it quickly. This option is planned for removal in
+  // the near future with default value set to true.
+  //
+  // Dynamically changeable through the SetOptions() API.
+  bool reduce_file_locking;
+
   // Default set of parameters
   CompactionOptionsUniversal()
       : size_ratio(1),
@@ -121,11 +139,10 @@ class CompactionOptionsUniversal {
         max_read_amp(-1),
         stop_style(kCompactionStopStyleTotalSize),
         allow_trivial_move(false),
-        incremental(false) {}
+        incremental(false),
+        reduce_file_locking(true) {}
 
-#if __cplusplus >= 202002L
   bool operator==(const CompactionOptionsUniversal& rhs) const = default;
-#endif
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/user_defined_index.h b/include/rocksdb/user_defined_index.h
new file mode 100644
index 000000000000..395f9fbf3530
--- /dev/null
+++ b/include/rocksdb/user_defined_index.h
@@ -0,0 +1,187 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+//  *****************************************************************
+//  EXPERIMENTAL - subject to change while under development
+//  *****************************************************************
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/advanced_iterator.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Prefix for user-defined index block names
+inline const std::string kUserDefinedIndexPrefix =
+    "rocksdb.user_defined_index.";
+
+// This is a public API for user-defined index builders.
+// It allows users to define their own index format and build custom
+// indexes during table building. Currently, only a monolithic index
+// block is supported (no partitioned index).
+//
+// This is currently supported only for a restricted set of use cases. The
+// CF must be ingest only, and only files containing Puts generated by
+// SstFileWriter are supported.
+
+// The interface for building user-defined index.
+class UserDefinedIndexBuilder {
+ public:
+  // Right now, we only support Puts. In the future, we may support merges,
+  // deletions etc.
+  enum ValueType {
+    kValue,
+    kTypeMax,
+  };
+
+  // File offset and size of the data block
+  struct BlockHandle {
+    uint64_t offset;
+    uint64_t size;
+  };
+
+  virtual ~UserDefinedIndexBuilder() = default;
+
+  // Add a new index entry to index block. The key for the new index entry
+  // should be >= last_key_in_current_block and < first_key_in_next_block.
+  // The previous index entry key and the new index entry key cover
+  // all the keys in the data block associated with the new index entry.
+  //
+  // The last_key_in_current_block and first_key_in_next_block will be user
+  // keys, i.e the user key string, and optionally the user timestamp if one
+  // is configured, without a sequence number suffix.
+  //
+  // Called before the OnKeyAdded() call for first_key_in_next_block.
+  // @last_key_in_current_block: The last key in the current data block
+  // @first_key_in_next_block: it will be nullptr if the entry being added is
+  //                           the last one in the table
+  // @block_handle: offset/size of the data block referenced by this index
+  //                entry. This should be stored along with the index entry
+  //                key
+  // @separator_scratch: a scratch buffer to back a computed separator between
+  //                     those, as needed. May be modified on each call.
+  // @return: the key or separator stored in the index, which could be
+  //          last_key_in_current_block or a computed separator backed by
+  //          separator_scratch.
+  virtual Slice AddIndexEntry(const Slice& last_key_in_current_block,
+                              const Slice* first_key_in_next_block,
+                              const BlockHandle& block_handle,
+                              std::string* separator_scratch) = 0;
+
+  // This method will be called whenever a key is added. The subclasses may
+  // override OnKeyAdded() if they need to collect additional information.
+  // The type argument indicates whether the value is a full value or partial.
+  // At the moment, only full values are supported.
+  //
+  // The key will be a user key. RocksDB guarantees that there will only be
+  // one entry for each key in the file/index.
+  virtual void OnKeyAdded(const Slice& /*key*/, ValueType /*type*/,
+                          const Slice& /*value*/) {}
+
+  // Finish building the index.
+  // Returns a Status and the serialized index contents.
+  // The memory backing the contents should not be freed until this builder
+  // object is destructed.
+  virtual Status Finish(Slice* index_contents) = 0;
+};
+
+// The interface for iterating the user defined index. This will be
+// instantiated and used by a scan to iterate through the index entries
+// covered by the scan.
+class UserDefinedIndexIterator {
+ public:
+  virtual ~UserDefinedIndexIterator() = default;
+
+  // Prepare the iterator for a series of scans. The iterator should use
+  // this as an opportunity to do any prefetching and buffering of results.
+  virtual void Prepare(const ScanOptions scan_opts[], size_t num_opts) = 0;
+
+  // Given the target key, position the index iterator at the index entry
+  // with the smallest key >= target. The result must be updated with the
+  // index key, and the bound_check_result. The bound_check_result should
+  // be set to kOutOfBound if no block satisfies the target key and
+  // termination criteria, kInbound if the data block is definitely fully
+  // within bounds, or kUnknown if the data block could be partially
+  // within bounds.
+  // The UDI implementation needs to be careful about returning kOutOfBound.
+  // If a limit key is specified in ScanOptions, an implementation that
+  // does not store the first key in the block for the corresponding index
+  // entry cannot reliably determine if the block is out of bounds. It must
+  // compare against the previous index key to determine if the current block
+  // is out of bounds w.r.t the limit. Other termination criteria (specified
+  // in property_bag) may cause the scan to terminate earlier, in which case
+  // kOutOfBound can be returned earlier.
+  virtual Status SeekAndGetResult(const Slice& target,
+                                  IterateResult* result) = 0;
+
+  // Advance to the next index entry. The result must be populated similar
+  // to SeekAndGetResult.
+  virtual Status NextAndGetResult(IterateResult* result) = 0;
+
+  // Return the BlockHandle in the current index entry
+  virtual UserDefinedIndexBuilder::BlockHandle value() = 0;
+};
+
+// A reader interface for the user defined index
+class UserDefinedIndexReader {
+ public:
+  virtual ~UserDefinedIndexReader() = default;
+
+  // Allocate an iterator that will be used by RocksDB to perform scans
+  virtual std::unique_ptr<UserDefinedIndexIterator> NewIterator(
+      const ReadOptions& read_options) = 0;
+
+  // The memory usage of the index, including the size of the raw contents and
+  // any other heap data structures allocated by the reader
+  virtual size_t ApproximateMemoryUsage() const = 0;
+};
+
+// Options for user defined index
+struct UserDefinedIndexOption {
+  const Comparator* comparator = BytewiseComparator();
+};
+
+// Factory for creating user-defined index builders.
+class UserDefinedIndexFactory : public Customizable {
+ public:
+  virtual ~UserDefinedIndexFactory() = default;
+
+  static const char* Type() { return "UserDefinedIndexFactory"; }
+
+  static Status CreateFromString(
+      const ConfigOptions& config_options, const std::string& value,
+      std::shared_ptr<UserDefinedIndexFactory>* factory);
+
+  // Create a new builder for user-defined index.
+  virtual UserDefinedIndexBuilder* NewBuilder() const = 0;
+
+  // Create a new user defined index reader given the contents of the index
+  // block
+  virtual std::unique_ptr<UserDefinedIndexReader> NewReader(
+      Slice& index_block) const = 0;
+
+  // New API for allowing customized comparator
+  virtual Status NewBuilder(
+      const UserDefinedIndexOption& /*option*/,
+      std::unique_ptr<UserDefinedIndexBuilder>& builder) const {
+    builder.reset(NewBuilder());
+    return Status::OK();
+  };
+
+  virtual Status NewReader(
+      const UserDefinedIndexOption& /*option*/, Slice& index_block,
+      std::unique_ptr<UserDefinedIndexReader>& reader) const {
+    reader = NewReader(index_block);
+    return Status::OK();
+  };
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/utilities/backup_engine.h b/include/rocksdb/utilities/backup_engine.h
index 045fdb06aa49..1961691be15e 100644
--- a/include/rocksdb/utilities/backup_engine.h
+++ b/include/rocksdb/utilities/backup_engine.h
@@ -621,7 +621,14 @@ class BackupEngineAppendOnlyBase {
   // The backup will stop ASAP and the call to CreateNewBackup will
   // return Status::Incomplete(). It will not clean up after itself, but
   // the state will remain consistent. The state will be cleaned up the
-  // next time you call CreateNewBackup or GarbageCollect.
+  // next time you call CreateNewBackup or GarbageCollect for the same backup
+  // directory on a new BackupEngine object.
+  //
+  // NOTE: This is a one-way operation. Once StopBackup() is called on a
+  // BackupEngine instance, all subsequent backup requests (CreateNewBackup,
+  // CreateNewBackupWithMetadata) will fail with Status::Incomplete().
+  // To create new backups after calling StopBackup(), you must open a new
+  // BackupEngine instance.
   virtual void StopBackup() = 0;
 
   // Will delete any files left over from incomplete creation or deletion of
diff --git a/include/rocksdb/utilities/cache_dump_load.h b/include/rocksdb/utilities/cache_dump_load.h
index 8f41839cd9de..ca2ce5ae11aa 100644
--- a/include/rocksdb/utilities/cache_dump_load.h
+++ b/include/rocksdb/utilities/cache_dump_load.h
@@ -90,7 +90,7 @@ class CacheDumper {
  public:
   virtual ~CacheDumper() = default;
   // Only dump the blocks in the block cache that belong to the DBs in this list
-  virtual Status SetDumpFilter(std::vector<DB*> db_list) {
+  virtual Status SetDumpFilter(const std::vector<DB*>& db_list) {
     (void)db_list;
     return Status::NotSupported("SetDumpFilter is not supported");
   }
diff --git a/include/rocksdb/utilities/db_ttl.h b/include/rocksdb/utilities/db_ttl.h
index 12f5cbac0f75..bccce8ddb14f 100644
--- a/include/rocksdb/utilities/db_ttl.h
+++ b/include/rocksdb/utilities/db_ttl.h
@@ -63,8 +63,10 @@ class DBWithTTL : public StackableDB {
 
   virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0;
 
+  virtual Status GetTtl(ColumnFamilyHandle* h, int32_t* ttl) = 0;
+
  protected:
-  explicit DBWithTTL(DB* db) : StackableDB(db) {}
+  explicit DBWithTTL(std::unique_ptr<DB>&& db) : StackableDB(std::move(db)) {}
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/utilities/debug.h b/include/rocksdb/utilities/debug.h
index 1cbc7daf84cc..57968ad15e10 100644
--- a/include/rocksdb/utilities/debug.h
+++ b/include/rocksdb/utilities/debug.h
@@ -33,12 +33,12 @@ struct KeyVersion {
 // copied to memory, if the range covers too many keys, the memory usage
 // may be huge. `max_num_ikeys` can be used to cap the memory usage.
 // The result is inserted into the provided vector, `key_versions`.
-Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+Status GetAllKeyVersions(DB* db, OptSlice begin_key, OptSlice end_key,
                          size_t max_num_ikeys,
                          std::vector<KeyVersion>* key_versions);
 
-Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
-                         Slice end_key, size_t max_num_ikeys,
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, OptSlice begin_key,
+                         OptSlice end_key, size_t max_num_ikeys,
                          std::vector<KeyVersion>* key_versions);
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/utilities/env_mirror.h b/include/rocksdb/utilities/env_mirror.h
index 40c04095bde9..68cce77dad4e 100644
--- a/include/rocksdb/utilities/env_mirror.h
+++ b/include/rocksdb/utilities/env_mirror.h
@@ -68,7 +68,7 @@ class EnvMirror : public EnvWrapper {
     assert(as == bs);
     return as;
   }
-#if defined(_MSC_VER)
+#if defined(_MSC_VER)  // ODR-SAFE
 #pragma warning(push)
 // logical operation on address of string constant
 #pragma warning(disable : 4130)
@@ -87,7 +87,7 @@ class EnvMirror : public EnvWrapper {
     *r = ar;
     return as;
   }
-#if defined(_MSC_VER)
+#if defined(_MSC_VER)  // ODR-SAFE
 #pragma warning(pop)
 #endif
   Status DeleteFile(const std::string& f) override {
diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h
index e0a1f06a7c8a..aacf9d3e9338 100644
--- a/include/rocksdb/utilities/ldb_cmd.h
+++ b/include/rocksdb/utilities/ldb_cmd.h
@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <functional>
 #include <map>
+#include <memory>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -23,6 +24,7 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/utilities/db_ttl.h"
 #include "rocksdb/utilities/ldb_cmd_execute_result.h"
+#include "rocksdb/utilities/transaction_db.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -42,6 +44,8 @@ class LDBCommand {
   static const std::string ARG_TTL;
   static const std::string ARG_TTL_START;
   static const std::string ARG_TTL_END;
+  static const std::string ARG_USE_TXN;
+  static const std::string ARG_TXN_WRITE_POLICY;
   static const std::string ARG_TIMESTAMP;
   static const std::string ARG_TRY_LOAD_OPTIONS;
   static const std::string ARG_IGNORE_UNKNOWN_OPTIONS;
@@ -71,7 +75,6 @@ class LDBCommand {
   static const std::string ARG_BLOB_FILE_STARTING_LEVEL;
   static const std::string ARG_PREPOPULATE_BLOB_CACHE;
   static const std::string ARG_DECODE_BLOB_INDEX;
-  static const std::string ARG_DUMP_UNCOMPRESSED_BLOBS;
   static const std::string ARG_READ_TIMESTAMP;
   static const std::string ARG_GET_WRITE_UNIX_TIME;
 
@@ -163,8 +166,9 @@ class LDBCommand {
   std::string secondary_path_;
   std::string leader_path_;
   std::string column_family_name_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
   DBWithTTL* db_ttl_;
+  TransactionDB* db_txn_;
   std::map<std::string, ColumnFamilyHandle*> cf_handles_;
   std::map<uint32_t, const Comparator*> ucmps_;
 
@@ -183,6 +187,13 @@ class LDBCommand {
   /** If true, the value is treated as timestamp suffixed */
   bool is_db_ttl_;
 
+  /** If true, open the DB as TransactionDB */
+  bool is_db_txn_;
+
+  /** Transaction write policy (0=WRITE_COMMITTED, 1=WRITE_PREPARED,
+   * 2=WRITE_UNPREPARED) */
+  int txn_write_policy_;
+
   // If true, the kvs are output with their insert/modify timestamp in a ttl db
   bool timestamp_;
 
diff --git a/include/rocksdb/utilities/ldb_cmd_execute_result.h b/include/rocksdb/utilities/ldb_cmd_execute_result.h
index 57bac334682b..2af07eeba55f 100644
--- a/include/rocksdb/utilities/ldb_cmd_execute_result.h
+++ b/include/rocksdb/utilities/ldb_cmd_execute_result.h
@@ -9,10 +9,6 @@
 
 #include "rocksdb/rocksdb_namespace.h"
 
-#ifdef FAILED
-#undef FAILED
-#endif
-
 namespace ROCKSDB_NAMESPACE {
 
 class LDBCommandExecuteResult {
diff --git a/include/rocksdb/utilities/lua/rocks_lua_custom_library.h b/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
deleted file mode 100644
index f617da02bea6..000000000000
--- a/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//  Copyright (c) 2016, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-
-#pragma once
-#ifdef LUA
-
-// lua headers
-extern "C" {
-#include <lauxlib.h>
-#include <lua.h>
-#include <lualib.h>
-}
-
-namespace ROCKSDB_NAMESPACE {
-namespace lua {
-// A class that used to define custom C Library that is callable
-// from Lua script
-class RocksLuaCustomLibrary {
- public:
-  virtual ~RocksLuaCustomLibrary() {}
-  // The name of the C library.  This name will also be used as the table
-  // (namespace) in Lua that contains the C library.
-  virtual const char* Name() const = 0;
-
-  // Returns a "static const struct luaL_Reg[]", which includes a list of
-  // C functions.  Note that the last entry of this static array must be
-  // {nullptr, nullptr} as required by Lua.
-  //
-  // More details about how to implement Lua C libraries can be found
-  // in the official Lua document http://www.lua.org/pil/26.2.html
-  virtual const struct luaL_Reg* Lib() const = 0;
-
-  // A function that will be called right after the library has been created
-  // and pushed on the top of the lua_State.  This custom setup function
-  // allows developers to put additional table or constant values inside
-  // the same table / namespace.
-  virtual void CustomSetup(lua_State* /*L*/) const {}
-};
-}  // namespace lua
-}  // namespace ROCKSDB_NAMESPACE
-#endif  // LUA
diff --git a/include/rocksdb/utilities/lua/rocks_lua_util.h b/include/rocksdb/utilities/lua/rocks_lua_util.h
deleted file mode 100644
index 3427b65ef674..000000000000
--- a/include/rocksdb/utilities/lua/rocks_lua_util.h
+++ /dev/null
@@ -1,55 +0,0 @@
-//  Copyright (c) 2016, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-
-#pragma once
-// lua headers
-extern "C" {
-#include <lauxlib.h>
-#include <lua.h>
-#include <lualib.h>
-}
-
-#ifdef LUA
-#include <string>
-#include <vector>
-
-#include "rocksdb/utilities/lua/rocks_lua_custom_library.h"
-
-namespace ROCKSDB_NAMESPACE {
-namespace lua {
-class LuaStateWrapper {
- public:
-  explicit LuaStateWrapper(const std::string& lua_script) {
-    lua_state_ = luaL_newstate();
-    Init(lua_script, {});
-  }
-  LuaStateWrapper(
-      const std::string& lua_script,
-      const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) {
-    lua_state_ = luaL_newstate();
-    Init(lua_script, libraries);
-  }
-  lua_State* GetLuaState() const { return lua_state_; }
-  ~LuaStateWrapper() { lua_close(lua_state_); }
-
- private:
-  void Init(
-      const std::string& lua_script,
-      const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) {
-    if (lua_state_) {
-      luaL_openlibs(lua_state_);
-      for (const auto& library : libraries) {
-        luaL_openlib(lua_state_, library->Name(), library->Lib(), 0);
-        library->CustomSetup(lua_state_);
-      }
-      luaL_dostring(lua_state_, lua_script.c_str());
-    }
-  }
-
-  lua_State* lua_state_;
-};
-}  // namespace lua
-}  // namespace ROCKSDB_NAMESPACE
-#endif  // LUA
diff --git a/include/rocksdb/utilities/memory_util.h b/include/rocksdb/utilities/memory_util.h
index acebc8b4a655..40d9f5646c46 100644
--- a/include/rocksdb/utilities/memory_util.h
+++ b/include/rocksdb/utilities/memory_util.h
@@ -6,6 +6,7 @@
 #pragma once
 
 #include <map>
+#include <memory>
 #include <string>
 #include <unordered_set>
 #include <vector>
@@ -39,8 +40,11 @@ class MemoryUtil {
   // only report the usage of the input "cache_set" without
   // including those Cache usage inside the input list "dbs"
   // of DBs.
+  //
+  // Supports vectors of DB* or unique_ptr<DB>.
+  template <typename DBPtr>
   static Status GetApproximateMemoryUsageByType(
-      const std::vector<DB*>& dbs,
+      const std::vector<DBPtr>& dbs,
       const std::unordered_set<const Cache*> cache_set,
       std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type);
 };
diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h
index 875a132e408f..eb9f973a82b1 100644
--- a/include/rocksdb/utilities/optimistic_transaction_db.h
+++ b/include/rocksdb/utilities/optimistic_transaction_db.h
@@ -123,7 +123,8 @@ class OptimisticTransactionDB : public StackableDB {
 
  protected:
   // To Create an OptimisticTransactionDB, call Open()
-  explicit OptimisticTransactionDB(DB* db) : StackableDB(db) {}
+  explicit OptimisticTransactionDB(std::unique_ptr<DB>&& db)
+      : StackableDB(std::move(db)) {}
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/utilities/option_change_migration.h b/include/rocksdb/utilities/option_change_migration.h
index 0ad00cc860e3..5c13329dc130 100644
--- a/include/rocksdb/utilities/option_change_migration.h
+++ b/include/rocksdb/utilities/option_change_migration.h
@@ -6,19 +6,47 @@
 #pragma once
 
 #include <string>
+#include <vector>
 
+#include "rocksdb/db.h"
 #include "rocksdb/options.h"
 #include "rocksdb/status.h"
 
 namespace ROCKSDB_NAMESPACE {
-// Try to migrate DB created with old_opts to be use new_opts.
-// Multiple column families is not supported.
-// It is best-effort. No guarantee to succeed.
-// A full compaction may be executed.
+// Prepares a database to be compatible with new_opts after using old_opts.
+// Restructures the LSM tree but does NOT apply new_opts - you must call
+// DB::Open(new_opts, dbname) afterward to actually use the new configuration.
+// It is best-effort with no guarantee to succeed. A full compaction may be
+// executed.
+//
+// Limitations: single column family only
+//
 // WARNING: using this to migrate from non-FIFO to FIFO compaction
 // with `Options::compaction_options_fifo.max_table_files_size` > 0 can cause
 // the whole DB to be dropped right after migration if the migrated data is
 // larger than `max_table_files_size`
-Status OptionChangeMigration(std::string dbname, const Options& old_opts,
+Status OptionChangeMigration(const std::string& dbname, const Options& old_opts,
                              const Options& new_opts);
+
+// Multi-CF version: Prepares a database with multiple column families to be
+// compatible with new options after using old options.
+//
+// REQUIREMENTS:
+// - old_cf_descs and new_cf_descs MUST have the same number of CFs
+// - old_cf_descs and new_cf_descs MUST have the same CF names IN THE SAME ORDER
+// - Adding or dropping CFs is NOT supported - use CreateColumnFamily() or
+//   DropColumnFamily() separately before/after migration
+//
+// The function will return InvalidArgument status if these requirements are
+// violated.
+//
+// WARNING: using this to migrate from non-FIFO to FIFO compaction
+// with `max_table_files_size` > 0 can cause the whole DB to be dropped right
+// after migration if the migrated data is larger than `max_table_files_size`
+Status OptionChangeMigration(
+    const std::string& dbname, const DBOptions& old_db_opts,
+    const std::vector<ColumnFamilyDescriptor>& old_cf_descs,
+    const DBOptions& new_db_opts,
+    const std::vector<ColumnFamilyDescriptor>& new_cf_descs);
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index 244989a6c98e..de43ba386282 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -289,6 +289,13 @@ class StackableDB : public DB {
     return db_->NewAttributeGroupIterator(options, column_families);
   }
 
+  using DB::NewMultiScan;
+  std::unique_ptr<MultiScan> NewMultiScan(
+      const ReadOptions& opts, ColumnFamilyHandle* column_family,
+      const MultiScanArgs& scan_opts) override {
+    return db_->NewMultiScan(opts, column_family, scan_opts);
+  }
+
   const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); }
 
   void ReleaseSnapshot(const Snapshot* snapshot) override {
@@ -368,6 +375,8 @@ class StackableDB : public DB {
   void DisableManualCompaction() override {
     return db_->DisableManualCompaction();
   }
+  void AbortAllCompactions() override { return db_->AbortAllCompactions(); }
+  void ResumeAllCompactions() override { return db_->ResumeAllCompactions(); }
 
   Status WaitForCompact(
       const WaitForCompactOptions& wait_for_compact_options) override {
@@ -379,11 +388,6 @@ class StackableDB : public DB {
     return db_->NumberLevels(column_family);
   }
 
-  using DB::MaxMemCompactionLevel;
-  int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override {
-    return db_->MaxMemCompactionLevel(column_family);
-  }
-
   using DB::Level0StopWriteTrigger;
   int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) override {
     return db_->Level0StopWriteTrigger(column_family);
@@ -416,7 +420,11 @@ class StackableDB : public DB {
 
   Status SyncWAL() override { return db_->SyncWAL(); }
 
+  using DB::FlushWAL;
   Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); }
+  Status FlushWAL(const FlushWALOptions& options) override {
+    return db_->FlushWAL(options);
+  }
 
   Status LockWAL() override { return db_->LockWAL(); }
 
@@ -445,6 +453,12 @@ class StackableDB : public DB {
     db_->GetColumnFamilyMetaData(column_family, cf_meta);
   }
 
+  void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+                               const GetColumnFamilyMetaDataOptions& options,
+                               ColumnFamilyMetaData* metadata) override {
+    db_->GetColumnFamilyMetaData(column_family, options, metadata);
+  }
+
   using DB::StartBlockCacheTrace;
   Status StartBlockCacheTrace(
       const TraceOptions& trace_options,
@@ -505,13 +519,18 @@ class StackableDB : public DB {
     return db_->GetFullHistoryTsLow(column_family, ts_low);
   }
 
+  Status GetNewestUserDefinedTimestamp(ColumnFamilyHandle* column_family,
+                                       std::string* newest_timestamp) override {
+    return db_->GetNewestUserDefinedTimestamp(column_family, newest_timestamp);
+  }
+
   Status GetSortedWalFiles(VectorWalPtr& files) override {
     return db_->GetSortedWalFiles(files);
   }
 
   Status GetCurrentWalFile(
-      std::unique_ptr<WalFile>* current_log_file) override {
-    return db_->GetCurrentWalFile(current_log_file);
+      std::unique_ptr<WalFile>* current_wal_file) override {
+    return db_->GetCurrentWalFile(current_wal_file);
   }
 
   Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override {
@@ -527,10 +546,11 @@ class StackableDB : public DB {
   }
 
   using DB::SetOptions;
-  Status SetOptions(ColumnFamilyHandle* column_family_handle,
-                    const std::unordered_map<std::string, std::string>&
-                        new_options) override {
-    return db_->SetOptions(column_family_handle, new_options);
+  Status SetOptions(
+      const std::unordered_map<ColumnFamilyHandle*,
+                               std::unordered_map<std::string, std::string>>&
+          column_families_opts_map) override {
+    return db_->SetOptions(column_families_opts_map);
   }
 
   Status SetDBOptions(const std::unordered_map<std::string, std::string>&
@@ -554,6 +574,14 @@ class StackableDB : public DB {
     return db_->GetPropertiesOfTablesInRange(column_family, range, n, props);
   }
 
+  using DB::GetPropertiesOfTablesByLevel;
+  Status GetPropertiesOfTablesByLevel(
+      ColumnFamilyHandle* column_family,
+      std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level)
+      override {
+    return db_->GetPropertiesOfTablesByLevel(column_family, props_by_level);
+  }
+
   Status GetUpdatesSince(
       SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
       const TransactionLogIterator::ReadOptions& read_options) override {
diff --git a/include/rocksdb/utilities/table_properties_collectors.h b/include/rocksdb/utilities/table_properties_collectors.h
index 0f79f725e5d8..c8c8af1de6a8 100644
--- a/include/rocksdb/utilities/table_properties_collectors.h
+++ b/include/rocksdb/utilities/table_properties_collectors.h
@@ -23,15 +23,20 @@ class CompactOnDeletionCollectorFactory
   // A factory of a table property collector that marks a SST
   // file as need-compaction when it observe at least "D" deletion
   // entries in any "N" consecutive entries, or the ratio of tombstone
-  // entries >= deletion_ratio.
+  // entries >= deletion_ratio for the entire file.
   //
   // @param sliding_window_size "N"
   // @param deletion_trigger "D"
   // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction
   //     based on deletion ratio.
+  // @param min_file_size, a file needs to be at least this size to be marked
+  //     for compaction. See comments above
+  //     TablePropertiesCollector::AddUserKey() for limitations/inaccuracies on
+  //     the file size.
   CompactOnDeletionCollectorFactory(size_t sliding_window_size,
                                     size_t deletion_trigger,
-                                    double deletion_ratio);
+                                    double deletion_ratio,
+                                    uint64_t min_file_size = 0);
 
   ~CompactOnDeletionCollectorFactory() override {}
 
@@ -59,6 +64,12 @@ class CompactOnDeletionCollectorFactory
   }
 
   double GetDeletionRatio() const { return deletion_ratio_.load(); }
+
+  uint64_t GetMinFileSize() const { return min_file_size_.load(); }
+  void SetMinFileSize(uint64_t min_file_size) {
+    min_file_size_.store(min_file_size);
+  }
+
   static const char* kClassName() { return "CompactOnDeletionCollector"; }
   const char* Name() const override { return kClassName(); }
 
@@ -68,6 +79,7 @@ class CompactOnDeletionCollectorFactory
   std::atomic<size_t> sliding_window_size_;
   std::atomic<size_t> deletion_trigger_;
   std::atomic<double> deletion_ratio_;
+  std::atomic<uint64_t> min_file_size_;
 };
 
 // Creates a factory of a table property collector that marks a SST
@@ -85,7 +97,8 @@ class CompactOnDeletionCollectorFactory
 std::shared_ptr<CompactOnDeletionCollectorFactory>
 NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
                                      size_t deletion_trigger,
-                                     double deletion_ratio = 0);
+                                     double deletion_ratio = 0,
+                                     uint64_t min_file_size = 0);
 
 // A factory of a table property collector that marks a SST file as
 // need-compaction when for the tiering use case, it observes, among all the
diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h
index 6c444ac26df5..51b4eb026211 100644
--- a/include/rocksdb/utilities/transaction.h
+++ b/include/rocksdb/utilities/transaction.h
@@ -653,7 +653,12 @@ class Transaction {
   // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
   // this transaction.
   // Has no effect on OptimisticTransactions.
-  virtual void SetLockTimeout(int64_t timeout) = 0;
+  virtual void SetLockTimeout(int64_t timeout_ms) = 0;
+
+  // Change the value of deadlock_timeout (in milliseconds) for this
+  // transaction.
+  // Has no effect on OptimisticTransactions.
+  virtual void SetDeadlockTimeout(int64_t timeout_ms) = 0;
 
   // Return the WriteOptions that will be used during Commit()
   virtual WriteOptions* GetWriteOptions() = 0;
diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h
index 766fe75917c5..e0af0caa0bd1 100644
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@@ -24,9 +24,16 @@ class SecondaryIndex;
 class TransactionDBMutexFactory;
 
 enum TxnDBWritePolicy {
-  WRITE_COMMITTED = 0,  // write only the committed data
-  WRITE_PREPARED,       // write data after the prepare phase of 2pc
-  WRITE_UNPREPARED      // write data before the prepare phase of 2pc
+  // Write data at transaction commit time
+  WRITE_COMMITTED = 0,
+
+  // EXPERIMENTAL: The remaining write policies are not as mature, well
+  // validated, nor as compatible with other features as WRITE_COMMITTED.
+
+  // Write data after the prepare phase of 2pc
+  WRITE_PREPARED,
+  // Write data before the prepare phase of 2pc
+  WRITE_UNPREPARED
 };
 
 constexpr uint32_t kInitialMaxDeadlocks = 5;
@@ -210,6 +217,11 @@ struct TransactionDBOptions {
   // Other value means the user provides a custom lock manager.
   std::shared_ptr<LockManagerHandle> lock_mgr_handle;
 
+  // EXPERIMENTAL
+  //
+  // Flag to enable/disable the per key point lock manager.
+  bool use_per_key_point_lock_mgr = false;
+
   // If true, the TransactionDB implementation might skip concurrency control
   // unless it is overridden by TransactionOptions or
   // TransactionDBWriteOptimizations. This can be used in conjunction with
@@ -247,10 +259,12 @@ struct TransactionDBOptions {
   // for more details.
   std::vector<std::shared_ptr<SecondaryIndex>> secondary_indices;
 
-  // EXPERIMENTAL, SUBJECT TO CHANGE
+  // Deprecated, this option has no effect and may be removed in the future.
+  // Use TransactionOptions::large_txn_commit_optimize_threshold instead.
+  //
   // This option is only valid for write committed. If the number of updates in
-  // a transaction exceeds this threshold, then the transaction commit will skip
-  // insertions into memtable as an optimization to reduce commit latency.
+  // a transaction is at least this threshold, then the transaction commit will
+  // skip insertions into memtable as an optimization to reduce commit latency.
   // See comment for TransactionOptions::commit_bypass_memtable for more detail.
   // Setting TransactionOptions::commit_bypass_memtable to true takes precedence
   // over this option.
@@ -310,6 +324,22 @@ struct TransactionOptions {
   // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
   int64_t lock_timeout = -1;
 
+  // Timeout in microseconds before perform dead lock detection.
+  // If 0, deadlock detection will be performed immediately.
+  //
+  // To optimize performance, this parameter could be tuned.
+  //
+  // When deadlock happens very frequently, deadlock timeout should be set to 0,
+  // so deadlock will be detected immediately.
+  //
+  // When deadlock happen very rarely, this timeout could be turned to be
+  // slightly longer than the typical transaction execution time, so that
+  // transaction will be waked up to take the lock before this timeout, which
+  // will allow the transaction to save the CPU time on deadlock detection.
+  //
+  // Deadlock timeout is always smaller than lock_timeout.
+  int64_t deadlock_timeout_us = 500;
+
   // Expiration duration in milliseconds.  If non-negative, transactions that
   // last longer than this many milliseconds will fail to commit.  If not set,
   // a forgotten transaction that is never committed, rolled back, or deleted
@@ -357,10 +387,28 @@ struct TransactionOptions {
   // DeleteRange, SingleDelete.
   bool write_batch_track_timestamp_size = false;
 
+  // The following three options enable optimizations for large transaction
+  // commit to bypass memtable write.
+  // - If any transaction's commit should bybass memtable write,
+  //  set commit_bypass_memtable to true.
+  // - If only bypass memtable write for transactions with >= n operations,
+  //  set commit_bypass_memtable to false,
+  //  large_txn_commit_optimize_threshold to n, and
+  //  large_txn_commit_optimize_byte_threshold to 0.
+  //  Similarly for only optimize when a transaction's write batch size is >= n.
+  // - If bypass memtable write for transactions with >= n operations or >= x
+  // bytes,
+  //  set commit_bypass_memtable to false,
+  //  large_txn_commit_optimize_threshold to n, and
+  //  large_txn_commit_optimize_byte_threshold to x.
+  //
+  //
   // EXPERIMENTAL, SUBJECT TO CHANGE
   // Only supports write-committed policy. If set to true, the transaction will
   // skip memtable write and ingest into the DB directly during Commit(). This
   // makes Commit() much faster for transactions with many operations.
+  // Transaction neeeds to call Prepare() before Commit() for this option to
+  // take effect.
   // Transactions with Merge() or PutEntity() is not supported yet.
   //
   // Note that the transaction will be ingested as an immutable memtable for
@@ -369,15 +417,31 @@ struct TransactionOptions {
   // due to too many memtables.
   // Note that the ingestion relies on the transaction's underlying index,
   // (WriteBatchWithIndex), so updates that are added to the transaction
-  // without indexing (e.g. added directly to the transaction underlying
+  // without indexing (i.e. added directly to the transaction underlying
   // write batch through Transaction::GetWriteBatch()->GetWriteBatch())
-  // are not supported. They will not be applied to the DB.
+  // are not supported, and the optimization will not apply in that case.
   //
   // NOTE: since WBWI keep track of the most recent update per key, a Put
   // followed by a SingleDelete will be written to DB as a SingleDelete. This
   // can cause flush/compaction to report `num_single_del_mismatch` due to
   // consecutive SingleDeletes.
   bool commit_bypass_memtable = false;
+
+  // EXPERIMENTAL, SUBJECT TO CHANGE
+  // When the number of updates in a transaction is at least this threshold,
+  // we will enable optimizations for commiting a large transaction. See
+  // comment for `commit_bypass_memtable` for more optimization detail.
+  //
+  // Default: 0 (disabled).
+  uint32_t large_txn_commit_optimize_threshold = 0;
+
+  // EXPERIMENTAL, SUBJECT TO CHANGE
+  // When the size of a transaction's write batch is at least this threshold,
+  // we will enable optimizations for commiting a large transaction. See
+  // comment for `commit_bypass_memtable` for more optimization detail.
+  //
+  // Default: 0 (disabled).
+  uint64_t large_txn_commit_optimize_byte_threshold = 0;
 };
 
 // The per-write optimizations that do not involve transactions. TransactionDB
diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h
index 6ff8b587099d..edced15b9ec7 100644
--- a/include/rocksdb/utilities/write_batch_with_index.h
+++ b/include/rocksdb/utilities/write_batch_with_index.h
@@ -90,6 +90,8 @@ class WBWIIterator {
   // Returns n where the current entry is the n-th update to the current key.
   // The update count starts from 1.
   // Only valid if WBWI is created with overwrite_key = true.
+  // With overwrite_key=false, update count for each entry is not maintained,
+  // see UpdateExistingEntryWithCfId().
   virtual uint32_t GetUpdateCount() const { return 0; }
 };
 
@@ -234,7 +236,8 @@ class WriteBatchWithIndex : public WriteBatchBase {
                                 Iterator* base_iterator,
                                 const ReadOptions* opts = nullptr);
   // default column family
-  Iterator* NewIteratorWithBase(Iterator* base_iterator);
+  Iterator* NewIteratorWithBase(Iterator* base_iterator,
+                                const ReadOptions* opts = nullptr);
 
   // Similar to DB::Get() but will only read the key from this batch.
   // If the batch does not have enough data to resolve Merge operations,
@@ -374,11 +377,10 @@ class WriteBatchWithIndex : public WriteBatchBase {
     uint32_t entry_count = 0;
     uint32_t overwritten_sd_count = 0;
   };
-  // Will track CF ID, per CF entry count and overwritten sd count.
-  // Should be enabled when WBWI is empty for correct tracking.
-  void SetTrackPerCFStat(bool track);
   const std::unordered_map<uint32_t, CFStat>& GetCFStats() const;
 
+  // The total number of operations issued into this WBWI.
+  size_t GetWBWIOpCount() const;
   bool GetOverwriteKey() const;
 
  private:
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 104a6483dc5c..5fe307d19af8 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -11,16 +11,21 @@
 
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
-#define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 1
+#define ROCKSDB_MAJOR 11
+#define ROCKSDB_MINOR 0
 #define ROCKSDB_PATCH 0
 
-// Do not use these. We made the mistake of declaring macros starting with
-// double underscore. Now we have to live with our choice. We'll deprecate these
-// at some point
-#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR
-#define __ROCKSDB_MINOR__ ROCKSDB_MINOR
-#define __ROCKSDB_PATCH__ ROCKSDB_PATCH
+// Make it easy to do conditional compilation based on version checks, i.e.
+// #if ROCKSDB_VERSION_GE(4, 5, 6)
+// int thisCoderequiresVersion_4_5_6_OrGreater;
+// #else
+// int thisCodeIsForOlderVersions;
+// #endif
+#define ROCKSDB_MAKE_VERSION_INT(a, b, c) ((a) * 1000000 + (b) * 1000 + (c))
+#define ROCKSDB_VERSION_INT \
+  ROCKSDB_MAKE_VERSION_INT(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH)
+#define ROCKSDB_VERSION_GE(a, b, c) \
+  (ROCKSDB_VERSION_INT >= ROCKSDB_MAKE_VERSION_INT(a, b, c))
 
 namespace ROCKSDB_NAMESPACE {
 // Returns a set of properties indicating how/when/where this version of RocksDB
diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt
index a60847ead37d..5dd7be6cd1e4 100644
--- a/java/CMakeLists.txt
+++ b/java/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.4)
+cmake_minimum_required(VERSION 3.11)
 
 set(JAVA_JUNIT_VERSION "4.13.1")
 set(JAVA_HAMCR_VERSION "2.2")
@@ -182,6 +182,7 @@ set(JAVA_MAIN_CLASSES
   src/main/java/org/rocksdb/HyperClockCache.java
   src/main/java/org/rocksdb/ImportColumnFamilyOptions.java
   src/main/java/org/rocksdb/IndexShorteningMode.java
+  src/main/java/org/rocksdb/IndexSearchType.java
   src/main/java/org/rocksdb/IndexType.java
   src/main/java/org/rocksdb/InfoLogLevel.java
   src/main/java/org/rocksdb/IngestExternalFileOptions.java
diff --git a/java/rocksjni/compaction_options_fifo.cc b/java/rocksjni/compaction_options_fifo.cc
index 535562fb47f7..f23eee6c3d2a 100644
--- a/java/rocksjni/compaction_options_fifo.cc
+++ b/java/rocksjni/compaction_options_fifo.cc
@@ -71,6 +71,54 @@ jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction(JNIEnv*, jclass,
   return static_cast<jboolean>(opt->allow_compaction);
 }
 
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    setMaxDataFilesSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_CompactionOptionsFIFO_setMaxDataFilesSize(
+    JNIEnv*, jclass, jlong jhandle, jlong jmax_data_files_size) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  opt->max_data_files_size = static_cast<uint64_t>(jmax_data_files_size);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    maxDataFilesSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionOptionsFIFO_maxDataFilesSize(JNIEnv*, jclass,
+                                                              jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  return static_cast<jlong>(opt->max_data_files_size);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    setUseKvRatioCompaction
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_CompactionOptionsFIFO_setUseKvRatioCompaction(
+    JNIEnv*, jclass, jlong jhandle, jboolean use_kv_ratio_compaction) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  opt->use_kv_ratio_compaction = static_cast<bool>(use_kv_ratio_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    useKvRatioCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompactionOptionsFIFO_useKvRatioCompaction(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  return static_cast<jboolean>(opt->use_kv_ratio_compaction);
+}
+
 /*
  * Class:     org_rocksdb_CompactionOptionsFIFO
  * Method:    disposeInternal
diff --git a/java/rocksjni/config_options.cc b/java/rocksjni/config_options.cc
index 1532dd9e80ad..2f243f978423 100644
--- a/java/rocksjni/config_options.cc
+++ b/java/rocksjni/config_options.cc
@@ -19,9 +19,9 @@
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_ConfigOptions_disposeInternalJni(JNIEnv *, jclass,
+void Java_org_rocksdb_ConfigOptions_disposeInternalJni(JNIEnv*, jclass,
                                                        jlong jhandle) {
-  auto *co = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(jhandle);
+  auto* co = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(jhandle);
   assert(co != nullptr);
   delete co;
 }
@@ -31,8 +31,8 @@ void Java_org_rocksdb_ConfigOptions_disposeInternalJni(JNIEnv *, jclass,
  * Method:    newConfigOptions
  * Signature: ()J
  */
-jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv *, jclass) {
-  auto *cfg_opt = new ROCKSDB_NAMESPACE::ConfigOptions();
+jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv*, jclass) {
+  auto* cfg_opt = new ROCKSDB_NAMESPACE::ConfigOptions();
   return GET_CPLUSPLUS_POINTER(cfg_opt);
 }
 
@@ -41,11 +41,11 @@ jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv *, jclass) {
  * Method:    setEnv
  * Signature: (JJ;)V
  */
-void Java_org_rocksdb_ConfigOptions_setEnv(JNIEnv *, jclass, jlong handle,
+void Java_org_rocksdb_ConfigOptions_setEnv(JNIEnv*, jclass, jlong handle,
                                            jlong rocksdb_env_handle) {
-  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
-  auto *rocksdb_env =
-      reinterpret_cast<ROCKSDB_NAMESPACE::Env *>(rocksdb_env_handle);
+  auto* cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(handle);
+  auto* rocksdb_env =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(rocksdb_env_handle);
   cfg_opt->env = rocksdb_env;
 }
 
@@ -54,10 +54,10 @@ void Java_org_rocksdb_ConfigOptions_setEnv(JNIEnv *, jclass, jlong handle,
  * Method:    setDelimiter
  * Signature: (JLjava/lang/String;)V
  */
-void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv *env, jclass,
+void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv* env, jclass,
                                                  jlong handle, jstring s) {
-  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
-  const char *delim = env->GetStringUTFChars(s, nullptr);
+  auto* cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(handle);
+  const char* delim = env->GetStringUTFChars(s, nullptr);
   if (delim == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
@@ -71,10 +71,10 @@ void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv *env, jclass,
  * Method:    setIgnoreUnknownOptions
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv *, jclass,
+void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv*, jclass,
                                                             jlong handle,
                                                             jboolean b) {
-  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
+  auto* cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(handle);
   cfg_opt->ignore_unknown_options = static_cast<bool>(b);
 }
 
@@ -83,10 +83,10 @@ void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv *, jclass,
  * Method:    setInputStringsEscaped
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv *, jclass,
+void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv*, jclass,
                                                            jlong handle,
                                                            jboolean b) {
-  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
+  auto* cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(handle);
   cfg_opt->input_strings_escaped = static_cast<bool>(b);
 }
 
@@ -95,9 +95,9 @@ void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv *, jclass,
  * Method:    setSanityLevel
  * Signature: (JI)V
  */
-void Java_org_rocksdb_ConfigOptions_setSanityLevel(JNIEnv *, jclass,
+void Java_org_rocksdb_ConfigOptions_setSanityLevel(JNIEnv*, jclass,
                                                    jlong handle, jbyte level) {
-  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
+  auto* cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(handle);
   cfg_opt->sanity_level =
       ROCKSDB_NAMESPACE::SanityLevelJni::toCppSanityLevel(level);
 }
diff --git a/java/rocksjni/env_options.cc b/java/rocksjni/env_options.cc
index c3a9ae825da1..3f2577193e65 100644
--- a/java/rocksjni/env_options.cc
+++ b/java/rocksjni/env_options.cc
@@ -13,28 +13,28 @@
 #include "rocksdb/env.h"
 #include "rocksjni/cplusplus_to_java_convert.h"
 
-#define ENV_OPTIONS_SET_BOOL(_jhandle, _opt)                          \
-  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt = \
+#define ENV_OPTIONS_SET_BOOL(_jhandle, _opt)                         \
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(_jhandle)->_opt = \
       static_cast<bool>(_opt)
 
-#define ENV_OPTIONS_SET_SIZE_T(_jhandle, _opt)                        \
-  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt = \
+#define ENV_OPTIONS_SET_SIZE_T(_jhandle, _opt)                       \
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(_jhandle)->_opt = \
       static_cast<size_t>(_opt)
 
-#define ENV_OPTIONS_SET_UINT64_T(_jhandle, _opt)                      \
-  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt = \
+#define ENV_OPTIONS_SET_UINT64_T(_jhandle, _opt)                     \
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(_jhandle)->_opt = \
       static_cast<uint64_t>(_opt)
 
 #define ENV_OPTIONS_GET(_jhandle, _opt) \
-  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(_jhandle)->_opt
 
 /*
  * Class:     org_rocksdb_EnvOptions
  * Method:    newEnvOptions
  * Signature: ()J
  */
-jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(JNIEnv *, jclass) {
-  auto *env_opt = new ROCKSDB_NAMESPACE::EnvOptions();
+jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(JNIEnv*, jclass) {
+  auto* env_opt = new ROCKSDB_NAMESPACE::EnvOptions();
   return GET_CPLUSPLUS_POINTER(env_opt);
 }
 
@@ -43,11 +43,11 @@ jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(JNIEnv *, jclass) {
  * Method:    newEnvOptions
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv *, jclass,
+jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv*, jclass,
                                                    jlong jdboptions_handle) {
-  auto *db_options =
-      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions *>(jdboptions_handle);
-  auto *env_opt = new ROCKSDB_NAMESPACE::EnvOptions(*db_options);
+  auto* db_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdboptions_handle);
+  auto* env_opt = new ROCKSDB_NAMESPACE::EnvOptions(*db_options);
   return GET_CPLUSPLUS_POINTER(env_opt);
 }
 
@@ -56,9 +56,9 @@ jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv *, jclass,
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_EnvOptions_disposeInternalJni(JNIEnv *, jclass,
+void Java_org_rocksdb_EnvOptions_disposeInternalJni(JNIEnv*, jclass,
                                                     jlong jhandle) {
-  auto *eo = reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(jhandle);
+  auto* eo = reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(jhandle);
   assert(eo != nullptr);
   delete eo;
 }
@@ -68,8 +68,7 @@ void Java_org_rocksdb_EnvOptions_disposeInternalJni(JNIEnv *, jclass,
  * Method:    setUseMmapReads
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *, jclass,
-                                                 jlong jhandle,
+void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv*, jclass, jlong jhandle,
                                                  jboolean use_mmap_reads) {
   ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_reads);
 }
@@ -79,7 +78,7 @@ void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *, jclass,
  * Method:    useMmapReads
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv*, jclass,
                                                   jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, use_mmap_reads);
 }
@@ -89,7 +88,7 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *, jclass,
  * Method:    setUseMmapWrites
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *, jclass,
+void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv*, jclass,
                                                   jlong jhandle,
                                                   jboolean use_mmap_writes) {
   ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_writes);
@@ -100,7 +99,7 @@ void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *, jclass,
  * Method:    useMmapWrites
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv*, jclass,
                                                    jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, use_mmap_writes);
 }
@@ -110,7 +109,7 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *, jclass,
  * Method:    setUseDirectReads
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *, jclass,
+void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv*, jclass,
                                                    jlong jhandle,
                                                    jboolean use_direct_reads) {
   ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads);
@@ -121,7 +120,7 @@ void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *, jclass,
  * Method:    useDirectReads
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv*, jclass,
                                                     jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, use_direct_reads);
 }
@@ -132,7 +131,7 @@ jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *, jclass,
  * Signature: (JZ)V
  */
 void Java_org_rocksdb_EnvOptions_setUseDirectWrites(
-    JNIEnv *, jclass, jlong jhandle, jboolean use_direct_writes) {
+    JNIEnv*, jclass, jlong jhandle, jboolean use_direct_writes) {
   ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes);
 }
 
@@ -141,7 +140,7 @@ void Java_org_rocksdb_EnvOptions_setUseDirectWrites(
  * Method:    useDirectWrites
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv*, jclass,
                                                      jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, use_direct_writes);
 }
@@ -151,7 +150,7 @@ jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *, jclass,
  * Method:    setAllowFallocate
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *, jclass,
+void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv*, jclass,
                                                    jlong jhandle,
                                                    jboolean allow_fallocate) {
   ENV_OPTIONS_SET_BOOL(jhandle, allow_fallocate);
@@ -162,7 +161,7 @@ void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *, jclass,
  * Method:    allowFallocate
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv*, jclass,
                                                     jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, allow_fallocate);
 }
@@ -172,8 +171,7 @@ jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *, jclass,
  * Method:    setSetFdCloexec
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *, jclass,
-                                                 jlong jhandle,
+void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv*, jclass, jlong jhandle,
                                                  jboolean set_fd_cloexec) {
   ENV_OPTIONS_SET_BOOL(jhandle, set_fd_cloexec);
 }
@@ -183,7 +181,7 @@ void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *, jclass,
  * Method:    setFdCloexec
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv*, jclass,
                                                   jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, set_fd_cloexec);
 }
@@ -193,8 +191,7 @@ jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *, jclass,
  * Method:    setBytesPerSync
  * Signature: (JJ)V
  */
-void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *, jclass,
-                                                 jlong jhandle,
+void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv*, jclass, jlong jhandle,
                                                  jlong bytes_per_sync) {
   ENV_OPTIONS_SET_UINT64_T(jhandle, bytes_per_sync);
 }
@@ -204,8 +201,7 @@ void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *, jclass,
  * Method:    bytesPerSync
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *, jclass,
-                                               jlong jhandle) {
+jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv*, jclass, jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, bytes_per_sync);
 }
 
@@ -215,7 +211,7 @@ jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *, jclass,
  * Signature: (JZ)V
  */
 void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize(
-    JNIEnv *, jclass, jlong jhandle, jboolean fallocate_with_keep_size) {
+    JNIEnv*, jclass, jlong jhandle, jboolean fallocate_with_keep_size) {
   ENV_OPTIONS_SET_BOOL(jhandle, fallocate_with_keep_size);
 }
 
@@ -224,7 +220,7 @@ void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize(
  * Method:    fallocateWithKeepSize
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv*, jclass,
                                                            jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, fallocate_with_keep_size);
 }
@@ -235,7 +231,7 @@ jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *, jclass,
  * Signature: (JJ)V
  */
 void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize(
-    JNIEnv *, jclass, jlong jhandle, jlong compaction_readahead_size) {
+    JNIEnv*, jclass, jlong jhandle, jlong compaction_readahead_size) {
   ENV_OPTIONS_SET_SIZE_T(jhandle, compaction_readahead_size);
 }
 
@@ -244,7 +240,7 @@ void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize(
  * Method:    compactionReadaheadSize
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *, jclass,
+jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv*, jclass,
                                                           jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, compaction_readahead_size);
 }
@@ -255,7 +251,7 @@ jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *, jclass,
  * Signature: (JJ)V
  */
 void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize(
-    JNIEnv *, jclass, jlong jhandle, jlong writable_file_max_buffer_size) {
+    JNIEnv*, jclass, jlong jhandle, jlong writable_file_max_buffer_size) {
   ENV_OPTIONS_SET_SIZE_T(jhandle, writable_file_max_buffer_size);
 }
 
@@ -264,7 +260,7 @@ void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize(
  * Method:    writableFileMaxBufferSize
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *, jclass,
+jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv*, jclass,
                                                             jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, writable_file_max_buffer_size);
 }
@@ -274,11 +270,11 @@ jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *, jclass,
  * Method:    setRateLimiter
  * Signature: (JJ)V
  */
-void Java_org_rocksdb_EnvOptions_setRateLimiter(JNIEnv *, jclass, jlong jhandle,
+void Java_org_rocksdb_EnvOptions_setRateLimiter(JNIEnv*, jclass, jlong jhandle,
                                                 jlong rl_handle) {
-  auto *sptr_rate_limiter =
-      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter> *>(
+  auto* sptr_rate_limiter =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(
           rl_handle);
-  auto *env_opt = reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(jhandle);
+  auto* env_opt = reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(jhandle);
   env_opt->rate_limiter = sptr_rate_limiter->get();
 }
diff --git a/java/rocksjni/import_column_family_options.cc b/java/rocksjni/import_column_family_options.cc
index 1a9bded516b1..cd7bdfe007fa 100644
--- a/java/rocksjni/import_column_family_options.cc
+++ b/java/rocksjni/import_column_family_options.cc
@@ -16,8 +16,8 @@
  * Signature: ()J
  */
 jlong Java_org_rocksdb_ImportColumnFamilyOptions_newImportColumnFamilyOptions(
-    JNIEnv *, jclass) {
-  ROCKSDB_NAMESPACE::ImportColumnFamilyOptions *opts =
+    JNIEnv*, jclass) {
+  ROCKSDB_NAMESPACE::ImportColumnFamilyOptions* opts =
       new ROCKSDB_NAMESPACE::ImportColumnFamilyOptions();
   return GET_CPLUSPLUS_POINTER(opts);
 }
@@ -28,9 +28,9 @@ jlong Java_org_rocksdb_ImportColumnFamilyOptions_newImportColumnFamilyOptions(
  * Signature: (JZ)V
  */
 void Java_org_rocksdb_ImportColumnFamilyOptions_setMoveFiles(
-    JNIEnv *, jobject, jlong jhandle, jboolean jmove_files) {
-  auto *options =
-      reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions *>(jhandle);
+    JNIEnv*, jobject, jlong jhandle, jboolean jmove_files) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions*>(jhandle);
   options->move_files = static_cast<bool>(jmove_files);
 }
 
@@ -39,10 +39,10 @@ void Java_org_rocksdb_ImportColumnFamilyOptions_setMoveFiles(
  * Method:    moveFiles
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_ImportColumnFamilyOptions_moveFiles(JNIEnv *, jobject,
+jboolean Java_org_rocksdb_ImportColumnFamilyOptions_moveFiles(JNIEnv*, jobject,
                                                               jlong jhandle) {
-  auto *options =
-      reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions *>(jhandle);
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions*>(jhandle);
   return static_cast<jboolean>(options->move_files);
 }
 
@@ -51,9 +51,9 @@ jboolean Java_org_rocksdb_ImportColumnFamilyOptions_moveFiles(JNIEnv *, jobject,
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_ImportColumnFamilyOptions_disposeInternal(JNIEnv *,
+void Java_org_rocksdb_ImportColumnFamilyOptions_disposeInternal(JNIEnv*,
                                                                 jobject,
                                                                 jlong jhandle) {
-  delete reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions *>(
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions*>(
       jhandle);
 }
\ No newline at end of file
diff --git a/java/rocksjni/kv_helper.h b/java/rocksjni/kv_helper.h
index 5f0a8ffc57eb..75f254b173cd 100644
--- a/java/rocksjni/kv_helper.h
+++ b/java/rocksjni/kv_helper.h
@@ -81,7 +81,7 @@ class KVException : public std::exception {
     }
   }
 
-  KVException(jint code) : kCode_(code){};
+  KVException(jint code) : kCode_(code) {};
 
   virtual const char* what() const noexcept {
     return "Exception raised by JNI. There may be a Java exception in the "
@@ -176,13 +176,13 @@ class JByteArrayPinnableSlice {
       : env_(env),
         jbuffer_(jbuffer),
         jbuffer_off_(jbuffer_off),
-        jbuffer_len_(jbuffer_len){};
+        jbuffer_len_(jbuffer_len) {};
 
   /**
    * @brief Construct an empty new JByteArrayPinnableSlice object
    *
    */
-  JByteArrayPinnableSlice(JNIEnv* env) : env_(env){};
+  JByteArrayPinnableSlice(JNIEnv* env) : env_(env) {};
 
   PinnableSlice& pinnable_slice() { return pinnable_slice_; }
 
diff --git a/java/rocksjni/memory_util.cc b/java/rocksjni/memory_util.cc
index c87c4f403bbb..d60a89296481 100644
--- a/java/rocksjni/memory_util.cc
+++ b/java/rocksjni/memory_util.cc
@@ -21,9 +21,9 @@
  * Signature: ([J[J)Ljava/util/Map;
  */
 jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType(
-    JNIEnv *env, jclass, jlongArray jdb_handles, jlongArray jcache_handles) {
+    JNIEnv* env, jclass, jlongArray jdb_handles, jlongArray jcache_handles) {
   jboolean has_exception = JNI_FALSE;
-  std::vector<ROCKSDB_NAMESPACE::DB *> dbs =
+  std::vector<ROCKSDB_NAMESPACE::DB*> dbs =
       ROCKSDB_NAMESPACE::JniUtil::fromJPointers<ROCKSDB_NAMESPACE::DB>(
           env, jdb_handles, &has_exception);
   if (has_exception == JNI_TRUE) {
@@ -31,18 +31,18 @@ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType(
     return nullptr;
   }
 
-  std::unordered_set<const ROCKSDB_NAMESPACE::Cache *> cache_set;
+  std::unordered_set<const ROCKSDB_NAMESPACE::Cache*> cache_set;
   jsize cache_handle_count = env->GetArrayLength(jcache_handles);
   if (cache_handle_count > 0) {
-    jlong *ptr_jcache_handles =
+    jlong* ptr_jcache_handles =
         env->GetLongArrayElements(jcache_handles, nullptr);
     if (ptr_jcache_handles == nullptr) {
       // exception thrown: OutOfMemoryError
       return nullptr;
     }
     for (jsize i = 0; i < cache_handle_count; i++) {
-      auto *cache_ptr =
-          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *>(
+      auto* cache_ptr =
+          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(
               ptr_jcache_handles[i]);
       cache_set.insert(cache_ptr->get());
     }
@@ -68,7 +68,7 @@ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType(
       jobject>
       fn_map_kv = [env](
                       const std::pair<ROCKSDB_NAMESPACE::MemoryUtil::UsageType,
-                                      uint64_t> &pair) {
+                                      uint64_t>& pair) {
         // Construct key
         const jobject jusage_type = ROCKSDB_NAMESPACE::ByteJni::valueOf(
             env, ROCKSDB_NAMESPACE::MemoryUsageTypeJni::toJavaMemoryUsageType(
diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc
index c986511a3f2f..3166e6625090 100644
--- a/java/rocksjni/options.cc
+++ b/java/rocksjni/options.cc
@@ -1959,30 +1959,6 @@ jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen(JNIEnv*, jclass,
   return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
 }
 
-/*
- * Class:     org_rocksdb_Options
- * Method:    setSkipCheckingSstFileSizesOnDbOpen
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setSkipCheckingSstFileSizesOnDbOpen(
-    JNIEnv*, jclass, jlong jhandle,
-    jboolean jskip_checking_sst_file_sizes_on_db_open) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
-  opt->skip_checking_sst_file_sizes_on_db_open =
-      static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    skipCheckingSstFileSizesOnDbOpen
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen(
-    JNIEnv*, jclass, jlong jhandle) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
-  return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
-}
-
 /*
  * Class:     org_rocksdb_Options
  * Method:    setWalRecoveryMode
@@ -2055,29 +2031,6 @@ void Java_org_rocksdb_Options_setWalFilter(JNIEnv*, jclass, jlong jhandle,
   opt->wal_filter = wal_filter;
 }
 
-/*
- * Class:     org_rocksdb_Options
- * Method:    setFailIfOptionsFileError
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setFailIfOptionsFileError(
-    JNIEnv*, jclass, jlong jhandle, jboolean jfail_if_options_file_error) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
-  opt->fail_if_options_file_error =
-      static_cast<bool>(jfail_if_options_file_error);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    failIfOptionsFileError
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_failIfOptionsFileError(JNIEnv*, jclass,
-                                                         jlong jhandle) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
-  return static_cast<jboolean>(opt->fail_if_options_file_error);
-}
-
 /*
  * Class:     org_rocksdb_Options
  * Method:    setDumpMallocStats
@@ -2456,28 +2409,6 @@ void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge(
       ->min_write_buffer_number_to_merge =
       static_cast<int>(jmin_write_buffer_number_to_merge);
 }
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxWriteBufferNumberToMaintain
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain(JNIEnv*, jclass,
-                                                             jlong jhandle) {
-  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
-      ->max_write_buffer_number_to_maintain;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxWriteBufferNumberToMaintain
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMaxWriteBufferNumberToMaintain(
-    JNIEnv*, jclass, jlong jhandle, jint jmax_write_buffer_number_to_maintain) {
-  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
-      ->max_write_buffer_number_to_maintain =
-      static_cast<int>(jmax_write_buffer_number_to_maintain);
-}
 
 /*
  * Class:     org_rocksdb_Options
@@ -4496,29 +4427,6 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMinWriteBufferNumberToMerge(
       static_cast<int>(jmin_write_buffer_number_to_merge);
 }
 
-/*
- * Class:     org_rocksdb_ColumnFamilyOptions
- * Method:    maxWriteBufferNumberToMaintain
- * Signature: (J)I
- */
-jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumberToMaintain(
-    JNIEnv*, jclass, jlong jhandle) {
-  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
-      ->max_write_buffer_number_to_maintain;
-}
-
-/*
- * Class:     org_rocksdb_ColumnFamilyOptions
- * Method:    setMaxWriteBufferNumberToMaintain
- * Signature: (JI)V
- */
-void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumberToMaintain(
-    JNIEnv*, jclass, jlong jhandle, jint jmax_write_buffer_number_to_maintain) {
-  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
-      ->max_write_buffer_number_to_maintain =
-      static_cast<int>(jmax_write_buffer_number_to_maintain);
-}
-
 /*
  * Class:     org_rocksdb_ColumnFamilyOptions
  * Method:    setCompressionType
@@ -7427,30 +7335,6 @@ jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen(JNIEnv*, jclass,
   return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
 }
 
-/*
- * Class:     org_rocksdb_DBOptions
- * Method:    setSkipCheckingSstFileSizesOnDbOpen
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_DBOptions_setSkipCheckingSstFileSizesOnDbOpen(
-    JNIEnv*, jclass, jlong jhandle,
-    jboolean jskip_checking_sst_file_sizes_on_db_open) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
-  opt->skip_checking_sst_file_sizes_on_db_open =
-      static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
-}
-
-/*
- * Class:     org_rocksdb_DBOptions
- * Method:    skipCheckingSstFileSizesOnDbOpen
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_DBOptions_skipCheckingSstFileSizesOnDbOpen(
-    JNIEnv*, jclass, jlong jhandle) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
-  return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
-}
-
 /*
  * Class:     org_rocksdb_DBOptions
  * Method:    setWalRecoveryMode
@@ -7524,29 +7408,6 @@ void Java_org_rocksdb_DBOptions_setWalFilter(JNIEnv*, jclass, jlong jhandle,
   opt->wal_filter = wal_filter;
 }
 
-/*
- * Class:     org_rocksdb_DBOptions
- * Method:    setFailIfOptionsFileError
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_DBOptions_setFailIfOptionsFileError(
-    JNIEnv*, jclass, jlong jhandle, jboolean jfail_if_options_file_error) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
-  opt->fail_if_options_file_error =
-      static_cast<bool>(jfail_if_options_file_error);
-}
-
-/*
- * Class:     org_rocksdb_DBOptions
- * Method:    failIfOptionsFileError
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_DBOptions_failIfOptionsFileError(JNIEnv*, jclass,
-                                                           jlong jhandle) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
-  return static_cast<jboolean>(opt->fail_if_options_file_error);
-}
-
 /*
  * Class:     org_rocksdb_DBOptions
  * Method:    setDumpMallocStats
@@ -8170,26 +8031,6 @@ jboolean Java_org_rocksdb_ReadOptions_tailing(JNIEnv*, jclass, jlong jhandle) {
   return reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->tailing;
 }
 
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    managed
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_ReadOptions_managed(JNIEnv*, jclass, jlong jhandle) {
-  return reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->managed;
-}
-
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    setManaged
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_ReadOptions_setManaged(JNIEnv*, jclass, jlong jhandle,
-                                             jboolean jmanaged) {
-  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->managed =
-      static_cast<bool>(jmanaged);
-}
-
 /*
  * Class:     org_rocksdb_ReadOptions
  * Method:    totalOrderSeek
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index d0f288ca8281..9600a736573a 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -5101,9 +5101,9 @@ class TickerTypeJni {
         return -0x1;
       case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ:
         return -0x2;
-      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED:
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_DEPRECATED:
         return -0x3;
-      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL:
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL_DEPRECATED:
         return -0x4;
       case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB:
         return -0x5;
@@ -5195,6 +5195,8 @@ class TickerTypeJni {
         return -0x2F;
       case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES:
         return -0x30;
+      case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_RESUMED_BYTES:
+        return -0x5F;
       case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES:
         return -0x31;
       case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES:
@@ -5273,6 +5275,38 @@ class TickerTypeJni {
         return -0x56;
       case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT:
         return -0x57;
+      case ROCKSDB_NAMESPACE::Tickers::FIFO_CHANGE_TEMPERATURE_COMPACTIONS:
+        return -0x58;
+      case ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_BYTES:
+        return -0x59;
+      case ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_COUNT:
+        return -0x5A;
+      case ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_BYTES:
+        return -0x5B;
+      case ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_COUNT:
+        return -0x5C;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_WBWI_INGEST:
+        return -0x5D;
+      case ROCKSDB_NAMESPACE::Tickers::SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT:
+        return -0x5E;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_CALLS:
+        return -0x60;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_ERRORS:
+        return -0x61;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_PREFETCHED:
+        return -0x62;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_FROM_CACHE:
+        return -0x63;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BYTES:
+        return -0x64;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BLOCKS_WASTED:
+        return -0x65;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_REQUESTS:
+        return -0x66;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_COALESCED_NONADJACENT:
+        return -0x67;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_SEEK_ERRORS:
+        return -0x68;
       case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
         // -0x54 is the max value at this time. Since these values are exposed
         // directly to Java clients, we'll keep the value the same till the next
@@ -5560,9 +5594,9 @@ class TickerTypeJni {
       case -0x2:
         return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ;
       case -0x3:
-        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED;
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_DEPRECATED;
       case -0x4:
-        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL;
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL_DEPRECATED;
       case -0x5:
         return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB;
       case -0x6:
@@ -5654,6 +5688,8 @@ class TickerTypeJni {
         return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES;
       case -0x30:
         return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES;
+      case -0x5F:
+        return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_RESUMED_BYTES;
       case -0x31:
         return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES;
       case -0x32:
@@ -5735,6 +5771,39 @@ class TickerTypeJni {
       case -0x57:
         return ROCKSDB_NAMESPACE::Tickers::
             FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT;
+      case -0x58:
+        return ROCKSDB_NAMESPACE::Tickers::FIFO_CHANGE_TEMPERATURE_COMPACTIONS;
+      case -0x59:
+        return ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_BYTES;
+      case -0x5A:
+        return ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_COUNT;
+      case -0x5B:
+        return ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_BYTES;
+      case -0x5C:
+        return ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_COUNT;
+      case -0x5D:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_WBWI_INGEST;
+      case -0x5E:
+        return ROCKSDB_NAMESPACE::Tickers::
+            SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT;
+      case -0x60:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_CALLS;
+      case -0x61:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_ERRORS;
+      case -0x62:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_PREFETCHED;
+      case -0x63:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_FROM_CACHE;
+      case -0x64:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BYTES;
+      case -0x65:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BLOCKS_WASTED;
+      case -0x66:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_REQUESTS;
+      case -0x67:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_COALESCED_NONADJACENT;
+      case -0x68:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_SEEK_ERRORS;
       case -0x54:
         // -0x54 is the max value at this time. Since these values are exposed
         // directly to Java clients, we'll keep the value the same till the next
@@ -5889,8 +5958,15 @@ class HistogramTypeJni {
         return 0x3C;
       case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES:
         return 0x3D;
+      case ROCKSDB_NAMESPACE::Histograms::COMPACTION_PREFETCH_BYTES:
+        return 0x3F;
+      case ROCKSDB_NAMESPACE::Histograms::MULTISCAN_PREPARE_MICROS:
+        return 0x40;
+      case ROCKSDB_NAMESPACE::Histograms::MULTISCAN_BLOCKS_PER_PREPARE:
+        return 0x41;
       case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
-        // 0x3D for backwards compatibility on current minor version.
+        // 0x3E is reserved for backwards compatibility on current minor
+        // version.
         return 0x3E;
       default:
         // undefined/default
@@ -6033,8 +6109,15 @@ class HistogramTypeJni {
       case 0x3D:
         return ROCKSDB_NAMESPACE::Histograms::
             TABLE_OPEN_PREFETCH_TAIL_READ_BYTES;
+      case 0x3F:
+        return ROCKSDB_NAMESPACE::Histograms::COMPACTION_PREFETCH_BYTES;
+      case 0x40:
+        return ROCKSDB_NAMESPACE::Histograms::MULTISCAN_PREPARE_MICROS;
+      case 0x41:
+        return ROCKSDB_NAMESPACE::Histograms::MULTISCAN_BLOCKS_PER_PREPARE;
       case 0x3E:
-        // 0x1F for backwards compatibility on current minor version.
+        // 0x3E is reserved for backwards compatibility on current minor
+        // version.
         return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX;
 
       default:
@@ -6933,6 +7016,44 @@ class DataBlockIndexTypeJni {
   }
 };
 
+// The portal class for org.rocksdb.IndexSearchType
+class IndexSearchTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.IndexSearchType for the provided
+  // C++ ROCKSDB_NAMESPACE::BlockSearchType enum
+  static jbyte toJavaIndexSearchType(
+      const ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType&
+          index_block_search_type) {
+    switch (index_block_search_type) {
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType::kBinary:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType::
+          kInterpolation:
+        return 0x1;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::BlockSearchType enum for
+  // the provided Java org.rocksdb.IndexSearchType
+  static ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType
+  toCppIndexSearchType(jbyte jindex_search_type) {
+    switch (jindex_search_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType::
+            kBinary;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType::
+            kInterpolation;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType::
+            kBinary;
+    }
+  }
+};
+
 // The portal class for org.rocksdb.ChecksumType
 class ChecksumTypeJni {
  public:
@@ -9117,7 +9238,7 @@ class BlockBasedTableOptionsJni
     }
 
     jmethodID method_id_init =
-        env->GetMethodID(jclazz, "<init>", "(ZZZZBBDBZJIIIJZZZZZIIZZBBJD)V");
+        env->GetMethodID(jclazz, "<init>", "(ZZZZBBDBZJIIIJZZZZZIIZZJJBBBJD)V");
     if (method_id_init == nullptr) {
       // exception thrown: NoSuchMethodException or OutOfMemoryError
       return nullptr;
@@ -9162,8 +9283,13 @@ class BlockBasedTableOptionsJni
         table_factory_options->format_version,
         table_factory_options->enable_index_compression,
         table_factory_options->block_align,
+        static_cast<jlong>(table_factory_options->super_block_alignment_size),
+        static_cast<jlong>(
+            table_factory_options->super_block_alignment_space_overhead_ratio),
         IndexShorteningModeJni::toJavaIndexShorteningMode(
             table_factory_options->index_shortening),
+        IndexSearchTypeJni::toJavaIndexSearchType(
+            table_factory_options->index_block_search_type),
         FilterPolicyJni::toJavaIndexType(filter_policy_type),
         filter_policy_handle, filter_policy_config_value);
     if (env->ExceptionCheck()) {
diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc
index 9561b3893661..4a33d4e2f5e4 100644
--- a/java/rocksjni/rocksjni.cc
+++ b/java/rocksjni/rocksjni.cc
@@ -34,11 +34,12 @@
 #undef min
 #endif
 
-jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path,
-                          std::function<ROCKSDB_NAMESPACE::Status(
-                              const ROCKSDB_NAMESPACE::Options&,
-                              const std::string&, ROCKSDB_NAMESPACE::DB**)>
-                              open_fn) {
+jlong rocksdb_open_helper(
+    JNIEnv* env, jlong jopt_handle, jstring jdb_path,
+    std::function<ROCKSDB_NAMESPACE::Status(
+        const ROCKSDB_NAMESPACE::Options&, const std::string&,
+        std::unique_ptr<ROCKSDB_NAMESPACE::DB>*)>
+        open_fn) {
   const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
   if (db_path == nullptr) {
     // exception thrown: OutOfMemoryError
@@ -46,13 +47,13 @@ jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path,
   }
 
   auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jopt_handle);
-  ROCKSDB_NAMESPACE::DB* db = nullptr;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Status s = open_fn(*opt, db_path, &db);
 
   env->ReleaseStringUTFChars(jdb_path, db_path);
 
   if (s.ok()) {
-    return GET_CPLUSPLUS_POINTER(db);
+    return GET_CPLUSPLUS_POINTER(db.release());
   } else {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
     return 0;
@@ -67,11 +68,12 @@ jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path,
 jlong Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2(JNIEnv* env, jclass,
                                                           jlong jopt_handle,
                                                           jstring jdb_path) {
-  return rocksdb_open_helper(env, jopt_handle, jdb_path,
-                             (ROCKSDB_NAMESPACE::Status(*)(
-                                 const ROCKSDB_NAMESPACE::Options&,
-                                 const std::string&, ROCKSDB_NAMESPACE::DB**)) &
-                                 ROCKSDB_NAMESPACE::DB::Open);
+  return rocksdb_open_helper(
+      env, jopt_handle, jdb_path,
+      [](const ROCKSDB_NAMESPACE::Options& options, const std::string& db_path,
+         std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
+        return ROCKSDB_NAMESPACE::DB::Open(options, db_path, db);
+      });
 }
 
 /*
@@ -87,7 +89,7 @@ jlong Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2Z(
       env, jopt_handle, jdb_path,
       [error_if_wal_file_exists](const ROCKSDB_NAMESPACE::Options& options,
                                  const std::string& db_path,
-                                 ROCKSDB_NAMESPACE::DB** db) {
+                                 std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
         return ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, db_path, db,
                                                       error_if_wal_file_exists);
       });
@@ -100,7 +102,7 @@ jlongArray rocksdb_open_helper(
         const ROCKSDB_NAMESPACE::DBOptions&, const std::string&,
         const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&,
         std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>*,
-        ROCKSDB_NAMESPACE::DB**)>
+        std::unique_ptr<ROCKSDB_NAMESPACE::DB>*)>
         open_fn) {
   const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
   if (db_path == nullptr) {
@@ -141,7 +143,7 @@ jlongArray rocksdb_open_helper(
 
   auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jopt_handle);
   std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
-  ROCKSDB_NAMESPACE::DB* db = nullptr;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Status s =
       open_fn(*opt, db_path, column_families, &cf_handles, &db);
 
@@ -157,7 +159,7 @@ jlongArray rocksdb_open_helper(
   const jsize resultsLen = 1 + len_cols;  // db handle + column family handles
   std::unique_ptr<jlong[]> results =
       std::unique_ptr<jlong[]>(new jlong[resultsLen]);
-  results[0] = GET_CPLUSPLUS_POINTER(db);
+  results[0] = GET_CPLUSPLUS_POINTER(db.release());
   for (int i = 1; i <= len_cols; i++) {
     results[i] = GET_CPLUSPLUS_POINTER(cf_handles[i - 1]);
   }
@@ -196,7 +198,7 @@ jlongArray Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2_3_3B_3JZ(
           const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&
               column_families,
           std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>* handles,
-          ROCKSDB_NAMESPACE::DB** db) {
+          std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
         return ROCKSDB_NAMESPACE::DB::OpenForReadOnly(
             options, db_path, column_families, handles, db,
             error_if_wal_file_exists);
@@ -213,12 +215,15 @@ jlongArray Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2_3_3B_3J(
     jobjectArray jcolumn_names, jlongArray jcolumn_options) {
   return rocksdb_open_helper(
       env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options,
-      (ROCKSDB_NAMESPACE::Status(*)(
-          const ROCKSDB_NAMESPACE::DBOptions&, const std::string&,
-          const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&,
-          std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>*,
-          ROCKSDB_NAMESPACE::DB**)) &
-          ROCKSDB_NAMESPACE::DB::Open);
+      [](const ROCKSDB_NAMESPACE::DBOptions& options,
+         const std::string& db_path,
+         const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&
+             column_families,
+         std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>* handles,
+         std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
+        return ROCKSDB_NAMESPACE::DB::Open(options, db_path, column_families,
+                                           handles, db);
+      });
 }
 
 /*
@@ -240,7 +245,7 @@ jlong Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_S
       env, jopt_handle, jdb_path,
       [secondary_db_path](const ROCKSDB_NAMESPACE::Options& options,
                           const std::string& db_path,
-                          ROCKSDB_NAMESPACE::DB** db) {
+                          std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
         return ROCKSDB_NAMESPACE::DB::OpenAsSecondary(options, db_path,
                                                       secondary_db_path, db);
       });
@@ -276,7 +281,7 @@ Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_String_
           const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&
               column_families,
           std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>* handles,
-          ROCKSDB_NAMESPACE::DB** db) {
+          std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
         return ROCKSDB_NAMESPACE::DB::OpenAsSecondary(
             options, db_path, secondary_db_path, column_families, handles, db);
       });
@@ -1210,6 +1215,9 @@ jint Java_org_rocksdb_RocksDB_getDirect(JNIEnv* env, jclass /*jdb*/,
           db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice());
     }
 
+    if (s.IsNotFound()) {
+      return ROCKSDB_NAMESPACE::KVException::kNotFound;
+    }
     ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.Fetch();
   } catch (ROCKSDB_NAMESPACE::KVException& e) {
@@ -1453,10 +1461,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII(JNIEnv* env, jclass,
   try {
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env,
+    ROCKSDB_NAMESPACE::Status s =
         db->Get(ROCKSDB_NAMESPACE::ReadOptions(), db->DefaultColumnFamily(),
-                key.slice(), &value.pinnable_slice()));
+                key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return nullptr;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.NewByteArray();
 
   } catch (ROCKSDB_NAMESPACE::KVException&) {
@@ -1484,9 +1495,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIIJ(JNIEnv* env, jclass,
   try {
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env, db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(),
-                     &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s =
+        db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(),
+                &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return nullptr;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.NewByteArray();
 
   } catch (ROCKSDB_NAMESPACE::KVException&) {
@@ -1509,11 +1524,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII(JNIEnv* env, jclass,
   try {
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env,
-        db->Get(
-            *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
-            db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s = db->Get(
+        *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
+        db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return nullptr;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.NewByteArray();
   } catch (ROCKSDB_NAMESPACE::KVException&) {
     return nullptr;
@@ -1538,10 +1555,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIIJ(
   try {
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env, db->Get(*reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(
-                         jropt_handle),
-                     cf_handle, key.slice(), &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s = db->Get(
+        *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
+        cf_handle, key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return nullptr;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.NewByteArray();
   } catch (ROCKSDB_NAMESPACE::KVException&) {
     return nullptr;
@@ -1563,10 +1583,13 @@ jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII(JNIEnv* env, jclass,
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off,
                                                      jval_len);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env,
+    ROCKSDB_NAMESPACE::Status s =
         db->Get(ROCKSDB_NAMESPACE::ReadOptions(), db->DefaultColumnFamily(),
-                key.slice(), &value.pinnable_slice()));
+                key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return ROCKSDB_NAMESPACE::KVException::kNotFound;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.Fetch();
 
   } catch (ROCKSDB_NAMESPACE::KVException& e) {
@@ -1595,9 +1618,13 @@ jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ(JNIEnv* env, jclass,
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off,
                                                      jval_len);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env, db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(),
-                     &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s =
+        db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(),
+                &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return ROCKSDB_NAMESPACE::KVException::kNotFound;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.Fetch();
 
   } catch (ROCKSDB_NAMESPACE::KVException& e) {
@@ -1621,11 +1648,13 @@ jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII(JNIEnv* env, jclass,
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off,
                                                      jval_len);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env,
-        db->Get(
-            *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
-            db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s = db->Get(
+        *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
+        db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return ROCKSDB_NAMESPACE::KVException::kNotFound;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.Fetch();
 
   } catch (ROCKSDB_NAMESPACE::KVException& e) {
@@ -1652,10 +1681,13 @@ jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BIIJ(
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off,
                                                      jval_len);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env, db->Get(*reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(
-                         jropt_handle),
-                     cf_handle, key.slice(), &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s = db->Get(
+        *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
+        cf_handle, key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return ROCKSDB_NAMESPACE::KVException::kNotFound;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.Fetch();
 
   } catch (ROCKSDB_NAMESPACE::KVException& e) {
@@ -2951,6 +2983,28 @@ void Java_org_rocksdb_RocksDB_continueBackgroundWork(JNIEnv* env, jclass,
   }
 }
 
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    abortAllCompactions
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_abortAllCompactions(JNIEnv*, jclass,
+                                                  jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  db->AbortAllCompactions();
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    resumeAllCompactions
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_resumeAllCompactions(JNIEnv*, jclass,
+                                                   jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  db->ResumeAllCompactions();
+}
+
 /*
  * Class:     org_rocksdb_RocksDB
  * Method:    enableAutoCompaction
@@ -2996,17 +3050,9 @@ jint Java_org_rocksdb_RocksDB_numberLevels(JNIEnv*, jclass, jlong jdb_handle,
  * Signature: (JJ)I
  */
 jint Java_org_rocksdb_RocksDB_maxMemCompactionLevel(JNIEnv*, jclass,
-                                                    jlong jdb_handle,
-                                                    jlong jcf_handle) {
-  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
-  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
-  if (jcf_handle == 0) {
-    cf_handle = db->DefaultColumnFamily();
-  } else {
-    cf_handle =
-        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
-  }
-  return static_cast<jint>(db->MaxMemCompactionLevel(cf_handle));
+                                                    jlong /*jdb_handle*/,
+                                                    jlong /*jcf_handle*/) {
+  return 0;
 }
 
 /*
@@ -3637,7 +3683,7 @@ void Java_org_rocksdb_RocksDB_destroyDB(JNIEnv* env, jclass, jstring jdb_path,
 }
 
 bool get_slice_helper(JNIEnv* env, jobjectArray ranges, jsize index,
-                      std::unique_ptr<ROCKSDB_NAMESPACE::Slice>& slice,
+                      ROCKSDB_NAMESPACE::OptSlice& opt_slice,
                       std::vector<std::unique_ptr<jbyte[]>>& ranges_to_free) {
   jobject jArray = env->GetObjectArrayElement(ranges, index);
   if (env->ExceptionCheck()) {
@@ -3659,8 +3705,8 @@ bool get_slice_helper(JNIEnv* env, jobjectArray ranges, jsize index,
     return false;
   }
   env->DeleteLocalRef(jArray);
-  slice.reset(new ROCKSDB_NAMESPACE::Slice(
-      reinterpret_cast<char*>(ranges_to_free.back().get()), len_ba));
+  opt_slice = ROCKSDB_NAMESPACE::Slice(
+      reinterpret_cast<char*>(ranges_to_free.back().get()), len_ba);
   return true;
 }
 /*
@@ -3675,24 +3721,24 @@ void Java_org_rocksdb_RocksDB_deleteFilesInRanges(JNIEnv* env, jclass /*jdb*/,
                                                   jboolean include_end) {
   jsize length = env->GetArrayLength(ranges);
 
-  std::vector<ROCKSDB_NAMESPACE::RangePtr> rangesVector;
-  std::vector<std::unique_ptr<ROCKSDB_NAMESPACE::Slice>> slices;
+  std::vector<ROCKSDB_NAMESPACE::RangeOpt> rangesVector;
+  std::vector<ROCKSDB_NAMESPACE::OptSlice> slices;
   std::vector<std::unique_ptr<jbyte[]>> ranges_to_free;
   for (jsize i = 0; (i + 1) < length; i += 2) {
-    slices.push_back(std::unique_ptr<ROCKSDB_NAMESPACE::Slice>());
+    slices.emplace_back();
     if (!get_slice_helper(env, ranges, i, slices.back(), ranges_to_free)) {
       // exception thrown
       return;
     }
 
-    slices.push_back(std::unique_ptr<ROCKSDB_NAMESPACE::Slice>());
+    slices.emplace_back();
     if (!get_slice_helper(env, ranges, i + 1, slices.back(), ranges_to_free)) {
       // exception thrown
       return;
     }
 
-    rangesVector.push_back(ROCKSDB_NAMESPACE::RangePtr(
-        slices[slices.size() - 2].get(), slices[slices.size() - 1].get()));
+    rangesVector.push_back(ROCKSDB_NAMESPACE::RangeOpt(
+        slices[slices.size() - 2], slices[slices.size() - 1]));
   }
 
   auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
diff --git a/java/rocksjni/sst_file_readerjni.cc b/java/rocksjni/sst_file_readerjni.cc
index 4af472ecfb1c..c0370b1d64d8 100644
--- a/java/rocksjni/sst_file_readerjni.cc
+++ b/java/rocksjni/sst_file_readerjni.cc
@@ -24,12 +24,11 @@
  * Method:    newSstFileReader
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv * /*env*/,
+jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv* /*env*/,
                                                       jclass /*jcls*/,
                                                       jlong joptions) {
-  auto *options =
-      reinterpret_cast<const ROCKSDB_NAMESPACE::Options *>(joptions);
-  ROCKSDB_NAMESPACE::SstFileReader *sst_file_reader =
+  auto* options = reinterpret_cast<const ROCKSDB_NAMESPACE::Options*>(joptions);
+  ROCKSDB_NAMESPACE::SstFileReader* sst_file_reader =
       new ROCKSDB_NAMESPACE::SstFileReader(*options);
   return GET_CPLUSPLUS_POINTER(sst_file_reader);
 }
@@ -39,15 +38,15 @@ jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv * /*env*/,
  * Method:    open
  * Signature: (JLjava/lang/String;)V
  */
-void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileReader_open(JNIEnv* env, jclass /*jcls*/,
                                          jlong jhandle, jstring jfile_path) {
-  const char *file_path = env->GetStringUTFChars(jfile_path, nullptr);
+  const char* file_path = env->GetStringUTFChars(jfile_path, nullptr);
   if (file_path == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle)->Open(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader*>(jhandle)->Open(
           file_path);
   env->ReleaseStringUTFChars(jfile_path, file_path);
 
@@ -61,13 +60,13 @@ void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jclass /*jcls*/,
  * Method:    newIterator
  * Signature: (JJ)J
  */
-jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv * /*env*/,
+jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv* /*env*/,
                                                  jclass /*jcls*/, jlong jhandle,
                                                  jlong jread_options_handle) {
-  auto *sst_file_reader =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
-  auto *read_options =
-      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions *>(jread_options_handle);
+  auto* sst_file_reader =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader*>(jhandle);
+  auto* read_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jread_options_handle);
   return GET_CPLUSPLUS_POINTER(sst_file_reader->NewIterator(*read_options));
 }
 
@@ -76,10 +75,10 @@ jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv * /*env*/,
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_SstFileReader_disposeInternalJni(JNIEnv * /*env*/,
+void Java_org_rocksdb_SstFileReader_disposeInternalJni(JNIEnv* /*env*/,
                                                        jclass /*jcls*/,
                                                        jlong jhandle) {
-  delete reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader*>(jhandle);
 }
 
 /*
@@ -87,10 +86,10 @@ void Java_org_rocksdb_SstFileReader_disposeInternalJni(JNIEnv * /*env*/,
  * Method:    verifyChecksum
  * Signature: (J)V
  */
-void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv* env, jclass /*jcls*/,
                                                    jlong jhandle) {
-  auto *sst_file_reader =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
+  auto* sst_file_reader =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader*>(jhandle);
   auto s = sst_file_reader->VerifyChecksum();
   if (!s.ok()) {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
@@ -102,11 +101,11 @@ void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env, jclass /*jcls*/,
  * Method:    getTableProperties
  * Signature: (J)J
  */
-jobject Java_org_rocksdb_SstFileReader_getTableProperties(JNIEnv *env,
+jobject Java_org_rocksdb_SstFileReader_getTableProperties(JNIEnv* env,
                                                           jclass /*jcls*/,
                                                           jlong jhandle) {
-  auto *sst_file_reader =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
+  auto* sst_file_reader =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader*>(jhandle);
   std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties> tp =
       sst_file_reader->GetTableProperties();
   jobject jtable_properties =
diff --git a/java/rocksjni/sst_file_writerjni.cc b/java/rocksjni/sst_file_writerjni.cc
index 481adbc85640..fbe888ab01b3 100644
--- a/java/rocksjni/sst_file_writerjni.cc
+++ b/java/rocksjni/sst_file_writerjni.cc
@@ -25,27 +25,26 @@
  * Signature: (JJJB)J
  */
 jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJB(
-    JNIEnv * /*env*/, jclass /*jcls*/, jlong jenvoptions, jlong joptions,
+    JNIEnv* /*env*/, jclass /*jcls*/, jlong jenvoptions, jlong joptions,
     jlong jcomparator_handle, jbyte jcomparator_type) {
-  ROCKSDB_NAMESPACE::Comparator *comparator = nullptr;
+  ROCKSDB_NAMESPACE::Comparator* comparator = nullptr;
   switch (jcomparator_type) {
     // JAVA_COMPARATOR
     case 0x0:
-      comparator = reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallback *>(
+      comparator = reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallback*>(
           jcomparator_handle);
       break;
 
     // JAVA_NATIVE_COMPARATOR_WRAPPER
     case 0x1:
       comparator =
-          reinterpret_cast<ROCKSDB_NAMESPACE::Comparator *>(jcomparator_handle);
+          reinterpret_cast<ROCKSDB_NAMESPACE::Comparator*>(jcomparator_handle);
       break;
   }
-  auto *env_options =
-      reinterpret_cast<const ROCKSDB_NAMESPACE::EnvOptions *>(jenvoptions);
-  auto *options =
-      reinterpret_cast<const ROCKSDB_NAMESPACE::Options *>(joptions);
-  ROCKSDB_NAMESPACE::SstFileWriter *sst_file_writer =
+  auto* env_options =
+      reinterpret_cast<const ROCKSDB_NAMESPACE::EnvOptions*>(jenvoptions);
+  auto* options = reinterpret_cast<const ROCKSDB_NAMESPACE::Options*>(joptions);
+  ROCKSDB_NAMESPACE::SstFileWriter* sst_file_writer =
       new ROCKSDB_NAMESPACE::SstFileWriter(*env_options, *options, comparator);
   return GET_CPLUSPLUS_POINTER(sst_file_writer);
 }
@@ -55,15 +54,14 @@ jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJB(
  * Method:    newSstFileWriter
  * Signature: (JJ)J
  */
-jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv * /*env*/,
+jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv* /*env*/,
                                                           jclass /*jcls*/,
                                                           jlong jenvoptions,
                                                           jlong joptions) {
-  auto *env_options =
-      reinterpret_cast<const ROCKSDB_NAMESPACE::EnvOptions *>(jenvoptions);
-  auto *options =
-      reinterpret_cast<const ROCKSDB_NAMESPACE::Options *>(joptions);
-  ROCKSDB_NAMESPACE::SstFileWriter *sst_file_writer =
+  auto* env_options =
+      reinterpret_cast<const ROCKSDB_NAMESPACE::EnvOptions*>(jenvoptions);
+  auto* options = reinterpret_cast<const ROCKSDB_NAMESPACE::Options*>(joptions);
+  ROCKSDB_NAMESPACE::SstFileWriter* sst_file_writer =
       new ROCKSDB_NAMESPACE::SstFileWriter(*env_options, *options);
   return GET_CPLUSPLUS_POINTER(sst_file_writer);
 }
@@ -73,15 +71,15 @@ jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv * /*env*/,
  * Method:    open
  * Signature: (JLjava/lang/String;)V
  */
-void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_open(JNIEnv* env, jclass /*jcls*/,
                                          jlong jhandle, jstring jfile_path) {
-  const char *file_path = env->GetStringUTFChars(jfile_path, nullptr);
+  const char* file_path = env->GetStringUTFChars(jfile_path, nullptr);
   if (file_path == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Open(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Open(
           file_path);
   env->ReleaseStringUTFChars(jfile_path, file_path);
 
@@ -95,14 +93,14 @@ void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jclass /*jcls*/,
  * Method:    put
  * Signature: (JJJ)V
  */
-void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv* env, jclass /*jcls*/,
                                              jlong jhandle, jlong jkey_handle,
                                              jlong jvalue_handle) {
-  auto *key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jkey_handle);
-  auto *value_slice =
-      reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jvalue_handle);
+  auto* key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jkey_handle);
+  auto* value_slice =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jvalue_handle);
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Put(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Put(
           *key_slice, *value_slice);
   if (!s.ok()) {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
@@ -114,28 +112,28 @@ void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jclass /*jcls*/,
  * Method:    put
  * Signature: (JJJ)V
  */
-void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv* env, jclass /*jcls*/,
                                                  jlong jhandle, jbyteArray jkey,
                                                  jbyteArray jval) {
-  jbyte *key = env->GetByteArrayElements(jkey, nullptr);
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
   if (key == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
-  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char *>(key),
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
                                      env->GetArrayLength(jkey));
 
-  jbyte *value = env->GetByteArrayElements(jval, nullptr);
+  jbyte* value = env->GetByteArrayElements(jval, nullptr);
   if (value == nullptr) {
     // exception thrown: OutOfMemoryError
     env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
     return;
   }
-  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char *>(value),
+  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char*>(value),
                                        env->GetArrayLength(jval));
 
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Put(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Put(
           key_slice, value_slice);
 
   env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
@@ -151,15 +149,15 @@ void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jclass /*jcls*/,
  * Method:    putDirect
  * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;II)V
  */
-void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv* env, jclass /*jcls*/,
                                               jlong jdb_handle, jobject jkey,
                                               jint jkey_off, jint jkey_len,
                                               jobject jval, jint jval_off,
                                               jint jval_len) {
-  auto *writer =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jdb_handle);
-  auto put = [&env, &writer](ROCKSDB_NAMESPACE::Slice &key,
-                             ROCKSDB_NAMESPACE::Slice &value) {
+  auto* writer =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jdb_handle);
+  auto put = [&env, &writer](ROCKSDB_NAMESPACE::Slice& key,
+                             ROCKSDB_NAMESPACE::Slice& value) {
     ROCKSDB_NAMESPACE::Status s = writer->Put(key, value);
     if (s.ok()) {
       return;
@@ -175,10 +173,10 @@ void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv *env, jclass /*jcls*/,
  * Method:    fileSize
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv * /*env*/, jclass /*jcls*/,
+jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv* /*env*/, jclass /*jcls*/,
                                               jlong jdb_handle) {
-  auto *writer =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jdb_handle);
+  auto* writer =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jdb_handle);
   return static_cast<jlong>(writer->FileSize());
 }
 
@@ -187,14 +185,14 @@ jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv * /*env*/, jclass /*jcls*/,
  * Method:    merge
  * Signature: (JJJ)V
  */
-void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv* env, jclass /*jcls*/,
                                                jlong jhandle, jlong jkey_handle,
                                                jlong jvalue_handle) {
-  auto *key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jkey_handle);
-  auto *value_slice =
-      reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jvalue_handle);
+  auto* key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jkey_handle);
+  auto* value_slice =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jvalue_handle);
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Merge(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Merge(
           *key_slice, *value_slice);
   if (!s.ok()) {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
@@ -206,29 +204,29 @@ void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jclass /*jcls*/,
  * Method:    merge
  * Signature: (J[B[B)V
  */
-void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv* env, jclass /*jcls*/,
                                                    jlong jhandle,
                                                    jbyteArray jkey,
                                                    jbyteArray jval) {
-  jbyte *key = env->GetByteArrayElements(jkey, nullptr);
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
   if (key == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
-  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char *>(key),
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
                                      env->GetArrayLength(jkey));
 
-  jbyte *value = env->GetByteArrayElements(jval, nullptr);
+  jbyte* value = env->GetByteArrayElements(jval, nullptr);
   if (value == nullptr) {
     // exception thrown: OutOfMemoryError
     env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
     return;
   }
-  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char *>(value),
+  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char*>(value),
                                        env->GetArrayLength(jval));
 
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Merge(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Merge(
           key_slice, value_slice);
 
   env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
@@ -244,19 +242,19 @@ void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, jclass /*jcls*/,
  * Method:    delete
  * Signature: (JJJ)V
  */
-void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv* env, jclass /*jcls*/,
                                                  jlong jhandle,
                                                  jbyteArray jkey) {
-  jbyte *key = env->GetByteArrayElements(jkey, nullptr);
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
   if (key == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
-  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char *>(key),
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
                                      env->GetArrayLength(jkey));
 
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Delete(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Delete(
           key_slice);
 
   env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
@@ -271,12 +269,12 @@ void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jclass /*jcls*/,
  * Method:    delete
  * Signature: (JJJ)V
  */
-void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv* env, jclass /*jcls*/,
                                                jlong jhandle,
                                                jlong jkey_handle) {
-  auto *key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jkey_handle);
+  auto* key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jkey_handle);
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Delete(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Delete(
           *key_slice);
   if (!s.ok()) {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
@@ -288,10 +286,10 @@ void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jclass /*jcls*/,
  * Method:    finish
  * Signature: (J)V
  */
-void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_finish(JNIEnv* env, jclass /*jcls*/,
                                            jlong jhandle) {
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Finish();
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Finish();
   if (!s.ok()) {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
   }
@@ -302,8 +300,8 @@ void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jclass /*jcls*/,
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_SstFileWriter_disposeInternalJni(JNIEnv * /*env*/,
+void Java_org_rocksdb_SstFileWriter_disposeInternalJni(JNIEnv* /*env*/,
                                                        jclass /*jobj*/,
                                                        jlong jhandle) {
-  delete reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle);
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle);
 }
diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc
index eb5de1695e6c..064a5b1a7fac 100644
--- a/java/rocksjni/table.cc
+++ b/java/rocksjni/table.cc
@@ -23,7 +23,7 @@
  * Signature: (IIDIIBZZ)J
  */
 jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
-    JNIEnv * /*env*/, jclass /*jcls*/, jint jkey_size, jint jbloom_bits_per_key,
+    JNIEnv* /*env*/, jclass /*jcls*/, jint jkey_size, jint jbloom_bits_per_key,
     jdouble jhash_table_ratio, jint jindex_sparseness, jint jhuge_page_tlb_size,
     jbyte jencoding_type, jboolean jfull_scan_mode,
     jboolean jstore_index_in_file) {
@@ -45,10 +45,10 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
 /*
  * Class:     org_rocksdb_BlockBasedTableConfig
  * Method:    newTableFactoryHandle
- * Signature: (ZZZZBBDBZJJJJIIIJZZZJZZIIZZBJIJI)J
+ * Signature: (ZZZZBBDBZJJJIIIJZZZJZZIIZZJJBBJI)J
  */
 jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
-    JNIEnv *, jclass, jboolean jcache_index_and_filter_blocks,
+    JNIEnv*, jclass, jboolean jcache_index_and_filter_blocks,
     jboolean jcache_index_and_filter_blocks_with_high_priority,
     jboolean jpin_l0_filter_and_index_blocks_in_cache,
     jboolean jpin_top_level_index_and_filter, jbyte jindex_type_value,
@@ -63,7 +63,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
     jboolean jwhole_key_filtering, jboolean jverify_compression,
     jint jread_amp_bytes_per_bit, jint jformat_version,
     jboolean jenable_index_compression, jboolean jblock_align,
-    jbyte jindex_shortening, jlong jblock_cache_size,
+    jlong jsuper_block_alignment_size,
+    jlong jsuper_block_alignment_space_overhead_ratio, jbyte jindex_shortening,
+    jbyte jindex_search_type, jlong jblock_cache_size,
     jint jblock_cache_num_shard_bits) {
   ROCKSDB_NAMESPACE::BlockBasedTableOptions options;
   options.cache_index_and_filter_blocks =
@@ -88,8 +90,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
     options.block_cache = nullptr;
   } else {
     if (jblock_cache_handle > 0) {
-      std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *pCache =
-          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *>(
+      std::shared_ptr<ROCKSDB_NAMESPACE::Cache>* pCache =
+          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(
               jblock_cache_handle);
       options.block_cache = *pCache;
     } else if (jblock_cache_size >= 0) {
@@ -107,8 +109,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
     }
   }
   if (jpersistent_cache_handle > 0) {
-    std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache> *pCache =
-        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache> *>(
+    std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache>* pCache =
+        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache>*>(
             jpersistent_cache_handle);
     options.persistent_cache = *pCache;
   }
@@ -123,8 +125,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
       static_cast<bool>(joptimize_filters_for_memory);
   options.use_delta_encoding = static_cast<bool>(juse_delta_encoding);
   if (jfilter_policy_handle > 0) {
-    std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy> *pFilterPolicy =
-        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy> *>(
+    std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy>* pFilterPolicy =
+        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy>*>(
             jfilter_policy_handle);
     options.filter_policy = *pFilterPolicy;
   }
@@ -136,9 +138,16 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
   options.enable_index_compression =
       static_cast<bool>(jenable_index_compression);
   options.block_align = static_cast<bool>(jblock_align);
+  options.super_block_alignment_size =
+      static_cast<size_t>(jsuper_block_alignment_size);
+  options.super_block_alignment_space_overhead_ratio =
+      static_cast<size_t>(jsuper_block_alignment_space_overhead_ratio);
   options.index_shortening =
       ROCKSDB_NAMESPACE::IndexShorteningModeJni::toCppIndexShorteningMode(
           jindex_shortening);
+  options.index_block_search_type =
+      ROCKSDB_NAMESPACE::IndexSearchTypeJni::toCppIndexSearchType(
+          jindex_search_type);
 
   return GET_CPLUSPLUS_POINTER(
       ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(options));
diff --git a/java/rocksjni/table_properties_collector_factory.cc b/java/rocksjni/table_properties_collector_factory.cc
index 60e1df6e8b13..365a50d7eb5a 100644
--- a/java/rocksjni/table_properties_collector_factory.cc
+++ b/java/rocksjni/table_properties_collector_factory.cc
@@ -17,9 +17,9 @@
  * Signature: (JJD)J
  */
 jlong Java_org_rocksdb_TablePropertiesCollectorFactory_newCompactOnDeletionCollectorFactory(
-    JNIEnv *, jclass, jlong sliding_window_size, jlong deletion_trigger,
+    JNIEnv*, jclass, jlong sliding_window_size, jlong deletion_trigger,
     jdouble deletion_ratio) {
-  auto *wrapper = new TablePropertiesCollectorFactoriesJniWrapper();
+  auto* wrapper = new TablePropertiesCollectorFactoriesJniWrapper();
   wrapper->table_properties_collector_factories =
       ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory(
           sliding_window_size, deletion_trigger, deletion_ratio);
@@ -32,8 +32,8 @@ jlong Java_org_rocksdb_TablePropertiesCollectorFactory_newCompactOnDeletionColle
  * Signature: (J)J
  */
 void Java_org_rocksdb_TablePropertiesCollectorFactory_deleteCompactOnDeletionCollectorFactory(
-    JNIEnv *, jclass, jlong jhandle) {
+    JNIEnv*, jclass, jlong jhandle) {
   auto instance =
-      reinterpret_cast<TablePropertiesCollectorFactoriesJniWrapper *>(jhandle);
+      reinterpret_cast<TablePropertiesCollectorFactoriesJniWrapper*>(jhandle);
   delete instance;
 }
diff --git a/java/rocksjni/testable_event_listener.cc b/java/rocksjni/testable_event_listener.cc
index 483ade160561..febf8cbd1bb7 100644
--- a/java/rocksjni/testable_event_listener.cc
+++ b/java/rocksjni/testable_event_listener.cc
@@ -78,9 +78,9 @@ static TableProperties newTablePropertiesForTest() {
  * Signature: (J)V
  */
 void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks(
-    JNIEnv *, jclass, jlong jhandle) {
-  const auto &el =
-      *reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener> *>(
+    JNIEnv*, jclass, jlong jhandle) {
+  const auto& el =
+      *reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>*>(
           jhandle);
 
   TableProperties table_properties = newTablePropertiesForTest();
@@ -127,7 +127,7 @@ void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks(
   compaction_job_info.output_file_infos = {};
   compaction_job_info.table_properties = {
       {"tableProperties", std::shared_ptr<TableProperties>(
-                              &table_properties, [](TableProperties *) {})}};
+                              &table_properties, [](TableProperties*) {})}};
   compaction_job_info.compaction_reason = CompactionReason::kFlush;
   compaction_job_info.compression = CompressionType::kSnappyCompression;
 
diff --git a/java/rocksjni/transaction.cc b/java/rocksjni/transaction.cc
index e211ebe5d6dd..f457ef331c54 100644
--- a/java/rocksjni/transaction.cc
+++ b/java/rocksjni/transaction.cc
@@ -341,6 +341,36 @@ jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B(
                                                           statuses);
 }
 
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    multiGet
+ * Signature: (JJJ[[B)[[B
+ */
+jobjectArray Java_org_rocksdb_Transaction_multiGet__JJJ_3_3B(
+    JNIEnv* env, jclass /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jlong jcf_handle, jobjectArray jkeys) {
+  ROCKSDB_NAMESPACE::MultiGetJNIKeys keys;
+  if (!keys.fromByteArrays(env, jkeys)) {
+    return nullptr;
+  }
+
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+
+  size_t num_keys = keys.size();
+  std::vector<ROCKSDB_NAMESPACE::PinnableSlice> values(num_keys);
+  std::vector<ROCKSDB_NAMESPACE::Status> statuses(num_keys);
+
+  txn->MultiGet(
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jread_options_handle),
+      cf_handle, num_keys, keys.slices().data(), values.data(), statuses.data(),
+      /*sorted_input=*/false);
+
+  return ROCKSDB_NAMESPACE::MultiGetJNIValues::byteArrays<
+      ROCKSDB_NAMESPACE::PinnableSlice>(env, values, statuses);
+}
+
 /*
  * Class:     org_rocksdb_Transaction
  * Method:    getForUpdate
diff --git a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
index d1d1123dded4..867f5ca959bd 100644
--- a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
@@ -44,53 +44,6 @@ T setMinWriteBufferNumberToMerge(
    */
   int minWriteBufferNumberToMerge();
 
-  /**
-   * The total maximum number of write buffers to maintain in memory including
-   * copies of buffers that have already been flushed.  Unlike
-   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()},
-   * this parameter does not affect flushing.
-   * This controls the minimum amount of write history that will be available
-   * in memory for conflict checking when Transactions are used.
-   * <p>
-   * When using an OptimisticTransactionDB:
-   * If this value is too low, some transactions may fail at commit time due
-   * to not being able to determine whether there were any write conflicts.
-   * <p>
-   * When using a TransactionDB:
-   * If Transaction::SetSnapshot is used, TransactionDB will read either
-   * in-memory write buffers or SST files to do write-conflict checking.
-   * Increasing this value can reduce the number of reads to SST files
-   * done for conflict detection.
-   * <p>
-   * Setting this value to 0 will cause write buffers to be freed immediately
-   * after they are flushed.
-   * If this value is set to -1,
-   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()}
-   * will be used.
-   * <p>
-   * Default:
-   * If using a TransactionDB/OptimisticTransactionDB, the default value will
-   * be set to the value of
-   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()}
-   * if it is not explicitly set by the user. Otherwise, the default is 0.
-   *
-   * @param maxWriteBufferNumberToMaintain The maximum number of write
-   *     buffers to maintain
-   *
-   * @return the reference to the current options.
-   */
-  T setMaxWriteBufferNumberToMaintain(
-      int maxWriteBufferNumberToMaintain);
-
-  /**
-   * The total maximum number of write buffers to maintain in memory including
-   * copies of buffers that have already been flushed.
-   *
-   * @return maxWriteBufferNumberToMaintain The maximum number of write buffers
-   *     to maintain
-   */
-  int maxWriteBufferNumberToMaintain();
-
   /**
    * Allows thread-safe inplace updates.
    * If inplace_callback function is not set,
diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
index c8159db2ddca..555f54f3b748 100644
--- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
+++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
@@ -37,10 +37,13 @@ public BlockBasedTableConfig() {
     wholeKeyFiltering = true;
     verifyCompression = false;
     readAmpBytesPerBit = 0;
-    formatVersion = 6;
+    formatVersion = 7;
     enableIndexCompression = true;
     blockAlign = false;
+    superBlockAlignmentSize = 0;
+    superBlockAlignmentSpaceOverheadRatio = 128;
     indexShortening = IndexShorteningMode.kShortenSeparators;
+    indexSearchType = IndexSearchType.kBinary;
 
     // NOTE: ONLY used if blockCache == null
     blockCacheSize = 8 * 1024 * 1024;
@@ -60,9 +63,10 @@ private BlockBasedTableConfig(final boolean cacheIndexAndFilterBlocks,
       final boolean partitionFilters, final boolean optimizeFiltersForMemory,
       final boolean useDeltaEncoding, final boolean wholeKeyFiltering,
       final boolean verifyCompression, final int readAmpBytesPerBit, final int formatVersion,
-      final boolean enableIndexCompression, final boolean blockAlign, final byte indexShortening,
-      final byte filterPolicyType, final long filterPolicyHandle,
-      final double filterPolicyConfigValue) {
+      final boolean enableIndexCompression, final boolean blockAlign,
+      final long superBlockAlignmentSize, final long superBlockAlignmentSpaceOverheadRatio,
+      final byte indexShortening, final byte indexSearchType, final byte filterPolicyType,
+      final long filterPolicyHandle, final double filterPolicyConfigValue) {
     this.cacheIndexAndFilterBlocks = cacheIndexAndFilterBlocks;
     this.cacheIndexAndFilterBlocksWithHighPriority = cacheIndexAndFilterBlocksWithHighPriority;
     this.pinL0FilterAndIndexBlocksInCache = pinL0FilterAndIndexBlocksInCache;
@@ -86,7 +90,10 @@ private BlockBasedTableConfig(final boolean cacheIndexAndFilterBlocks,
     this.formatVersion = formatVersion;
     this.enableIndexCompression = enableIndexCompression;
     this.blockAlign = blockAlign;
+    this.superBlockAlignmentSize = superBlockAlignmentSize;
+    this.superBlockAlignmentSpaceOverheadRatio = superBlockAlignmentSpaceOverheadRatio;
     this.indexShortening = IndexShorteningMode.values()[indexShortening];
+    this.indexSearchType = IndexSearchType.values()[indexSearchType];
     try (Filter filterPolicy = FilterPolicyType.values()[filterPolicyType].createFilter(
              filterPolicyHandle, filterPolicyConfigValue)) {
       if (filterPolicy != null) {
@@ -799,6 +806,50 @@ public BlockBasedTableConfig setBlockAlign(final boolean blockAlign) {
     return this;
   }
 
+  /**
+   * Get the super block alignment size.
+   *
+   * @return the super block alignment size.
+   */
+  public long superBlockAlignmentSize() {
+    return superBlockAlignmentSize;
+  }
+
+  /**
+   * Set the super block alignment size.
+   * When set to 0, super block alignment is disabled.
+   *
+   * @param superBlockAlignmentSize the super block alignment size.
+   *
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setSuperBlockAlignmentSize(final long superBlockAlignmentSize) {
+    this.superBlockAlignmentSize = superBlockAlignmentSize;
+    return this;
+  }
+
+  /**
+   * Get the space overhead ratio of super block alignment.
+   *
+   * @return space overhead ratio of super block alignment.
+   */
+  public long superBlockAlignmentSpaceOverheadRatio() {
+    return superBlockAlignmentSpaceOverheadRatio;
+  }
+
+  /**
+   * Set the space overhead ratio of super block alignment.
+   *
+   * @param superBlockAlignmentSpaceOverheadRatio the space overhead ratio of super block alignment.
+   *
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setSuperBlockAlignmentSpaceOverheadRatio(
+      final long superBlockAlignmentSpaceOverheadRatio) {
+    this.superBlockAlignmentSpaceOverheadRatio = superBlockAlignmentSpaceOverheadRatio;
+    return this;
+  }
+
   /**
    * Get the index shortening mode.
    *
@@ -822,6 +873,26 @@ public BlockBasedTableConfig setIndexShortening(final IndexShorteningMode indexS
     return this;
   }
 
+  /**
+   * Get the index search type.
+   *
+   * @return the currently set index search type
+   */
+  public IndexSearchType indexSearchType() {
+    return indexSearchType;
+  }
+
+  /**
+   * Sets the index search type to used with this table.
+   *
+   * @param indexSearchType {@link org.rocksdb.IndexSearchType} value
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setIndexSearchType(final IndexSearchType indexSearchType) {
+    this.indexSearchType = indexSearchType;
+    return this;
+  }
+
   /**
    * Get the size of the cache in bytes that will be used by RocksDB.
    *
@@ -946,7 +1017,8 @@ public BlockBasedTableConfig setHashIndexAllowCollision(
         indexBlockRestartInterval, metadataBlockSize, partitionFilters, optimizeFiltersForMemory,
         useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, verifyCompression,
         readAmpBytesPerBit, formatVersion, enableIndexCompression, blockAlign,
-        indexShortening.getValue(), blockCacheSize, blockCacheNumShardBits);
+        superBlockAlignmentSize, superBlockAlignmentSpaceOverheadRatio, indexShortening.getValue(),
+        indexSearchType.getValue(), blockCacheSize, blockCacheNumShardBits);
   }
 
   private static native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlocks,
@@ -961,7 +1033,9 @@ private static native long newTableFactoryHandle(final boolean cacheIndexAndFilt
       final boolean useDeltaEncoding, final long filterPolicyHandle,
       final boolean wholeKeyFiltering, final boolean verifyCompression,
       final int readAmpBytesPerBit, final int formatVersion, final boolean enableIndexCompression,
-      final boolean blockAlign, final byte indexShortening,
+      final boolean blockAlign, final long superBlockAlignmentSize,
+      final long superBlockAlignmentSpaceOverheadRatio, final byte indexShortening,
+      final byte indexSearchType,
 
       @Deprecated final long blockCacheSize, @Deprecated final int blockCacheNumShardBits);
 
@@ -992,7 +1066,10 @@ private static native long newTableFactoryHandle(final boolean cacheIndexAndFilt
   private int formatVersion;
   private boolean enableIndexCompression;
   private boolean blockAlign;
+  private long superBlockAlignmentSize;
+  private long superBlockAlignmentSpaceOverheadRatio;
   private IndexShorteningMode indexShortening;
+  private IndexSearchType indexSearchType;
 
   // NOTE: ONLY used if blockCache == null
   @Deprecated private long blockCacheSize;
diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java
index 3af4d2a8ed6f..d25f8c73bc7b 100644
--- a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java
+++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java
@@ -835,19 +835,6 @@ public boolean paranoidFileChecks() {
     return paranoidFileChecks(nativeHandle_);
   }
 
-  @Override
-  public ColumnFamilyOptions setMaxWriteBufferNumberToMaintain(
-      final int maxWriteBufferNumberToMaintain) {
-    setMaxWriteBufferNumberToMaintain(
-        nativeHandle_, maxWriteBufferNumberToMaintain);
-    return this;
-  }
-
-  @Override
-  public int maxWriteBufferNumberToMaintain() {
-    return maxWriteBufferNumberToMaintain(nativeHandle_);
-  }
-
   @Override
   public ColumnFamilyOptions setCompactionPriority(
       final CompactionPriority compactionPriority) {
@@ -1467,9 +1454,6 @@ private static native void setMaxBytesForLevelMultiplierAdditional(
   private static native int[] maxBytesForLevelMultiplierAdditional(long handle);
   private static native void setParanoidFileChecks(long handle, boolean paranoidFileChecks);
   private static native boolean paranoidFileChecks(long handle);
-  private static native void setMaxWriteBufferNumberToMaintain(
-      final long handle, final int maxWriteBufferNumberToMaintain);
-  private static native int maxWriteBufferNumberToMaintain(final long handle);
   private static native void setCompactionPriority(
       final long handle, final byte compactionPriority);
   private static native byte compactionPriority(final long handle);
diff --git a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java
index 24ebe0da2ff1..3d94e7eb0215 100644
--- a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java
+++ b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java
@@ -75,6 +75,51 @@ public boolean allowCompaction() {
     return allowCompaction(nativeHandle_);
   }
 
+  /**
+   * Combined SST + blob file size limit for FIFO compaction trimming.
+   * When non-zero, FIFO uses total_sst + total_blob for size-based dropping.
+   * When zero (default), uses max_table_files_size (SST-only).
+   *
+   * @param maxDataFilesSize the combined size limit in bytes
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsFIFO setMaxDataFilesSize(final long maxDataFilesSize) {
+    setMaxDataFilesSize(nativeHandle_, maxDataFilesSize);
+    return this;
+  }
+
+  /**
+   * Get the combined SST + blob file size limit.
+   *
+   * @return max data files size in bytes, 0 means disabled
+   */
+  public long maxDataFilesSize() {
+    return maxDataFilesSize(nativeHandle_);
+  }
+
+  /**
+   * Enable capacity-derived intra-L0 compaction using the observed key/value
+   * size ratio. Requires maxDataFilesSize &gt; 0.
+   *
+   * @param useKvRatioCompaction true to enable
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsFIFO setUseKvRatioCompaction(final boolean useKvRatioCompaction) {
+    setUseKvRatioCompaction(nativeHandle_, useKvRatioCompaction);
+    return this;
+  }
+
+  /**
+   * Check if capacity-derived intra-L0 compaction is enabled.
+   *
+   * @return true if enabled
+   */
+  public boolean useKvRatioCompaction() {
+    return useKvRatioCompaction(nativeHandle_);
+  }
+
   private static native long newCompactionOptionsFIFO();
   @Override
   protected final void disposeInternal(final long handle) {
@@ -86,4 +131,9 @@ protected final void disposeInternal(final long handle) {
   private static native long maxTableFilesSize(final long handle);
   private static native void setAllowCompaction(final long handle, final boolean allowCompaction);
   private static native boolean allowCompaction(final long handle);
+  private static native void setMaxDataFilesSize(final long handle, final long maxDataFilesSize);
+  private static native long maxDataFilesSize(final long handle);
+  private static native void setUseKvRatioCompaction(
+      final long handle, final boolean useKvRatioCompaction);
+  private static native boolean useKvRatioCompaction(final long handle);
 }
diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java
index 0221a63fba07..12f5d4913c2f 100644
--- a/java/src/main/java/org/rocksdb/DBOptions.java
+++ b/java/src/main/java/org/rocksdb/DBOptions.java
@@ -962,19 +962,6 @@ public boolean skipStatsUpdateOnDbOpen() {
     return skipStatsUpdateOnDbOpen(nativeHandle_);
   }
 
-  @Override
-  public DBOptions setSkipCheckingSstFileSizesOnDbOpen(
-      final boolean skipCheckingSstFileSizesOnDbOpen) {
-    setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen);
-    return this;
-  }
-
-  @Override
-  public boolean skipCheckingSstFileSizesOnDbOpen() {
-    assert (isOwningHandle());
-    return skipCheckingSstFileSizesOnDbOpen(nativeHandle_);
-  }
-
   @Override
   public DBOptions setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) {
     assert(isOwningHandle());
@@ -1389,9 +1376,6 @@ private static native void setWriteThreadSlowYieldUsec(
   private static native void setSkipStatsUpdateOnDbOpen(
       final long handle, final boolean skipStatsUpdateOnDbOpen);
   private static native boolean skipStatsUpdateOnDbOpen(final long handle);
-  private static native void setSkipCheckingSstFileSizesOnDbOpen(
-      final long handle, final boolean skipChecking);
-  private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle);
   private static native void setWalRecoveryMode(final long handle, final byte walRecoveryMode);
   private static native byte walRecoveryMode(final long handle);
   private static native void setAllow2pc(final long handle, final boolean allow2pc);
diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java
index bc9d9acbd65e..f40fc1a25cfe 100644
--- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java
@@ -1214,36 +1214,6 @@ T setEnableWriteThreadAdaptiveYield(
    */
   boolean skipStatsUpdateOnDbOpen();
 
-  /**
-   * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files.
-   * This may significantly speed up startup if there are many sst files,
-   * especially when using non-default Env with expensive GetFileSize().
-   * We'll still check that all required sst files exist.
-   * If {@code paranoid_checks} is false, this option is ignored, and sst files are
-   * not checked at all.
-   *
-   * Default: false
-   *
-   * @param skipCheckingSstFileSizesOnDbOpen if true, then SST file sizes will not be checked
-   *                                         when calling {@link RocksDB#open(String)}.
-   * @return the reference to the current options.
-   */
-  T setSkipCheckingSstFileSizesOnDbOpen(final boolean skipCheckingSstFileSizesOnDbOpen);
-
-  /**
-   * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files.
-   * This may significantly speed up startup if there are many sst files,
-   * especially when using non-default Env with expensive GetFileSize().
-   * We'll still check that all required sst files exist.
-   * If {@code paranoid_checks} is false, this option is ignored, and sst files are
-   * not checked at all.
-   *
-   * Default: false
-   *
-   * @return true, if file sizes will not be checked when calling {@link RocksDB#open(String)}.
-   */
-  boolean skipCheckingSstFileSizesOnDbOpen();
-
   /**
    * Recovery mode to control the consistency while replaying WAL
    *
diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java
index 10d382e7b912..b4a56cc07e0d 100644
--- a/java/src/main/java/org/rocksdb/HistogramType.java
+++ b/java/src/main/java/org/rocksdb/HistogramType.java
@@ -210,7 +210,23 @@ public enum HistogramType {
    */
   TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x3D),
 
-  // 0x3E for backwards compatibility on current minor version.
+  COMPACTION_PREFETCH_BYTES((byte) 0x3F),
+
+  /**
+   * MultiScan histogram statistics
+   */
+
+  /**
+   * Time spent in Iterator::Prepare() for multi-scan (microseconds)
+   */
+  MULTISCAN_PREPARE_MICROS((byte) 0x40),
+
+  /**
+   * Number of blocks per multi-scan Prepare() call
+   */
+  MULTISCAN_BLOCKS_PER_PREPARE((byte) 0x41),
+
+  // 0x3E is reserved for backwards compatibility on current minor version.
   HISTOGRAM_ENUM_MAX((byte) 0x3E);
 
   private final byte value;
diff --git a/java/src/main/java/org/rocksdb/IndexSearchType.java b/java/src/main/java/org/rocksdb/IndexSearchType.java
new file mode 100644
index 000000000000..55ec0eef3820
--- /dev/null
+++ b/java/src/main/java/org/rocksdb/IndexSearchType.java
@@ -0,0 +1,34 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * BlockSearchType used in conjunction with BlockBasedTable.
+ */
+public enum IndexSearchType {
+  /**
+   * Standard binary search
+   */
+  kBinary((byte) 0x0),
+
+  /**
+   * Interpolation search, which may be better suited for uniformly
+   * distributed keys. Only applicable if the comparator is the
+   * byte-wise comparator.
+   */
+  kInterpolation((byte) 0x1);
+
+  private final byte value;
+
+  IndexSearchType(final byte value) {
+    this.value = value;
+  }
+
+  byte getValue() {
+    return value;
+  }
+}
diff --git a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
index 6fe97994d201..aa841c6f3688 100644
--- a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
+++ b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
@@ -30,6 +30,14 @@ public class NativeLibraryLoader {
   private static final String tempFilePrefix = "librocksdbjni";
   private static final String tempFileSuffix = Environment.getJniLibraryExtension();
 
+  /**
+   * If you set the System Property ROCKS_JAVA_DEBUG_NLL can be to true
+   * messages about attempts to load the native library will be printed
+   * to std out.
+   */
+  private static boolean DEBUG_LOADING =
+      "true".equals(System.getProperty("ROCKS_JAVA_DEBUG_NLL", "false"));
+
   /**
    * Get a reference to the NativeLibraryLoader
    *
@@ -55,7 +63,7 @@ public static NativeLibraryLoader getInstance() {
    *
    * @throws java.io.IOException if a filesystem operation fails.
    */
-  @SuppressWarnings("PMD.EmptyCatchBlock")
+  @SuppressWarnings({"PMD.EmptyCatchBlock", "PMD.SystemPrintln"})
   public synchronized void loadLibrary(final String tmpDir) throws IOException {
     try {
       // try dynamic library
@@ -63,6 +71,9 @@ public synchronized void loadLibrary(final String tmpDir) throws IOException {
       return;
     } catch (final UnsatisfiedLinkError ule) {
       // ignore - try from static library
+      if (DEBUG_LOADING) {
+        System.out.println("Unable to load shared dynamic library: " + sharedLibraryName);
+      }
     }
 
     try {
@@ -71,6 +82,9 @@ public synchronized void loadLibrary(final String tmpDir) throws IOException {
       return;
     } catch (final UnsatisfiedLinkError ule) {
       // ignore - then try static library fallback or from jar
+      if (DEBUG_LOADING) {
+        System.out.println("Unable to load shared static library: " + jniLibraryName);
+      }
     }
 
     if (fallbackJniLibraryName != null) {
@@ -80,6 +94,10 @@ public synchronized void loadLibrary(final String tmpDir) throws IOException {
         return;
       } catch (final UnsatisfiedLinkError ule) {
         // ignore - then try from jar
+        if (DEBUG_LOADING) {
+          System.out.println(
+              "Unable to load shared static fallback library: " + fallbackJniLibraryName);
+        }
       }
     }
 
@@ -137,18 +155,23 @@ private File createTemp(final String tmpDir, final String libraryFileName) throw
     }
   }
 
-  @SuppressWarnings({"PMD.UseProperClassLoader", "PMD.UseTryWithResources"})
+  @SuppressWarnings({"PMD.UseProperClassLoader", "PMD.UseTryWithResources", "PMD.SystemPrintln"})
   File loadLibraryFromJarToTemp(final String tmpDir) throws IOException {
     try (InputStream is = getClass().getClassLoader().getResourceAsStream(jniLibraryFileName)) {
       if (is != null) {
         final File temp = createTemp(tmpDir, jniLibraryFileName);
         Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING);
         return temp;
+      } else {
+        if (DEBUG_LOADING) {
+          System.out.println("Unable to find: " + jniLibraryFileName + " on the classpath");
+        }
       }
     }
 
     if (fallbackJniLibraryFileName == null) {
-      throw new RuntimeException(fallbackJniLibraryFileName + " was not found inside JAR.");
+      throw new RuntimeException(
+          jniLibraryFileName + " was not found inside JAR, and there is no fallback.");
     }
 
     try (InputStream is =
@@ -157,10 +180,16 @@ File loadLibraryFromJarToTemp(final String tmpDir) throws IOException {
         final File temp = createTemp(tmpDir, fallbackJniLibraryFileName);
         Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING);
         return temp;
+      } else {
+        if (DEBUG_LOADING) {
+          System.out.println(
+              "Unable to find fallback: " + fallbackJniLibraryFileName + " on the classpath");
+        }
       }
     }
 
-    throw new RuntimeException(jniLibraryFileName + " was not found inside JAR.");
+    throw new RuntimeException("Neither " + jniLibraryFileName + " or " + fallbackJniLibraryFileName
+        + " were found inside the JAR, and there is no fallback.");
   }
 
   /**
diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java
index c184e140f602..3e7bf28405e8 100644
--- a/java/src/main/java/org/rocksdb/Options.java
+++ b/java/src/main/java/org/rocksdb/Options.java
@@ -1045,19 +1045,6 @@ public boolean skipStatsUpdateOnDbOpen() {
     return skipStatsUpdateOnDbOpen(nativeHandle_);
   }
 
-  @Override
-  public Options setSkipCheckingSstFileSizesOnDbOpen(
-      final boolean skipCheckingSstFileSizesOnDbOpen) {
-    setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen);
-    return this;
-  }
-
-  @Override
-  public boolean skipCheckingSstFileSizesOnDbOpen() {
-    assert (isOwningHandle());
-    return skipCheckingSstFileSizesOnDbOpen(nativeHandle_);
-  }
-
   @Override
   public Options setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) {
     assert(isOwningHandle());
@@ -1762,19 +1749,6 @@ public boolean paranoidFileChecks() {
     return paranoidFileChecks(nativeHandle_);
   }
 
-  @Override
-  public Options setMaxWriteBufferNumberToMaintain(
-      final int maxWriteBufferNumberToMaintain) {
-    setMaxWriteBufferNumberToMaintain(
-        nativeHandle_, maxWriteBufferNumberToMaintain);
-    return this;
-  }
-
-  @Override
-  public int maxWriteBufferNumberToMaintain() {
-    return maxWriteBufferNumberToMaintain(nativeHandle_);
-  }
-
   @Override
   public Options setCompactionPriority(
       final CompactionPriority compactionPriority) {
@@ -2296,9 +2270,6 @@ private static native void setWriteThreadSlowYieldUsec(
   private static native void setSkipStatsUpdateOnDbOpen(
       final long handle, final boolean skipStatsUpdateOnDbOpen);
   private static native boolean skipStatsUpdateOnDbOpen(final long handle);
-  private static native void setSkipCheckingSstFileSizesOnDbOpen(
-      final long handle, final boolean skipChecking);
-  private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle);
   private static native void setWalRecoveryMode(final long handle, final byte walRecoveryMode);
   private static native byte walRecoveryMode(final long handle);
   private static native void setAllow2pc(final long handle, final boolean allow2pc);
@@ -2443,9 +2414,6 @@ private static native void setMaxBytesForLevelMultiplierAdditional(
   private static native int[] maxBytesForLevelMultiplierAdditional(long handle);
   private static native void setParanoidFileChecks(long handle, boolean paranoidFileChecks);
   private static native boolean paranoidFileChecks(long handle);
-  private static native void setMaxWriteBufferNumberToMaintain(
-      final long handle, final int maxWriteBufferNumberToMaintain);
-  private static native int maxWriteBufferNumberToMaintain(final long handle);
   private static native void setCompactionPriority(
       final long handle, final byte compactionPriority);
   private static native byte compactionPriority(final long handle);
diff --git a/java/src/main/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java
index 5ce4a8656d3e..4be053376c61 100644
--- a/java/src/main/java/org/rocksdb/ReadOptions.java
+++ b/java/src/main/java/org/rocksdb/ReadOptions.java
@@ -186,37 +186,6 @@ public ReadOptions setTailing(final boolean tailing) {
     return this;
   }
 
-  /**
-   * Returns whether managed iterators will be used.
-   *
-   * @return the setting of whether managed iterators will be used,
-   *     by default false
-   *
-   * @deprecated This options is not used anymore.
-   */
-  @Deprecated
-  public boolean managed() {
-    assert(isOwningHandle());
-    return managed(nativeHandle_);
-  }
-
-  /**
-   * Specify to create a managed iterator -- a special iterator that
-   * uses less resources by having the ability to free its underlying
-   * resources on request.
-   *
-   * @param managed if true, then managed iterators will be enabled.
-   * @return the reference to the current ReadOptions.
-   *
-   * @deprecated This options is not used anymore.
-   */
-  @Deprecated
-  public ReadOptions setManaged(final boolean managed) {
-    assert(isOwningHandle());
-    setManaged(nativeHandle_, managed);
-    return this;
-  }
-
   /**
    * Returns whether a total seek order will be used
    *
@@ -398,7 +367,10 @@ public ReadOptions setMaxSkippableInternalKeys(
    * Default: false
    *
    * @return true if keys deleted using the DeleteRange() API will be visible
+   *
+   * @deprecated This option may be remove in a future release.
    */
+  @Deprecated
   public boolean ignoreRangeDeletions() {
     assert(isOwningHandle());
     return ignoreRangeDeletions(nativeHandle_);
@@ -414,7 +386,10 @@ public boolean ignoreRangeDeletions() {
    * @param ignoreRangeDeletions true if keys deleted using the DeleteRange()
    *     API should be visible
    * @return the reference to the current ReadOptions.
+   *
+   * @deprecated This option may be remove in a future release.
    */
+  @Deprecated
   public ReadOptions setIgnoreRangeDeletions(final boolean ignoreRangeDeletions) {
     assert(isOwningHandle());
     setIgnoreRangeDeletions(nativeHandle_, ignoreRangeDeletions);
@@ -813,8 +788,6 @@ protected final void disposeInternal(final long handle) {
   private static native void setReadTier(long handle, byte readTierValue);
   private static native boolean tailing(long handle);
   private static native void setTailing(long handle, boolean tailing);
-  private static native boolean managed(long handle);
-  private static native void setManaged(long handle, boolean managed);
   private static native boolean totalOrderSeek(long handle);
   private static native void setTotalOrderSeek(long handle, boolean totalOrderSeek);
   private static native boolean prefixSameAsStart(long handle);
diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java
index 1ffb44b6a1b2..ebe134726982 100644
--- a/java/src/main/java/org/rocksdb/RocksDB.java
+++ b/java/src/main/java/org/rocksdb/RocksDB.java
@@ -84,13 +84,7 @@ public static void loadLibrary() {
       return;
     }
 
-    while (libraryLoaded.get() == LibraryState.LOADING) {
-      try {
-        Thread.sleep(10);
-      } catch(final InterruptedException e) {
-        //ignore
-      }
-    }
+    waitForLibraryToBeLoaded();
   }
 
   /**
@@ -146,12 +140,28 @@ public static void loadLibrary(final List<String> paths) {
       return;
     }
 
-    while (libraryLoaded.get() == LibraryState.LOADING) {
-      try {
-        Thread.sleep(10);
-      } catch(final InterruptedException e) {
-        //ignore
+    waitForLibraryToBeLoaded();
+  }
+
+  private static void waitForLibraryToBeLoaded() {
+    final long wait = 10; // Time to wait before re-checking if another thread loaded the library
+    final long timeout =
+        10 * 1000; // Maximum time to wait for another thread to load the library (10 seconds)
+    long waited = 0;
+    try {
+      while (libraryLoaded.get() == LibraryState.LOADING) {
+        Thread.sleep(wait);
+        waited += wait;
+
+        if (waited >= timeout) {
+          throw new RuntimeException(
+              "Exceeded timeout whilst trying to load the RocksDB shared library");
+        }
       }
+    } catch (final InterruptedException e) {
+      // restore interrupted status
+      Thread.currentThread().interrupt();
+      throw new RuntimeException("Interrupted whilst trying to load the RocksDB shared library", e);
     }
   }
 
@@ -4074,6 +4084,23 @@ public void continueBackgroundWork() throws RocksDBException {
     continueBackgroundWork(nativeHandle_);
   }
 
+  /**
+   * Abort all running and pending compaction jobs. This method will signal
+   * all active compactions to terminate and wait for them to complete.
+   * No new compactions will be scheduled until {@link #resumeAllCompactions()} is called.
+   */
+  public void abortAllCompactions() {
+    abortAllCompactions(nativeHandle_);
+  }
+
+  /**
+   * Resume compaction scheduling after {@link #abortAllCompactions()} was called.
+   * Must be called the same number of times as {@link #abortAllCompactions()}.
+   */
+  public void resumeAllCompactions() {
+    resumeAllCompactions(nativeHandle_);
+  }
+
   /**
    * Enable automatic compactions for the given column
    * families if they were previously disabled.
@@ -4126,6 +4153,7 @@ public int numberLevels(/* @Nullable */final ColumnFamilyHandle columnFamilyHand
    *
    * @return the maximum level
    */
+  @Deprecated
   public int maxMemCompactionLevel() {
     return maxMemCompactionLevel(null);
   }
@@ -4633,10 +4661,13 @@ public Range suggestCompactRange()
    * @param targetLevel the target level for L0
    *
    * @throws RocksDBException if an error occurs whilst promoting L0
+   *
+   * @deprecated this API may be removed in a future release.
    */
+  @Deprecated
   public void promoteL0(
-      /* @Nullable */final ColumnFamilyHandle columnFamilyHandle,
-      final int targetLevel) throws RocksDBException {
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, final int targetLevel)
+      throws RocksDBException {
     promoteL0(nativeHandle_,
         columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
         targetLevel);
@@ -4648,9 +4679,11 @@ public void promoteL0(
    * @param targetLevel the target level for L0
    *
    * @throws RocksDBException if an error occurs whilst promoting L0
+   *
+   * @deprecated this API may be removed in a future release.
    */
-  public void promoteL0(final int targetLevel)
-      throws RocksDBException {
+  @Deprecated
+  public void promoteL0(final int targetLevel) throws RocksDBException {
     promoteL0(null, targetLevel);
   }
 
@@ -5020,6 +5053,8 @@ private static native String[] compactFiles(final long handle, final long compac
   private static native void cancelAllBackgroundWork(final long handle, final boolean wait);
   private static native void pauseBackgroundWork(final long handle) throws RocksDBException;
   private static native void continueBackgroundWork(final long handle) throws RocksDBException;
+  private static native void abortAllCompactions(final long handle);
+  private static native void resumeAllCompactions(final long handle);
   private static native void enableAutoCompaction(
       final long handle, final long[] columnFamilyHandles) throws RocksDBException;
   private static native int numberLevels(final long handle, final long columnFamilyHandle);
diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java
index 3b488660e851..41e6b7239425 100644
--- a/java/src/main/java/org/rocksdb/TickerType.java
+++ b/java/src/main/java/org/rocksdb/TickerType.java
@@ -550,14 +550,14 @@ public enum TickerType {
     BLOB_DB_BYTES_READ((byte) -0x2),
 
     /**
-     * # of keys written by BlobDB as non-TTL inlined value.
+     * Deprecated and unused. Retained to avoid shifting enum values.
      */
-    BLOB_DB_WRITE_INLINED((byte) -0x3),
+    @Deprecated BLOB_DB_WRITE_INLINED((byte) -0x3),
 
     /**
-     * # of keys written by BlobDB as TTL inlined value.
+     * Deprecated and unused. Retained to avoid shifting enum values.
      */
-    BLOB_DB_WRITE_INLINED_TTL((byte) -0x4),
+    @Deprecated BLOB_DB_WRITE_INLINED_TTL((byte) -0x4),
 
     /**
      * # of keys written by BlobDB as non-TTL blob value.
@@ -764,10 +764,14 @@ public enum TickerType {
      */
     HOT_FILE_READ_BYTES((byte) -0x31),
     WARM_FILE_READ_BYTES((byte) -0x32),
+    COOL_FILE_READ_BYTES((byte) -0x5B),
     COLD_FILE_READ_BYTES((byte) -0x33),
+    ICE_FILE_READ_BYTES((byte) -0x59),
     HOT_FILE_READ_COUNT((byte) -0x34),
     WARM_FILE_READ_COUNT((byte) -0x35),
+    COOL_FILE_READ_COUNT((byte) -0x5C),
     COLD_FILE_READ_COUNT((byte) -0x36),
+    ICE_FILE_READ_COUNT((byte) -0x5A),
 
     /**
      * (non-)last level read statistics
@@ -870,6 +874,8 @@ public enum TickerType {
 
     FIFO_TTL_COMPACTIONS((byte) -0x50),
 
+    FIFO_CHANGE_TEMPERATURE_COMPACTIONS((byte) -0x58),
+
     PREFETCH_BYTES((byte) -0x51),
 
     PREFETCH_BYTES_USEFUL((byte) -0x52),
@@ -882,6 +888,73 @@ public enum TickerType {
 
     FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT((byte) -0x57),
 
+    /**
+     * Counter for the number of times a WBWI is ingested into the DB. This
+     * happens when IngestWriteBatchWithIndex() is used and when large
+     * transaction optimization is enabled through
+     * TransactionOptions::large_txn_commit_optimize_threshold.
+     */
+    NUMBER_WBWI_INGEST((byte) -0x5D),
+
+    /**
+     * Failure to load the UDI during SST table open
+     */
+    SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT((byte) -0x5E),
+
+    /**
+     * Bytes of output files successfully resumed during remote compaction
+     */
+    REMOTE_COMPACT_RESUMED_BYTES((byte) -0x5F),
+
+    /**
+     * MultiScan statistics
+     */
+
+    /**
+     * # of calls to Iterator::Prepare() for multi-scan
+     */
+    MULTISCAN_PREPARE_CALLS((byte) -0x60),
+
+    /**
+     * # of errors during Iterator::Prepare() for multi-scan
+     */
+    MULTISCAN_PREPARE_ERRORS((byte) -0x61),
+
+    /**
+     * # of data blocks prefetched during multi-scan Prepare()
+     */
+    MULTISCAN_BLOCKS_PREFETCHED((byte) -0x62),
+
+    /**
+     * # of data blocks found in cache during multi-scan Prepare()
+     */
+    MULTISCAN_BLOCKS_FROM_CACHE((byte) -0x63),
+
+    /**
+     * Total bytes prefetched during multi-scan Prepare()
+     */
+    MULTISCAN_PREFETCH_BYTES((byte) -0x64),
+
+    /**
+     * # of prefetched blocks that were never accessed (wasted)
+     */
+    MULTISCAN_PREFETCH_BLOCKS_WASTED((byte) -0x65),
+
+    /**
+     * # of I/O requests issued during multi-scan Prepare()
+     */
+    MULTISCAN_IO_REQUESTS((byte) -0x66),
+
+    /**
+     * # of non-adjacent blocks coalesced into single I/O request
+     */
+    MULTISCAN_IO_COALESCED_NONADJACENT((byte) -0x67),
+
+    /**
+     * # of seek errors during multi-scan iteration
+     */
+    MULTISCAN_SEEK_ERRORS((byte) -0x68),
+
     TICKER_ENUM_MAX((byte) -0x54);
 
     private final byte value;
diff --git a/java/src/main/java/org/rocksdb/Transaction.java b/java/src/main/java/org/rocksdb/Transaction.java
index d1ddcbcbe6c7..ee8656460835 100644
--- a/java/src/main/java/org/rocksdb/Transaction.java
+++ b/java/src/main/java/org/rocksdb/Transaction.java
@@ -203,7 +203,7 @@ public void prepare() throws RocksDBException {
    * Status::Busy() may be returned if the transaction could not guarantee
    * that there are no write conflicts. Status::TryAgain() may be returned
    * if the memtable history size is not large enough
-   *  (See max_write_buffer_number_to_maintain).
+   *  (See max_write_buffer_size_to_maintain).
    * <p>
    * If this transaction was created by a {@link TransactionDB},
    * Status::Expired() may be returned if this transaction has lived for
@@ -661,6 +661,46 @@ public List<byte[]> multiGetAsList(final ReadOptions readOptions, final List<byt
     return Arrays.asList(multiGet(nativeHandle_, readOptions.nativeHandle_, keysArray));
   }
 
+  /**
+   * This function is similar to
+   * {@link RocksDB#multiGetAsList} except it will
+   * also read pending changes in this transaction.
+   * Currently, this function will return Status::MergeInProgress if the most
+   * recent write to the queried key in this batch is a Merge.
+   * <p>
+   * If {@link ReadOptions#snapshot()} is not set, the current version of the
+   * key will be read. Calling {@link #setSnapshot()} does not affect the
+   * version of the data returned.
+   * <p>
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   * <p>
+   * This method uses the optimized path with support for batched reads.
+   *
+   * @param readOptions Read options.=
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param keys of keys for which values need to be retrieved.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public List<byte[]> multiGetAsList(final ReadOptions readOptions,
+      final ColumnFamilyHandle columnFamilyHandle, final List<byte[]> keys)
+      throws RocksDBException {
+    if (keys.isEmpty()) {
+      return new ArrayList<>(0);
+    }
+    final byte[][] keysArray = keys.toArray(new byte[keys.size()][]);
+    return Arrays.asList(multiGet(
+        nativeHandle_, readOptions.nativeHandle_, columnFamilyHandle.nativeHandle_, keysArray));
+  }
+
   /**
    * Read this key and ensure that this transaction will only
    * be able to be committed if this key is not written outside this
@@ -689,8 +729,7 @@ public List<byte[]> multiGetAsList(final ReadOptions readOptions, final List<byt
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -770,8 +809,7 @@ public byte[] getForUpdate(final ReadOptions readOptions,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -821,8 +859,7 @@ public byte[] getForUpdate(final ReadOptions readOptions, final byte[] key,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -880,8 +917,7 @@ public GetStatus getForUpdate(final ReadOptions readOptions, final byte[] key, f
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -933,8 +969,7 @@ public GetStatus getForUpdate(final ReadOptions readOptions, final ByteBuffer ke
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -988,8 +1023,7 @@ public GetStatus getForUpdate(final ReadOptions readOptions,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -1050,8 +1084,7 @@ public GetStatus getForUpdate(final ReadOptions readOptions,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -1106,8 +1139,7 @@ public GetStatus getForUpdate(final ReadOptions readOptions,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -1393,8 +1425,7 @@ public RocksIterator getIterator(final ColumnFamilyHandle columnFamilyHandle) {
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *
    * @param columnFamilyHandle The column family to put the key/value into
    * @param key the specified key to be inserted.
@@ -1430,8 +1461,7 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *
    * @param columnFamilyHandle The column family to put the key/value into
    * @param key the specified key to be inserted.
@@ -1460,8 +1490,7 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be inserted.
    * @param value the value associated with the specified key.
@@ -1536,8 +1565,7 @@ public void put(final ColumnFamilyHandle columnFamilyHandle,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be inserted.
    * @param value the value associated with the specified key.
@@ -1575,8 +1603,7 @@ public void put(final ByteBuffer key, final ByteBuffer value) throws RocksDBExce
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to put the key/value into
    * @param key the specified key to be inserted.
@@ -1645,8 +1672,7 @@ public void put(final byte[][] keyParts, final byte[][] valueParts)
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to merge the key/value into
    * @param key the specified key to be merged.
@@ -1683,8 +1709,7 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to merge the key/value into
    * @param key the specified key to be merged.
@@ -1713,8 +1738,7 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be merged.
    * @param value the value associated with the specified key.
@@ -1741,8 +1765,7 @@ public void merge(final byte[] key, final byte[] value)
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be merged.
    * @param value the value associated with the specified key.
@@ -1778,8 +1801,7 @@ public void merge(final ByteBuffer key, final ByteBuffer value) throws RocksDBEx
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle in which to apply the merge
    * @param key the specified key to be merged.
@@ -1821,8 +1843,7 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle in which to apply the merge
    * @param key the specified key to be merged.
@@ -1849,8 +1870,7 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to delete the key/value from
    * @param key the specified key to be deleted.
@@ -1885,8 +1905,7 @@ public void delete(final ColumnFamilyHandle columnFamilyHandle,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to delete the key/value from
    * @param key the specified key to be deleted.
@@ -1914,8 +1933,7 @@ public void delete(final ColumnFamilyHandle columnFamilyHandle,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be deleted.
    *
@@ -2001,8 +2019,7 @@ public void delete(final byte[][] keyParts) throws RocksDBException {
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to delete the key/value from
    * @param key the specified key to be deleted.
@@ -2038,8 +2055,7 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to delete the key/value from
    * @param key the specified key to be deleted.
@@ -2068,8 +2084,7 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be deleted.
    *
@@ -2902,6 +2917,8 @@ private static native byte[][] multiGet(final long handle, final long readOption
       final byte[][] keys, final long[] columnFamilyHandles) throws RocksDBException;
   private static native byte[][] multiGet(
       final long handle, final long readOptionsHandle, final byte[][] keys) throws RocksDBException;
+  private static native byte[][] multiGet(final long nativeHandle, final long readOptionsHandle,
+      final long cfHandle, final byte[][] keys) throws RocksDBException;
   private static native byte[] getForUpdate(final long handle, final long readOptionsHandle,
       final byte[] key, final int keyOffset, final int keyLength, final long columnFamilyHandle,
       final boolean exclusive, final boolean doValidate) throws RocksDBException;
diff --git a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
index 13247d1e6635..ef904ffe1b54 100644
--- a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
+++ b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
@@ -109,11 +109,13 @@ public void jniPortal() throws Exception {
     tableConfig.setIndexType(IndexType.kBinarySearch);
     tableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinarySearch);
     tableConfig.setChecksumType(ChecksumType.kNoChecksum);
+    tableConfig.setIndexSearchType(IndexSearchType.kBinary);
     try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
       final String opts = getOptionAsString(options);
       assertThat(opts).contains("index_type=kBinarySearch");
       assertThat(opts).contains("data_block_index_type=kDataBlockBinarySearch");
       assertThat(opts).contains("checksum=kNoChecksum");
+      assertThat(opts).contains("index_block_search_type=kBinary");
     }
 
     tableConfig.setIndexType(IndexType.kHashSearch);
@@ -377,6 +379,20 @@ public void blockAlign() {
         isTrue();
   }
 
+  @Test
+  public void superBlockAlignmentSize() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setSuperBlockAlignmentSize(1024 * 1024);
+    assertThat(blockBasedTableConfig.superBlockAlignmentSize()).isEqualTo(1024 * 1024);
+  }
+
+  @Test
+  public void superBlockAlignmentSpaceOverheadRatio() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setSuperBlockAlignmentSpaceOverheadRatio(4096);
+    assertThat(blockBasedTableConfig.superBlockAlignmentSpaceOverheadRatio()).isEqualTo(4096);
+  }
+
   @Test
   public void indexShortening() {
     final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
@@ -385,6 +401,16 @@ public void indexShortening() {
         .isEqualTo(IndexShorteningMode.kShortenSeparatorsAndSuccessor);
   }
 
+  @Test
+  public void indexSearchType() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    assertThat(IndexSearchType.values().length).isEqualTo(2);
+    blockBasedTableConfig.setIndexSearchType(IndexSearchType.kInterpolation);
+    assertThat(blockBasedTableConfig.indexSearchType()).isEqualTo(IndexSearchType.kInterpolation);
+    blockBasedTableConfig.setIndexSearchType(IndexSearchType.kBinary);
+    assertThat(blockBasedTableConfig.indexSearchType()).isEqualTo(IndexSearchType.kBinary);
+  }
+
   @Deprecated
   @Test
   public void hashIndexAllowCollision() {
diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java
index 35a04a697f84..c345e80c030f 100644
--- a/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java
+++ b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java
@@ -561,19 +561,6 @@ public void maxTableFilesSizeFIFO() {
     }
   }
 
-  @Test
-  public void maxWriteBufferNumberToMaintain() {
-    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
-      int intValue = rand.nextInt();
-      // Size has to be positive
-      intValue = (intValue < 0) ? -intValue : intValue;
-      intValue = (intValue == 0) ? intValue + 1 : intValue;
-      opt.setMaxWriteBufferNumberToMaintain(intValue);
-      assertThat(opt.maxWriteBufferNumberToMaintain()).
-          isEqualTo(intValue);
-    }
-  }
-
   @Test
   public void compactionPriorities() {
     try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java
index a71345f744a3..0dc1d0cb0a8c 100644
--- a/java/src/test/java/org/rocksdb/DBOptionsTest.java
+++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java
@@ -656,15 +656,6 @@ public String name() {
     }
   }
 
-  @Test
-  public void failIfOptionsFileError() {
-    try (final DBOptions opt = new DBOptions()) {
-      final boolean boolValue = rand.nextBoolean();
-      opt.setFailIfOptionsFileError(boolValue);
-      assertThat(opt.failIfOptionsFileError()).isEqualTo(boolValue);
-    }
-  }
-
   @Test
   public void dumpMallocStats() {
     try (final DBOptions opt = new DBOptions()) {
@@ -839,15 +830,6 @@ public void maxWriteBatchGroupSizeBytes() {
     }
   }
 
-  @Test
-  public void skipCheckingSstFileSizesOnDbOpen() {
-    try (final DBOptions options = new DBOptions()) {
-      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(false);
-      assertThat(options.setSkipCheckingSstFileSizesOnDbOpen(true)).isEqualTo(options);
-      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true);
-    }
-  }
-
   @Test
   public void eventListeners() {
     final AtomicBoolean wasCalled1 = new AtomicBoolean();
diff --git a/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java b/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java
index d858a150dfc9..58e3f4be21fd 100644
--- a/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java
+++ b/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java
@@ -96,30 +96,54 @@ public void mutableColumnFamilyOptions_parse() {
   public void mutableColumnFamilyOptions_parse_getOptions_output() {
     final String optionsString =
         "bottommost_compression=kDisableCompressionOption;  sample_for_compression=0;  "
-        + "blob_garbage_collection_age_cutoff=0.250000;  blob_garbage_collection_force_threshold=0.800000;"
-        + "arena_block_size=1048576;  enable_blob_garbage_collection=false;  level0_stop_writes_trigger=36;  min_blob_size=65536;"
-        + "blob_compaction_readahead_size=262144;  blob_file_starting_level=5;  prepopulate_blob_cache=kDisable;"
-        + "compaction_options_universal={allow_trivial_move=false;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;"
-        + "compression_size_percent=-1;max_size_amplification_percent=200;max_merge_width=4294967295;size_ratio=1;};  "
-        + "target_file_size_base=67108864;  max_bytes_for_level_base=268435456;  memtable_whole_key_filtering=false;  "
-        + "soft_pending_compaction_bytes_limit=68719476736;  blob_compression_type=kNoCompression;  max_write_buffer_number=2;  "
-        + "ttl=2592000;  compaction_options_fifo={allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;};  "
-        + "check_flush_compaction_key_order=true;  max_successive_merges=0;  inplace_update_num_locks=10000;  "
-        + "bottommost_compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;"
+        + "blob_garbage_collection_age_cutoff=0.250000;  "
+        + "blob_garbage_collection_force_threshold=0.800000;"
+        + "arena_block_size=1048576;  enable_blob_garbage_collection=false;  "
+        + "level0_stop_writes_trigger=36;  min_blob_size=65536;"
+        + "blob_compaction_readahead_size=262144;  blob_file_starting_level=5;  "
+        + "prepopulate_blob_cache=kDisable;"
+        + "compaction_options_universal={allow_trivial_move=false;stop_style="
+        + "kCompactionStopStyleTotalSize;min_merge_width=2;"
+        + "compression_size_percent=-1;max_size_amplification_percent=200;max_merge_width="
+        + "4294967295;size_ratio=1;};  "
+        + "target_file_size_base=67108864;  max_bytes_for_level_base=268435456;  "
+        + "memtable_whole_key_filtering=false;  "
+        + "soft_pending_compaction_bytes_limit=68719476736;  blob_compression_type=kNoCompression; "
+        + " max_write_buffer_number=2;  "
+        + "ttl=2592000;  "
+        + "compaction_options_fifo={allow_compaction=false;age_for_warm=0;max_table_files_size="
+        + "1073741824;};  "
+        + "check_flush_compaction_key_order=true;  max_successive_merges=0;  "
+        + "inplace_update_num_locks=10000;  "
+        + "bottommost_compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;"
+        + "max_dict_bytes=0;"
         + "strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;};  "
-        + "target_file_size_multiplier=1;  max_bytes_for_level_multiplier_additional=5:{7}:{9}:{11}:{13}:{15}:{17};  "
-        + "enable_blob_files=true;  level0_slowdown_writes_trigger=20;  compression=kLZ4HCCompression;  level0_file_num_compaction_trigger=4;  "
-        + "blob_file_size=268435456;  prefix_extractor=nullptr;  max_bytes_for_level_multiplier=10.000000;  write_buffer_size=67108864;  "
-        + "disable_auto_compactions=false;  max_compaction_bytes=1677721600;  memtable_huge_page_size=0;  "
-        + "compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;"
+        + "target_file_size_multiplier=1;  "
+        + "max_bytes_for_level_multiplier_additional=5:{7}:{9}:{11}:{13}:{15}:{17};  "
+        + "enable_blob_files=true;  level0_slowdown_writes_trigger=20;  "
+        + "compression=kLZ4HCCompression;  level0_file_num_compaction_trigger=4;  "
+        + "blob_file_size=268435456;  prefix_extractor=nullptr;  "
+        + "max_bytes_for_level_multiplier=10.000000;  write_buffer_size=67108864;  "
+        + "disable_auto_compactions=false;  max_compaction_bytes=1677721600;  "
+        + "memtable_huge_page_size=0;  "
+        + "compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_"
+        + "bytes=0;strategy=0;max_dict_buffer_bytes=0;"
         + "level=32767;window_bits=-14;};  "
-        + "hard_pending_compaction_bytes_limit=274877906944;  periodic_compaction_seconds=0;  paranoid_file_checks=true;  "
-        + "memtable_prefix_bloom_size_ratio=7.500000;  max_sequential_skip_in_iterations=8;  report_bg_io_stats=true;  "
-        + "compaction_pri=kMinOverlappingRatio;  compaction_style=kCompactionStyleLevel;  memtable_factory=SkipListFactory;  "
-        + "comparator=leveldb.BytewiseComparator;  bloom_locality=0;  compaction_filter_factory=nullptr;  "
-        + "min_write_buffer_number_to_merge=1;  max_write_buffer_number_to_maintain=0;  compaction_filter=nullptr;  merge_operator=nullptr;  "
-        + "num_levels=7;  optimize_filters_for_hits=false;  force_consistency_checks=true;  table_factory=BlockBasedTable;  "
-        + "max_write_buffer_size_to_maintain=0;  memtable_insert_with_hint_prefix_extractor=nullptr;  level_compaction_dynamic_level_bytes=false;  "
+        + "hard_pending_compaction_bytes_limit=274877906944;  periodic_compaction_seconds=0;  "
+        + "paranoid_file_checks=true;  "
+        + "memtable_prefix_bloom_size_ratio=7.500000;  max_sequential_skip_in_iterations=8;  "
+        + "report_bg_io_stats=true;  "
+        + "compaction_pri=kMinOverlappingRatio;  compaction_style=kCompactionStyleLevel;  "
+        + "memtable_factory=SkipListFactory;  "
+        + "comparator=leveldb.BytewiseComparator;  bloom_locality=0;  "
+        + "compaction_filter_factory=nullptr;  "
+        + "min_write_buffer_number_to_merge=1;  compaction_filter=nullptr;  "
+        + "merge_operator=nullptr;  "
+        + "num_levels=7;  optimize_filters_for_hits=false;  force_consistency_checks=true;  "
+        + "table_factory=BlockBasedTable;  "
+        + "max_write_buffer_size_to_maintain=0;  "
+        + "memtable_insert_with_hint_prefix_extractor=nullptr;  "
+        + "level_compaction_dynamic_level_bytes=false;  "
         + "inplace_update_support=false;  experimental_mempurge_threshold=0.003";
 
     final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder cf =
diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java
index 6615b6761477..f720ed44e220 100644
--- a/java/src/test/java/org/rocksdb/OptionsTest.java
+++ b/java/src/test/java/org/rocksdb/OptionsTest.java
@@ -902,15 +902,6 @@ public String name() {
     }
   }
 
-  @Test
-  public void failIfOptionsFileError() {
-    try (final Options opt = new Options()) {
-      final boolean boolValue = rand.nextBoolean();
-      opt.setFailIfOptionsFileError(boolValue);
-      assertThat(opt.failIfOptionsFileError()).isEqualTo(boolValue);
-    }
-  }
-
   @Test
   public void dumpMallocStats() {
     try (final Options opt = new Options()) {
@@ -1201,19 +1192,6 @@ public void statistics() {
     }
   }
 
-  @Test
-  public void maxWriteBufferNumberToMaintain() {
-    try (final Options options = new Options()) {
-      int intValue = rand.nextInt();
-      // Size has to be positive
-      intValue = (intValue < 0) ? -intValue : intValue;
-      intValue = (intValue == 0) ? intValue + 1 : intValue;
-      options.setMaxWriteBufferNumberToMaintain(intValue);
-      assertThat(options.maxWriteBufferNumberToMaintain()).
-          isEqualTo(intValue);
-    }
-  }
-
   @Test
   public void compactionPriorities() {
     try (final Options options = new Options()) {
@@ -1425,15 +1403,6 @@ public void maxWriteBatchGroupSizeBytes() {
     }
   }
 
-  @Test
-  public void skipCheckingSstFileSizesOnDbOpen() {
-    try (final Options options = new Options()) {
-      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(false);
-      assertThat(options.setSkipCheckingSstFileSizesOnDbOpen(true)).isEqualTo(options);
-      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true);
-    }
-  }
-
   @Test
   public void memtableMaxRangeDeletions() {
     try (final Options options = new Options()) {
diff --git a/java/src/test/java/org/rocksdb/OptionsUtilTest.java b/java/src/test/java/org/rocksdb/OptionsUtilTest.java
index 0998ae83fa73..0cdccbb91ba4 100644
--- a/java/src/test/java/org/rocksdb/OptionsUtilTest.java
+++ b/java/src/test/java/org/rocksdb/OptionsUtilTest.java
@@ -298,6 +298,8 @@ private void verifyTableFormatOptions(final LoaderUnderTest loaderUnderTest)
     altCFTableConfig.setFormatVersion(8);
     altCFTableConfig.setEnableIndexCompression(false);
     altCFTableConfig.setBlockAlign(true);
+    altCFTableConfig.setSuperBlockAlignmentSize(1024 * 1024);
+    altCFTableConfig.setSuperBlockAlignmentSpaceOverheadRatio(4 * 1024);
     altCFTableConfig.setIndexShortening(IndexShorteningMode.kShortenSeparatorsAndSuccessor);
     altCFTableConfig.setBlockCacheSize(3 * 1024 * 1024);
     // Note cache objects are not set here, as they are not read back when reading config.
@@ -365,6 +367,9 @@ private void verifyBlockBasedTableConfig(
     assertThat(actual.formatVersion()).isEqualTo(expected.formatVersion());
     assertThat(actual.enableIndexCompression()).isEqualTo(expected.enableIndexCompression());
     assertThat(actual.blockAlign()).isEqualTo(expected.blockAlign());
+    assertThat(actual.superBlockAlignmentSize()).isEqualTo(expected.superBlockAlignmentSize());
+    assertThat(actual.superBlockAlignmentSpaceOverheadRatio())
+        .isEqualTo(expected.superBlockAlignmentSpaceOverheadRatio());
     assertThat(actual.indexShortening()).isEqualTo(expected.indexShortening());
     if (expected.filterPolicy() == null) {
       assertThat(actual.filterPolicy()).isNull();
diff --git a/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/java/src/test/java/org/rocksdb/ReadOptionsTest.java
index baf51bf9b4b5..3ff4e6bba6d9 100644
--- a/java/src/test/java/org/rocksdb/ReadOptionsTest.java
+++ b/java/src/test/java/org/rocksdb/ReadOptionsTest.java
@@ -98,15 +98,6 @@ public void readTier() {
     }
   }
 
-  @SuppressWarnings("deprecated")
-  @Test
-  public void managed() {
-    try (final ReadOptions opt = new ReadOptions()) {
-      opt.setManaged(true);
-      assertThat(opt.managed()).isTrue();
-    }
-  }
-
   @Test
   public void totalOrderSeek() {
     try (final ReadOptions opt = new ReadOptions()) {
diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java
index 5a9c76fd8e3b..50cdf86a3e44 100644
--- a/java/src/test/java/org/rocksdb/RocksDBTest.java
+++ b/java/src/test/java/org/rocksdb/RocksDBTest.java
@@ -1563,16 +1563,6 @@ public void numberLevels() throws RocksDBException {
     }
   }
 
-  @Test
-  public void maxMemCompactionLevel() throws RocksDBException {
-    try (final Options options = new Options().setCreateIfMissing(true)) {
-      final String dbPath = dbFolder.getRoot().getAbsolutePath();
-      try (final RocksDB db = RocksDB.open(options, dbPath)) {
-        assertThat(db.maxMemCompactionLevel()).isEqualTo(0);
-      }
-    }
-  }
-
   @Test
   public void level0StopWriteTrigger() throws RocksDBException {
     try (final Options options = new Options().setCreateIfMissing(true)) {
diff --git a/java/src/test/java/org/rocksdb/SstFileReaderTest.java b/java/src/test/java/org/rocksdb/SstFileReaderTest.java
index ef74b08a72ab..27934e0f80b6 100644
--- a/java/src/test/java/org/rocksdb/SstFileReaderTest.java
+++ b/java/src/test/java/org/rocksdb/SstFileReaderTest.java
@@ -217,6 +217,8 @@ public void readSstFile() throws RocksDBException, IOException {
       assertThat(iterator.isValid()).isTrue();
       assertThat(iterator.key()).isEqualTo("key1".getBytes());
       assertThat(iterator.value()).isEqualTo("value1".getBytes());
+
+      iterator.close();
     }
   }
 }
diff --git a/java/src/test/java/org/rocksdb/TransactionTest.java b/java/src/test/java/org/rocksdb/TransactionTest.java
index 03a6b4ff6b3f..9adc26d97018 100644
--- a/java/src/test/java/org/rocksdb/TransactionTest.java
+++ b/java/src/test/java/org/rocksdb/TransactionTest.java
@@ -345,6 +345,32 @@ public void multiGetAsListForUpdate_conflict() throws RocksDBException {
     }
   }
 
+  @Test
+  public void multiGetAsList() throws RocksDBException {
+    final byte[] k1 = "k1".getBytes(UTF_8);
+    final byte[] k2 = "k2".getBytes(UTF_8);
+    final byte[] k3 = "k3".getBytes(UTF_8);
+    final byte[] v1 = "v1".getBytes(UTF_8);
+    final byte[] v2 = "v2".getBytes(UTF_8);
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(testCf, k1, v1);
+        txn.put(testCf, k2, v2);
+        txn.commit();
+      }
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        final List<byte[]> result =
+            txn.multiGetAsList(readOptions, testCf, Arrays.asList(k1, k2, k3));
+        assertThat(result).containsExactly(v1, v2, null);
+      }
+    }
+  }
+
   @Test
   public void name() throws RocksDBException {
     try(final DBContainer dbContainer = startDb();
diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc
index 2e2747729eb1..30ade0f38919 100644
--- a/logging/auto_roll_logger_test.cc
+++ b/logging/auto_roll_logger_test.cc
@@ -647,7 +647,7 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) {
 }
 
 TEST_F(AutoRollLoggerTest, LogFileExistence) {
-  ROCKSDB_NAMESPACE::DB* db;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Options options;
 #ifdef OS_WIN
   // Replace all slashes in the path so windows CompSpec does not
@@ -664,7 +664,6 @@ TEST_F(AutoRollLoggerTest, LogFileExistence) {
   options.create_if_missing = true;
   ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kTestDir, &db));
   ASSERT_OK(default_env->FileExists(kLogFile));
-  delete db;
 }
 
 TEST_F(AutoRollLoggerTest, FileCreateFailure) {
diff --git a/memory/memory_allocator_impl.h b/memory/memory_allocator_impl.h
index f1d3b9472ccc..65ebfebb94c9 100644
--- a/memory/memory_allocator_impl.h
+++ b/memory/memory_allocator_impl.h
@@ -12,8 +12,8 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-struct CustomDeleter {
-  CustomDeleter(MemoryAllocator* a = nullptr) : allocator(a) {}
+struct CacheAllocationDeleter {
+  CacheAllocationDeleter(MemoryAllocator* a = nullptr) : allocator(a) {}
 
   void operator()(char* ptr) const {
     if (allocator) {
@@ -26,12 +26,12 @@ struct CustomDeleter {
   MemoryAllocator* allocator;
 };
 
-using CacheAllocationPtr = std::unique_ptr<char[], CustomDeleter>;
+using CacheAllocationPtr = std::unique_ptr<char[], CacheAllocationDeleter>;
 
 inline CacheAllocationPtr AllocateBlock(size_t size,
                                         MemoryAllocator* allocator) {
   if (allocator) {
-    auto block = reinterpret_cast<char*>(allocator->Allocate(size));
+    auto block = static_cast<char*>(allocator->Allocate(size));
     return CacheAllocationPtr(block, allocator);
   }
   return CacheAllocationPtr(new char[size]);
diff --git a/memory/memory_allocator_test.cc b/memory/memory_allocator_test.cc
index 2ae38ec11b57..669548970ad2 100644
--- a/memory/memory_allocator_test.cc
+++ b/memory/memory_allocator_test.cc
@@ -83,7 +83,7 @@ TEST_P(MemoryAllocatorTest, DatabaseBlockCache) {
   auto cache = NewLRUCache(1024 * 1024, 6, false, 0.0, allocator_);
   table_options.block_cache = cache;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   Status s = DB::Open(options, dbname, &db);
   ASSERT_OK(s);
   ASSERT_NE(db, nullptr);
@@ -115,7 +115,7 @@ TEST_P(MemoryAllocatorTest, DatabaseBlockCache) {
   // Close database
   s = db->Close();
   ASSERT_OK(s);
-  delete db;
+  db.reset();
   ASSERT_OK(DestroyDB(dbname, options));
 }
 
diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h
index 9fdf618fa550..d39091ec6d43 100644
--- a/memtable/inlineskiplist.h
+++ b/memtable/inlineskiplist.h
@@ -44,8 +44,6 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include <algorithm>
-#include <atomic>
 #include <type_traits>
 
 #include "memory/allocator.h"
@@ -53,7 +51,7 @@
 #include "port/port.h"
 #include "rocksdb/slice.h"
 #include "test_util/sync_point.h"
-#include "util/coding.h"
+#include "util/atomic.h"
 #include "util/random.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -182,8 +180,11 @@ class InlineSkipList {
     // Advance to the first entry with a key >= target
     void Seek(const char* target);
 
-    [[nodiscard]] Status SeekAndValidate(const char* target,
-                                         bool allow_data_in_errors);
+    [[nodiscard]] Status SeekAndValidate(
+        const char* target, bool allow_data_in_errors,
+        bool detect_key_out_of_order,
+        const std::function<Status(const char*, bool)>&
+            key_validation_callback);
 
     // Retreat to the last entry with a key <= target
     void SeekForPrev(const char* target);
@@ -215,18 +216,17 @@ class InlineSkipList {
   Comparator const compare_;
   Node* const head_;
 
-  // Modified only by Insert().  Read racily by readers, but stale
-  // values are ok.
-  std::atomic<int> max_height_;  // Height of the entire list
+  // Maximum height of any node in the list (or in the process of being added).
+  //  Modified only by Insert().  Relaxed reads are always OK because starting
+  // from higher levels only helps efficiency, not correctness.
+  RelaxedAtomic<int> max_height_;
 
   // seq_splice_ is a Splice used for insertions in the non-concurrent
   // case.  It caches the prev and next found during the most recent
   // non-concurrent insertion.
   Splice* seq_splice_;
 
-  inline int GetMaxHeight() const {
-    return max_height_.load(std::memory_order_relaxed);
-  }
+  inline int GetMaxHeight() const { return max_height_.LoadRelaxed(); }
 
   int RandomHeight();
 
@@ -246,20 +246,23 @@ class InlineSkipList {
   bool KeyIsAfterNode(const DecodedKey& key, Node* n) const;
 
   // Returns the earliest node with a key >= key.
-  // Returns nullptr if there is no such node.
-  // @param out_of_order_node If not null, will validate the order of visited
-  // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
-  // returned and *out_of_order_node will be set to n2.
-  Node* FindGreaterOrEqual(const char* key, Node** out_of_order_node) const;
+  // Returns OK, if no corruption is found.
+  // node is set to the found node, or to nullptr if no node is found.
+  // Returns Corruption if a corruption is found.
+  Status FindGreaterOrEqual(const char* key, Node** node,
+                            bool detect_key_out_of_order,
+                            bool allow_data_in_errors,
+                            const std::function<Status(const char*, bool)>&
+                                key_validation_callback) const;
 
   // Returns the latest node with a key < key.
   // Returns head_ if there is no such node.
   // Fills prev[level] with pointer to previous node at "level" for every
   // level in [0..max_height_-1], if prev is non-null.
-  // @param out_of_order_node If not null, will validate the order of visited
+  // @param corrupted_node If not null, will validate the order of visited
   // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
-  // returned and *out_of_order_node will be set to n2.
-  Node* FindLessThan(const char* key, Node** out_of_order_node) const;
+  // returned and *corrupted_node will be set to n2.
+  Node* FindLessThan(const char* key, Node** corrupted_node) const;
 
   // Return the last node in the list.
   // Return head_ if list is empty.
@@ -311,7 +314,7 @@ struct InlineSkipList<Comparator>::Node {
   // Stores the height of the node in the memory location normally used for
   // next_[0].  This is used for passing data from AllocateKey to Insert.
   void StashHeight(const int height) {
-    assert(sizeof(int) <= sizeof(next_[0]));
+    static_assert(sizeof(int) <= sizeof(next_[0]));
     memcpy(static_cast<void*>(&next_[0]), &height, sizeof(int));
   }
 
@@ -332,30 +335,30 @@ struct InlineSkipList<Comparator>::Node {
     assert(n >= 0);
     // Use an 'acquire load' so that we observe a fully initialized
     // version of the returned Node.
-    return ((&next_[0] - n)->load(std::memory_order_acquire));
+    return ((&next_[0] - n)->Load());
   }
 
   void SetNext(int n, Node* x) {
     assert(n >= 0);
     // Use a 'release store' so that anybody who reads through this
     // pointer observes a fully initialized version of the inserted node.
-    (&next_[0] - n)->store(x, std::memory_order_release);
+    (&next_[0] - n)->Store(x);
   }
 
   bool CASNext(int n, Node* expected, Node* x) {
     assert(n >= 0);
-    return (&next_[0] - n)->compare_exchange_strong(expected, x);
+    return (&next_[0] - n)->CasStrong(expected, x);
   }
 
   // No-barrier variants that can be safely used in a few locations.
   Node* NoBarrier_Next(int n) {
     assert(n >= 0);
-    return (&next_[0] - n)->load(std::memory_order_relaxed);
+    return (&next_[0] - n)->LoadRelaxed();
   }
 
   void NoBarrier_SetNext(int n, Node* x) {
     assert(n >= 0);
-    (&next_[0] - n)->store(x, std::memory_order_relaxed);
+    (&next_[0] - n)->StoreRelaxed(x);
   }
 
   // Insert node after prev on specific level.
@@ -369,7 +372,7 @@ struct InlineSkipList<Comparator>::Node {
  private:
   // next_[0] is the lowest level link (level 0).  Higher levels are
   // stored _earlier_, so level 1 is at next_[-1].
-  std::atomic<Node*> next_[1];
+  Atomic<Node*> next_[1];
 };
 
 template <class Comparator>
@@ -399,6 +402,12 @@ inline const char* InlineSkipList<Comparator>::Iterator::key() const {
 template <class Comparator>
 inline void InlineSkipList<Comparator>::Iterator::Next() {
   assert(Valid());
+
+  // Capture the key before move on to next node
+  TEST_SYNC_POINT_CALLBACK(
+      "InlineSkipList::Iterator::Next::key",
+      static_cast<void*>(const_cast<char*>((node_->Key()))));
+
   node_ = node_->Next(0);
 }
 
@@ -406,6 +415,12 @@ template <class Comparator>
 inline Status InlineSkipList<Comparator>::Iterator::NextAndValidate(
     bool allow_data_in_errors) {
   assert(Valid());
+
+  // Capture the key before move on to next node
+  TEST_SYNC_POINT_CALLBACK(
+      "InlineSkipList::Iterator::Next::key",
+      static_cast<void*>(const_cast<char*>((node_->Key()))));
+
   Node* prev_node = node_;
   node_ = node_->Next(0);
   // Verify that keys are increasing.
@@ -435,12 +450,12 @@ inline Status InlineSkipList<Comparator>::Iterator::PrevAndValidate(
     const bool allow_data_in_errors) {
   assert(Valid());
   // Skip list validation is done in FindLessThan().
-  Node* out_of_order_node = nullptr;
-  node_ = list_->FindLessThan(node_->Key(), &out_of_order_node);
-  if (out_of_order_node) {
+  Node* corrupted_node = nullptr;
+  node_ = list_->FindLessThan(node_->Key(), &corrupted_node);
+  if (corrupted_node) {
     Node* node = node_;
     node_ = nullptr;
-    return Corruption(node, out_of_order_node, allow_data_in_errors);
+    return Corruption(node, corrupted_node, allow_data_in_errors);
   }
   if (node_ == list_->head_) {
     node_ = nullptr;
@@ -450,20 +465,19 @@ inline Status InlineSkipList<Comparator>::Iterator::PrevAndValidate(
 
 template <class Comparator>
 inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) {
-  node_ = list_->FindGreaterOrEqual(target, nullptr);
+  auto status =
+      list_->FindGreaterOrEqual(target, &node_, false, false, nullptr);
+  assert(status.ok());
 }
 
 template <class Comparator>
 inline Status InlineSkipList<Comparator>::Iterator::SeekAndValidate(
-    const char* target, const bool allow_data_in_errors) {
-  Node* out_of_order_node = nullptr;
-  node_ = list_->FindGreaterOrEqual(target, &out_of_order_node);
-  if (out_of_order_node) {
-    Node* node = node_;
-    node_ = nullptr;
-    return Corruption(node, out_of_order_node, allow_data_in_errors);
-  }
-  return Status::OK();
+    const char* target, const bool allow_data_in_errors,
+    bool check_key_out_of_order,
+    const std::function<Status(const char*, bool)>& key_validation_callback) {
+  return list_->FindGreaterOrEqual(target, &node_, allow_data_in_errors,
+                                   check_key_out_of_order,
+                                   key_validation_callback);
 }
 
 template <class Comparator>
@@ -530,15 +544,18 @@ bool InlineSkipList<Comparator>::KeyIsAfterNode(const DecodedKey& key,
 }
 
 template <class Comparator>
-typename InlineSkipList<Comparator>::Node*
-InlineSkipList<Comparator>::FindGreaterOrEqual(
-    const char* key, Node** const out_of_order_node) const {
+Status InlineSkipList<Comparator>::FindGreaterOrEqual(
+    const char* key, Node** node, bool allow_data_in_errors,
+    bool detect_key_out_of_order,
+    const std::function<Status(const char*, bool)>& key_validation_callback)
+    const {
   // Note: It looks like we could reduce duplication by implementing
   // this function as FindLessThan(key)->Next(0), but we wouldn't be able
   // to exit early on equality and the result wouldn't even be correct.
   // A concurrent insert might occur after FindLessThan(key) but before
   // we get a chance to call Next(0).
   Node* x = head_;
+  *node = nullptr;
   int level = GetMaxHeight() - 1;
   Node* last_bigger = nullptr;
   const DecodedKey key_decoded = compare_.decode_key(key);
@@ -546,10 +563,16 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(
     Node* next = x->Next(level);
     if (next != nullptr) {
       PREFETCH(next->Next(level), 0, 1);
-      if (out_of_order_node && x != head_ &&
+      if (detect_key_out_of_order && x != head_ &&
           compare_(x->Key(), next->Key()) >= 0) {
-        *out_of_order_node = next;
-        return x;
+        return Corruption(x, next, allow_data_in_errors);
+      }
+      if (key_validation_callback != nullptr) {
+        auto status =
+            key_validation_callback(next->Key(), allow_data_in_errors);
+        if (!status.ok()) {
+          return status;
+        }
       }
     }
     // Make sure the lists are sorted
@@ -560,7 +583,8 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(
                   ? 1
                   : compare_(next->Key(), key_decoded);
     if (cmp == 0 || (cmp > 0 && level == 0)) {
-      return next;
+      *node = next;
+      return Status::OK();
     } else if (cmp < 0) {
       // Keep searching in this list
       x = next;
@@ -789,7 +813,7 @@ char* InlineSkipList<Comparator>::AllocateKey(size_t key_size) {
 template <class Comparator>
 typename InlineSkipList<Comparator>::Node*
 InlineSkipList<Comparator>::AllocateNode(size_t key_size, int height) {
-  auto prefix = sizeof(std::atomic<Node*>) * (height - 1);
+  auto prefix = sizeof(Atomic<Node*>) * (height - 1);
 
   // prefix is space for the height - 1 pointers that we store before
   // the Node instance (next_[-(height - 1) .. -1]).  Node starts at
@@ -923,9 +947,9 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
   int height = x->UnstashHeight();
   assert(height >= 1 && height <= kMaxHeight_);
 
-  int max_height = max_height_.load(std::memory_order_relaxed);
+  int max_height = max_height_.LoadRelaxed();
   while (height > max_height) {
-    if (max_height_.compare_exchange_weak(max_height, height)) {
+    if (max_height_.CasWeakRelaxed(max_height, height)) {
       // successfully updated it
       max_height = height;
       break;
@@ -1116,7 +1140,9 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
 
 template <class Comparator>
 bool InlineSkipList<Comparator>::Contains(const char* key) const {
-  Node* x = FindGreaterOrEqual(key, nullptr);
+  Node* x = nullptr;
+  auto status = FindGreaterOrEqual(key, &x, false, false, nullptr);
+  assert(status.ok());
   if (x != nullptr && Equal(key, x->Key())) {
     return true;
   } else {
diff --git a/memtable/skiplist.h b/memtable/skiplist.h
index f2e2a829de3b..594c6ec43ce4 100644
--- a/memtable/skiplist.h
+++ b/memtable/skiplist.h
@@ -34,10 +34,9 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include <atomic>
-
 #include "memory/allocator.h"
 #include "port/port.h"
+#include "util/atomic.h"
 #include "util/random.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -128,18 +127,16 @@ class SkipList {
 
   // Modified only by Insert().  Read racily by readers, but stale
   // values are ok.
-  std::atomic<int> max_height_;  // Height of the entire list
+  RelaxedAtomic<int> max_height_;  // Height of the entire list
 
   // Used for optimizing sequential insert patterns.  Tricky.  prev_[i] for
   // i up to max_height_ is the predecessor of prev_[0] and prev_height_
   // is the height of prev_[0].  prev_[0] can only be equal to head before
   // insertion, in which case max_height_ and prev_height_ are 1.
-  Node** prev_;
   int32_t prev_height_;
+  Node** prev_;
 
-  inline int GetMaxHeight() const {
-    return max_height_.load(std::memory_order_relaxed);
-  }
+  inline int GetMaxHeight() const { return max_height_.LoadRelaxed(); }
 
   Node* NewNode(const Key& key, int height);
   int RandomHeight();
@@ -179,35 +176,35 @@ struct SkipList<Key, Comparator>::Node {
     assert(n >= 0);
     // Use an 'acquire load' so that we observe a fully initialized
     // version of the returned Node.
-    return (next_[n].load(std::memory_order_acquire));
+    return (next_[n].Load());
   }
   void SetNext(int n, Node* x) {
     assert(n >= 0);
     // Use a 'release store' so that anybody who reads through this
     // pointer observes a fully initialized version of the inserted node.
-    next_[n].store(x, std::memory_order_release);
+    next_[n].Store(x);
   }
 
   // No-barrier variants that can be safely used in a few locations.
   Node* NoBarrier_Next(int n) {
     assert(n >= 0);
-    return next_[n].load(std::memory_order_relaxed);
+    return next_[n].LoadRelaxed();
   }
   void NoBarrier_SetNext(int n, Node* x) {
     assert(n >= 0);
-    next_[n].store(x, std::memory_order_relaxed);
+    next_[n].StoreRelaxed(x);
   }
 
  private:
   // Array of length equal to the node height.  next_[0] is lowest level link.
-  std::atomic<Node*> next_[1];
+  Atomic<Node*> next_[1];
 };
 
 template <typename Key, class Comparator>
 typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::NewNode(
     const Key& key, int height) {
-  char* mem = allocator_->AllocateAligned(
-      sizeof(Node) + sizeof(std::atomic<Node*>) * (height - 1));
+  char* mem = allocator_->AllocateAligned(sizeof(Node) +
+                                          sizeof(Atomic<Node*>) * (height - 1));
   return new (mem) Node(key);
 }
 
@@ -438,7 +435,7 @@ SkipList<Key, Comparator>::SkipList(const Comparator cmp, Allocator* allocator,
       kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_),
       compare_(cmp),
       allocator_(allocator),
-      head_(NewNode(0 /* any key will do */, max_height)),
+      head_(NewNode({} /* any key will do */, max_height)),
       max_height_(1),
       prev_height_(1) {
   assert(max_height > 0 && kMaxHeight_ == static_cast<uint32_t>(max_height));
@@ -494,7 +491,7 @@ void SkipList<Key, Comparator>::Insert(const Key& key) {
     // the loop below.  In the former case the reader will
     // immediately drop to the next level since nullptr sorts after all
     // keys.  In the latter case the reader will use the new node.
-    max_height_.store(height, std::memory_order_relaxed);
+    max_height_.StoreRelaxed(height);
   }
 
   Node* x = NewNode(key, height);
diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc
index 93d32e9fec6e..c83baeeefcb2 100644
--- a/memtable/skiplistrep.cc
+++ b/memtable/skiplistrep.cc
@@ -94,11 +94,14 @@ class SkipListRep : public MemTableRep {
 
   Status GetAndValidate(const LookupKey& k, void* callback_args,
                         bool (*callback_func)(void* arg, const char* entry),
-                        bool allow_data_in_errors) override {
+                        bool allow_data_in_errors, bool detect_key_out_of_order,
+                        const std::function<Status(const char*, bool)>&
+                            key_validation_callback) override {
     SkipListRep::Iterator iter(&skip_list_);
     Slice dummy_slice;
-    Status status = iter.SeekAndValidate(dummy_slice, k.memtable_key().data(),
-                                         allow_data_in_errors);
+    Status status = iter.SeekAndValidate(
+        dummy_slice, k.memtable_key().data(), allow_data_in_errors,
+        detect_key_out_of_order, key_validation_callback);
     for (; iter.Valid() && status.ok() &&
            callback_func(callback_args, iter.key());
          status = iter.NextAndValidate(allow_data_in_errors)) {
@@ -244,12 +247,18 @@ class SkipListRep : public MemTableRep {
     }
 
     Status SeekAndValidate(const Slice& user_key, const char* memtable_key,
-                           bool allow_data_in_errors) override {
+                           bool allow_data_in_errors,
+                           bool detect_key_out_of_order,
+                           const std::function<Status(const char*, bool)>&
+                               key_validation_callback) override {
       if (memtable_key != nullptr) {
-        return iter_.SeekAndValidate(memtable_key, allow_data_in_errors);
+        return iter_.SeekAndValidate(memtable_key, allow_data_in_errors,
+                                     detect_key_out_of_order,
+                                     key_validation_callback);
       } else {
-        return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key),
-                                     allow_data_in_errors);
+        return iter_.SeekAndValidate(
+            EncodeKey(&tmp_, user_key), allow_data_in_errors,
+            detect_key_out_of_order, key_validation_callback);
       }
     }
 
diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc
index 9b0192cb8e8e..738f89f79e9e 100644
--- a/memtable/vectorrep.cc
+++ b/memtable/vectorrep.cc
@@ -30,6 +30,8 @@ class VectorRep : public MemTableRep {
   // collection.
   void Insert(KeyHandle handle) override;
 
+  void InsertConcurrently(KeyHandle handle) override;
+
   // Returns true iff an entry that compares equal to key is in the collection.
   bool Contains(const char* key) const override;
 
@@ -40,6 +42,8 @@ class VectorRep : public MemTableRep {
   void Get(const LookupKey& k, void* callback_args,
            bool (*callback_func)(void* arg, const char* entry)) override;
 
+  void BatchPostProcess() override;
+
   ~VectorRep() override = default;
 
   class Iterator : public MemTableRep::Iterator {
@@ -79,6 +83,13 @@ class VectorRep : public MemTableRep {
     // Advance to the first entry with a key >= target
     void Seek(const Slice& user_key, const char* memtable_key) override;
 
+    // Seek and do some memory validation
+    Status SeekAndValidate(const Slice& internal_key, const char* memtable_key,
+                           bool allow_data_in_errors,
+                           bool detect_key_out_of_order,
+                           const std::function<Status(const char*, bool)>&
+                               key_validation_callback) override;
+
     // Advance to the first entry with a key <= target
     void SeekForPrev(const Slice& user_key, const char* memtable_key) override;
 
@@ -96,19 +107,40 @@ class VectorRep : public MemTableRep {
 
  private:
   friend class Iterator;
+  ALIGN_AS(CACHE_LINE_SIZE) RelaxedAtomic<size_t> bucket_size_;
   using Bucket = std::vector<const char*>;
   std::shared_ptr<Bucket> bucket_;
   mutable port::RWMutex rwlock_;
   bool immutable_;
   bool sorted_;
   const KeyComparator& compare_;
+  // Thread-local vector to buffer concurrent writes.
+  using TlBucket = std::vector<const char*>;
+  ThreadLocalPtr tl_writes_;
+
+  static void DeleteTlBucket(void* ptr) {
+    auto* v = static_cast<TlBucket*>(ptr);
+    delete v;
+  }
 };
 
 void VectorRep::Insert(KeyHandle handle) {
   auto* key = static_cast<char*>(handle);
-  WriteLock l(&rwlock_);
-  assert(!immutable_);
-  bucket_->push_back(key);
+  {
+    WriteLock l(&rwlock_);
+    assert(!immutable_);
+    bucket_->push_back(key);
+  }
+  bucket_size_.FetchAddRelaxed(1);
+}
+
+void VectorRep::InsertConcurrently(KeyHandle handle) {
+  auto* v = static_cast<TlBucket*>(tl_writes_.Get());
+  if (!v) {
+    v = new TlBucket();
+    tl_writes_.Reset(v);
+  }
+  v->push_back(static_cast<char*>(handle));
 }
 
 // Returns true iff an entry that compares equal to key is in the collection.
@@ -123,19 +155,35 @@ void VectorRep::MarkReadOnly() {
 }
 
 size_t VectorRep::ApproximateMemoryUsage() {
-  return sizeof(bucket_) + sizeof(*bucket_) +
-         bucket_->size() *
-             sizeof(
-                 std::remove_reference<decltype(*bucket_)>::type::value_type);
+  return bucket_size_.LoadRelaxed() *
+         sizeof(std::remove_reference<decltype(*bucket_)>::type::value_type);
+}
+
+void VectorRep::BatchPostProcess() {
+  auto* v = static_cast<TlBucket*>(tl_writes_.Get());
+  if (v) {
+    {
+      WriteLock l(&rwlock_);
+      assert(!immutable_);
+      for (auto& key : *v) {
+        bucket_->push_back(key);
+      }
+    }
+    bucket_size_.FetchAddRelaxed(v->size());
+    delete v;
+    tl_writes_.Reset(nullptr);
+  }
 }
 
 VectorRep::VectorRep(const KeyComparator& compare, Allocator* allocator,
                      size_t count)
     : MemTableRep(allocator),
+      bucket_size_(0),
       bucket_(new Bucket()),
       immutable_(false),
       sorted_(false),
-      compare_(compare) {
+      compare_(compare),
+      tl_writes_(DeleteTlBucket) {
   bucket_.get()->reserve(count);
 }
 
@@ -221,6 +269,24 @@ void VectorRep::Iterator::Seek(const Slice& user_key,
              .first;
 }
 
+Status VectorRep::Iterator::SeekAndValidate(
+    const Slice& /* internal_key */, const char* /* memtable_key */,
+    bool /* allow_data_in_errors */, bool /* detect_key_out_of_order */,
+    const std::function<Status(const char*, bool)>&
+    /* key_validation_callback */) {
+  if (vrep_) {
+    WriteLock l(&vrep_->rwlock_);
+    if (bucket_->begin() == bucket_->end()) {
+      // Memtable is empty
+      return Status::OK();
+    } else {
+      return Status::NotSupported("SeekAndValidate() not implemented");
+    }
+  } else {
+    return Status::NotSupported("SeekAndValidate() not implemented");
+  }
+}
+
 // Advance to the first entry with a key <= target
 void VectorRep::Iterator::SeekForPrev(const Slice& /*user_key*/,
                                       const char* /*memtable_key*/) {
diff --git a/memtable/wbwi_memtable.cc b/memtable/wbwi_memtable.cc
index 540253666908..9686eac50299 100644
--- a/memtable/wbwi_memtable.cc
+++ b/memtable/wbwi_memtable.cc
@@ -61,6 +61,7 @@ bool WBWIMemTable::Get(const LookupKey& key, std::string* value,
   assert(!wbwi_->GetWriteBatch()->HasDeleteRange());
   assert(merge_context);
 
+  *out_seq = kMaxSequenceNumber;
   [[maybe_unused]] SequenceNumber read_seq =
       GetInternalKeySeqno(key.internal_key());
   // This is memtable is a single write batch, no snapshot can be taken within
diff --git a/memtable/wbwi_memtable.h b/memtable/wbwi_memtable.h
index 3f0ae3e23d5b..b3231b4d565d 100644
--- a/memtable/wbwi_memtable.h
+++ b/memtable/wbwi_memtable.h
@@ -235,7 +235,7 @@ class WBWIMemTable final : public ReadOnlyMemTable {
   uint64_t num_entries_;
   // WBWI can contains updates to multiple CFs. `cf_id_` determines which CF
   // this memtable is for.
-  uint32_t cf_id_;
+  const uint32_t cf_id_;
 };
 
 class WBWIMemTableIterator final : public InternalIterator {
diff --git a/microbench/db_basic_bench.cc b/microbench/db_basic_bench.cc
index 2eca31f10843..dd4bbb0d68f7 100644
--- a/microbench/db_basic_bench.cc
+++ b/microbench/db_basic_bench.cc
@@ -138,13 +138,11 @@ static void SetupDB(benchmark::State& state, Options& options,
       db_path + kFilePathSeparator + test_name + std::to_string(getpid());
   DestroyDB(db_name, options);
 
-  DB* db_ptr = nullptr;
-  s = DB::Open(options, db_name, &db_ptr);
+  s = DB::Open(options, db_name, db);
   if (!s.ok()) {
     state.SkipWithError(s.ToString().c_str());
     return;
   }
-  db->reset(db_ptr);
 }
 
 static void TeardownDB(benchmark::State& state, const std::unique_ptr<DB>& db,
@@ -181,12 +179,10 @@ static void DBOpen(benchmark::State& state) {
 
   for (auto _ : state) {
     {
-      DB* db_ptr = nullptr;
-      Status s = DB::Open(options, db_name, &db_ptr);
+      Status s = DB::Open(options, db_name, &db);
       if (!s.ok()) {
         state.SkipWithError(s.ToString().c_str());
       }
-      db.reset(db_ptr);
     }
     state.PauseTiming();
     auto wo = WriteOptions();
@@ -231,12 +227,10 @@ static void DBClose(benchmark::State& state) {
   for (auto _ : state) {
     state.PauseTiming();
     {
-      DB* db_ptr = nullptr;
-      Status s = DB::Open(options, db_name, &db_ptr);
+      Status s = DB::Open(options, db_name, &db);
       if (!s.ok()) {
         state.SkipWithError(s.ToString().c_str());
       }
-      db.reset(db_ptr);
     }
     auto wo = WriteOptions();
     Status s;
@@ -727,13 +721,11 @@ static void SimpleGetWithPerfContext(benchmark::State& state) {
     DestroyDB(db_name, options);
 
     {
-      DB* db_ptr = nullptr;
-      s = DB::Open(options, db_name, &db_ptr);
+      s = DB::Open(options, db_name, &db);
       if (!s.ok()) {
         state.SkipWithError(s.ToString().c_str());
         return;
       }
-      db.reset(db_ptr);
     }
     // load db
     auto wo = WriteOptions();
diff --git a/microbench/ribbon_bench.cc b/microbench/ribbon_bench.cc
index d0fb2ec9ab2e..58cd710a4c70 100644
--- a/microbench/ribbon_bench.cc
+++ b/microbench/ribbon_bench.cc
@@ -32,7 +32,7 @@ struct KeyMaker {
     // To get range [avg_size - 2, avg_size + 2]
     // use range [smallest_size, smallest_size + 4]
     len += FastRange32((val_num >> 5) * 1234567891, 5);
-    char *data = buf_.get() + start;
+    char* data = buf_.get() + start;
     // Populate key data such that all data makes it into a key of at
     // least 8 bytes. We also don't want all the within-filter key
     // variance confined to a contiguous 32 bits, because then a 32 bit
@@ -51,7 +51,7 @@ struct KeyMaker {
 // 1. filter config bits_per_key
 // 2. average data key length
 // 3. data entry number
-static void CustomArguments(benchmark::internal::Benchmark *b) {
+static void CustomArguments(benchmark::internal::Benchmark* b) {
   const auto kImplCount =
       static_cast<int>(BloomLikeFilterPolicy::GetAllFixedImpls().size());
   for (int filter_impl = 0; filter_impl < kImplCount; ++filter_impl) {
@@ -66,7 +66,7 @@ static void CustomArguments(benchmark::internal::Benchmark *b) {
   b->ArgNames({"filter_impl", "bits_per_key", "key_len_avg", "entry_num"});
 }
 
-static void FilterBuild(benchmark::State &state) {
+static void FilterBuild(benchmark::State& state) {
   // setup data
   auto filter = BloomLikeFilterPolicy::Create(
       BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)),
@@ -89,7 +89,7 @@ static void FilterBuild(benchmark::State &state) {
 }
 BENCHMARK(FilterBuild)->Apply(CustomArguments);
 
-static void FilterQueryPositive(benchmark::State &state) {
+static void FilterQueryPositive(benchmark::State& state) {
   // setup data
   auto filter = BloomLikeFilterPolicy::Create(
       BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)),
@@ -117,7 +117,7 @@ static void FilterQueryPositive(benchmark::State &state) {
 }
 BENCHMARK(FilterQueryPositive)->Apply(CustomArguments);
 
-static void FilterQueryNegative(benchmark::State &state) {
+static void FilterQueryNegative(benchmark::State& state) {
   // setup data
   auto filter = BloomLikeFilterPolicy::Create(
       BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)),
diff --git a/monitoring/iostats_context.cc b/monitoring/iostats_context.cc
index 04e98914da9c..9f96655a6b48 100644
--- a/monitoring/iostats_context.cc
+++ b/monitoring/iostats_context.cc
@@ -65,9 +65,11 @@ std::string IOStatsContext::ToString(bool exclude_zero_counters) const {
   IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_bytes_read);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_bytes_read);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cool_file_bytes_read);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_bytes_read);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_read_count);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_read_count);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cool_file_read_count);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_read_count);
   std::string str = ss.str();
   str.erase(str.find_last_not_of(", ") + 1);
diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc
index a38f6ec01805..59f5f19f66df 100644
--- a/monitoring/perf_context.cc
+++ b/monitoring/perf_context.cc
@@ -259,10 +259,10 @@ void PerfContext::Reset() {
 #endif
 }
 
-void PerfContextByLevel::Reset(){
+void PerfContextByLevel::Reset() {
 #ifndef NPERF_CONTEXT
 #define EMIT_FIELDS(x) x = 0;
-    DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_FIELDS)
+  DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_FIELDS)
 #undef EMIT_FIELDS
 #endif
 }
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index 05163d3e29e1..e6060cbeac20 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -93,6 +93,7 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
      "rocksdb.compaction.optimized.del.drop.obsolete"},
     {COMPACTION_CANCELLED, "rocksdb.compaction.cancelled"},
+    {COMPACTION_ABORTED, "rocksdb.compaction.aborted"},
     {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
     {NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
     {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
@@ -169,8 +170,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"},
     {BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"},
     {BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"},
-    {BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"},
-    {BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"},
+    {BLOB_DB_WRITE_INLINED_DEPRECATED, "rocksdb.blobdb.write.inlined"},
+    {BLOB_DB_WRITE_INLINED_TTL_DEPRECATED, "rocksdb.blobdb.write.inlined.ttl"},
     {BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"},
     {BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"},
     {BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"},
@@ -224,12 +225,17 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {BACKUP_WRITE_BYTES, "rocksdb.backup.write.bytes"},
     {REMOTE_COMPACT_READ_BYTES, "rocksdb.remote.compact.read.bytes"},
     {REMOTE_COMPACT_WRITE_BYTES, "rocksdb.remote.compact.write.bytes"},
+    {REMOTE_COMPACT_RESUMED_BYTES, "rocksdb.remote.compact.resumed.bytes"},
     {HOT_FILE_READ_BYTES, "rocksdb.hot.file.read.bytes"},
     {WARM_FILE_READ_BYTES, "rocksdb.warm.file.read.bytes"},
+    {COOL_FILE_READ_BYTES, "rocksdb.cool.file.read.bytes"},
     {COLD_FILE_READ_BYTES, "rocksdb.cold.file.read.bytes"},
+    {ICE_FILE_READ_BYTES, "rocksdb.ice.file.read.bytes"},
     {HOT_FILE_READ_COUNT, "rocksdb.hot.file.read.count"},
     {WARM_FILE_READ_COUNT, "rocksdb.warm.file.read.count"},
+    {COOL_FILE_READ_COUNT, "rocksdb.cool.file.read.count"},
     {COLD_FILE_READ_COUNT, "rocksdb.cold.file.read.count"},
+    {ICE_FILE_READ_COUNT, "rocksdb.ice.file.read.count"},
     {LAST_LEVEL_READ_BYTES, "rocksdb.last.level.read.bytes"},
     {LAST_LEVEL_READ_COUNT, "rocksdb.last.level.read.count"},
     {NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"},
@@ -262,6 +268,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {READAHEAD_TRIMMED, "rocksdb.readahead.trimmed"},
     {FIFO_MAX_SIZE_COMPACTIONS, "rocksdb.fifo.max.size.compactions"},
     {FIFO_TTL_COMPACTIONS, "rocksdb.fifo.ttl.compactions"},
+    {FIFO_CHANGE_TEMPERATURE_COMPACTIONS,
+     "rocksdb.fifo.change_temperature.compactions"},
     {PREFETCH_BYTES, "rocksdb.prefetch.bytes"},
     {PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"},
     {PREFETCH_HITS, "rocksdb.prefetch.hits"},
@@ -270,6 +278,24 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
      "rocksdb.file.read.corruption.retry.count"},
     {FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
      "rocksdb.file.read.corruption.retry.success.count"},
+    {NUMBER_WBWI_INGEST, "rocksdb.number.wbwi.ingest"},
+    {SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT,
+     "rocksdb.sst.user.defined.index.load.fail.count"},
+    {MULTISCAN_PREPARE_CALLS, "rocksdb.multiscan.prepare.calls"},
+    {MULTISCAN_PREPARE_ERRORS, "rocksdb.multiscan.prepare.errors"},
+    {MULTISCAN_BLOCKS_PREFETCHED, "rocksdb.multiscan.blocks.prefetched"},
+    {MULTISCAN_BLOCKS_FROM_CACHE, "rocksdb.multiscan.blocks.from.cache"},
+    {MULTISCAN_PREFETCH_BYTES, "rocksdb.multiscan.prefetch.bytes"},
+    {MULTISCAN_PREFETCH_BLOCKS_WASTED,
+     "rocksdb.multiscan.prefetch.blocks.wasted"},
+    {MULTISCAN_IO_REQUESTS, "rocksdb.multiscan.io.requests"},
+    {MULTISCAN_IO_COALESCED_NONADJACENT,
+     "rocksdb.multiscan.io.coalesced.nonadjacent"},
+    {MULTISCAN_SEEK_ERRORS, "rocksdb.multiscan.seek.errors"},
+    {PREFETCH_MEMORY_BYTES_GRANTED, "rocksdb.prefetch.memory.bytes.granted"},
+    {PREFETCH_MEMORY_BYTES_RELEASED, "rocksdb.prefetch.memory.bytes.released"},
+    {PREFETCH_MEMORY_REQUESTS_BLOCKED,
+     "rocksdb.prefetch.memory.requests.blocked"},
 };
 
 const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
@@ -336,10 +362,16 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
      "rocksdb.error.handler.autoresume.retry.count"},
     {ASYNC_READ_BYTES, "rocksdb.async.read.bytes"},
     {POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"},
+    {COMPACTION_PREFETCH_BYTES, "rocksdb.compaction.prefetch.bytes"},
     {PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"},
     {ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"},
     {TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
      "rocksdb.table.open.prefetch.tail.read.bytes"},
+    {NUM_OP_PER_TRANSACTION, "rocksdb.num.op.per.transaction"},
+    {MULTISCAN_PREPARE_ITERATORS,
+     "rocksdb.multiscan.op.prepare.iterators.micros"},
+    {MULTISCAN_PREPARE_MICROS, "rocksdb.multiscan.prepare.micros"},
+    {MULTISCAN_BLOCKS_PER_PREPARE, "rocksdb.multiscan.blocks.per.prepare"},
 };
 
 std::shared_ptr<Statistics> CreateDBStatistics() {
diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc
index 295e7bf3daa3..f98917a5f4a3 100644
--- a/monitoring/stats_history_test.cc
+++ b/monitoring/stats_history_test.cc
@@ -185,7 +185,7 @@ TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) {
 
 TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) {
   constexpr int kPeriodSec = 1;
-  constexpr int kEstimatedOneSliceSize = 16000;
+  constexpr int kEstimatedOneSliceSize = 22100;
 
   Options options;
   options.create_if_missing = true;
@@ -277,7 +277,7 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) {
   // If `slice_count == 0` when new statistics are added, consider increasing
   // `kEstimatedOneSliceSize`
   ASSERT_EQ(slice_count, 1);
-  ASSERT_TRUE(stats_history_size_reopen < 16000 &&
+  ASSERT_TRUE(stats_history_size_reopen < kEstimatedOneSliceSize &&
               stats_history_size_reopen > 0);
   ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0);
   Close();
@@ -616,7 +616,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) {
   // LogNumbers: default: 16, stats: 10, pikachu: 5
   // Since in recovery process, cfd_stats column is created after WAL is
   // created, synced and MANIFEST is persisted, its log number which depends on
-  // logfile_number_ will be different. Since "pikachu" is never flushed, thus
+  // cur_wal_number_ will be different. Since "pikachu" is never flushed, thus
   // its log_number should be the smallest of the three.
   ASSERT_OK(Flush());
   ASSERT_LT(cfd_test->GetLogNumber(), cfd_stats->GetLogNumber());
diff --git a/monitoring/thread_status_impl.cc b/monitoring/thread_status_impl.cc
index 153753682cfa..2b3041c4c61d 100644
--- a/monitoring/thread_status_impl.cc
+++ b/monitoring/thread_status_impl.cc
@@ -13,7 +13,9 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
+const bool ThreadStatus::kEnabled = true;
+
 std::string ThreadStatus::GetThreadTypeName(
     ThreadStatus::ThreadType thread_type) {
   switch (thread_type) {
@@ -117,6 +119,7 @@ std::map<std::string, uint64_t> ThreadStatus::InterpretOperationProperties(
 }
 
 #else
+const bool ThreadStatus::kEnabled = false;
 
 std::string ThreadStatus::GetThreadTypeName(
     ThreadStatus::ThreadType /*thread_type*/) {
@@ -159,5 +162,5 @@ std::map<std::string, uint64_t> ThreadStatus::InterpretOperationProperties(
   return std::map<std::string, uint64_t>();
 }
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/monitoring/thread_status_updater.cc b/monitoring/thread_status_updater.cc
index 37fcef62b0f9..7df2b2c6fa4b 100644
--- a/monitoring/thread_status_updater.cc
+++ b/monitoring/thread_status_updater.cc
@@ -14,7 +14,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 
 thread_local ThreadStatusData* ThreadStatusUpdater::thread_status_data_ =
     nullptr;
@@ -324,5 +324,5 @@ void ThreadStatusUpdater::SetThreadOperationProperty(int /*i*/,
 void ThreadStatusUpdater::IncreaseThreadOperationProperty(int /*i*/,
                                                           uint64_t /*delta*/) {}
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/monitoring/thread_status_updater.h b/monitoring/thread_status_updater.h
index 696063cb46cd..6d3bc74c4510 100644
--- a/monitoring/thread_status_updater.h
+++ b/monitoring/thread_status_updater.h
@@ -47,7 +47,7 @@ class ColumnFamilyHandle;
 
 // The structure that keeps constant information about a column family.
 struct ConstantColumnFamilyInfo {
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
  public:
   ConstantColumnFamilyInfo(const void* _db_key, const std::string& _db_name,
                            const std::string& _cf_name)
@@ -55,13 +55,13 @@ struct ConstantColumnFamilyInfo {
   const void* db_key;
   const std::string db_name;
   const std::string cf_name;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 };
 
 // the internal data-structure that is used to reflect the current
 // status of a thread using a set of atomic pointers.
 struct ThreadStatusData {
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   explicit ThreadStatusData() {
     enable_tracking.store(false);
     thread_id.store(0);
@@ -86,7 +86,7 @@ struct ThreadStatusData {
   std::atomic<ThreadStatus::OperationStage> operation_stage;
   std::atomic<uint64_t> op_properties[ThreadStatus::kNumOperationProperties];
   std::atomic<ThreadStatus::StateType> state_type;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 };
 
 // The class that stores and updates the status of the current thread
@@ -190,7 +190,7 @@ class ThreadStatusUpdater {
       const std::vector<ColumnFamilyHandle*>& handles, bool check_exist);
 
  protected:
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   // The thread-local variable for storing thread status.
   static thread_local ThreadStatusData* thread_status_data_;
 
@@ -220,7 +220,7 @@ class ThreadStatusUpdater {
 
 #else
   static ThreadStatusData* thread_status_data_;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/monitoring/thread_status_updater_debug.cc b/monitoring/thread_status_updater_debug.cc
index 464c23bbaa89..39b3ef2d0167 100644
--- a/monitoring/thread_status_updater_debug.cc
+++ b/monitoring/thread_status_updater_debug.cc
@@ -12,7 +12,7 @@
 namespace ROCKSDB_NAMESPACE {
 
 #ifndef NDEBUG
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap(
     const std::vector<ColumnFamilyHandle*>& handles, bool check_exist) {
   std::unique_lock<std::mutex> lock(thread_list_mutex_);
@@ -37,7 +37,7 @@ void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap(
     const std::vector<ColumnFamilyHandle*>& /*handles*/, bool /*check_exist*/) {
 }
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 #endif  // !NDEBUG
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/monitoring/thread_status_util.cc b/monitoring/thread_status_util.cc
index d61bcba1ce55..d84f46a681bd 100644
--- a/monitoring/thread_status_util.cc
+++ b/monitoring/thread_status_util.cc
@@ -11,7 +11,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 thread_local ThreadStatusUpdater*
     ThreadStatusUtil::thread_updater_local_cache_ = nullptr;
 thread_local bool ThreadStatusUtil::thread_updater_initialized_ = false;
@@ -171,9 +171,10 @@ AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() {
 ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = nullptr;
 bool ThreadStatusUtil::thread_updater_initialized_ = false;
 
-bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) {
-  return false;
-}
+void ThreadStatusUtil::RegisterThread(
+    const Env* /*env*/, ThreadStatus::ThreadType /*thread_type*/) {}
+
+void ThreadStatusUtil::UnregisterThread() {}
 
 void ThreadStatusUtil::SetEnableTracking(bool /*enable_tracking*/) {}
 
@@ -204,11 +205,15 @@ void ThreadStatusUtil::EraseDatabaseInfo(const DB* /*db*/) {}
 
 void ThreadStatusUtil::ResetThreadStatus() {}
 
+bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) {
+  return false;
+}
+
 AutoThreadOperationStageUpdater::AutoThreadOperationStageUpdater(
     ThreadStatus::OperationStage /*stage*/) {}
 
 AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() {}
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/monitoring/thread_status_util.h b/monitoring/thread_status_util.h
index df148a039565..082dbd7324b3 100644
--- a/monitoring/thread_status_util.h
+++ b/monitoring/thread_status_util.h
@@ -90,7 +90,7 @@ class ThreadStatusUtil {
   // a non-null pointer.
   static bool MaybeInitThreadLocalUpdater(const Env* env);
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   // A boolean flag indicating whether thread_updater_local_cache_
   // is initialized.  It is set to true when an Env uses any
   // ThreadStatusUtil functions using the current thread other
@@ -130,7 +130,7 @@ class AutoThreadOperationStageUpdater {
   explicit AutoThreadOperationStageUpdater(ThreadStatus::OperationStage stage);
   ~AutoThreadOperationStageUpdater();
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
  private:
   ThreadStatus::OperationStage prev_stage_;
 #endif
diff --git a/monitoring/thread_status_util_debug.cc b/monitoring/thread_status_util_debug.cc
index a8233f78c623..7b6211bb5448 100644
--- a/monitoring/thread_status_util_debug.cc
+++ b/monitoring/thread_status_util_debug.cc
@@ -50,8 +50,9 @@ Env::IOActivity ThreadStatusUtil::TEST_GetExpectedIOActivity(
       return Env::IOActivity::kGetEntity;
     case ThreadStatus::OperationType::OP_MULTIGETENTITY:
       return Env::IOActivity::kMultiGetEntity;
-    case ThreadStatus::OperationType::OP_READ_MANIFEST:
-      return Env::IOActivity::kReadManifest;
+    case ThreadStatus::OperationType::
+        OP_GET_FILE_CHECKSUMS_FROM_CURRENT_MANIFEST:
+      return Env::IOActivity::kGetFileChecksumsFromCurrentManifest;
     default:
       return Env::IOActivity::kUnknown;
   }
diff --git a/options/cf_options.cc b/options/cf_options.cc
index d50eade93209..2ba56e0f36d8 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -30,6 +30,7 @@
 #include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/options_type.h"
 #include "util/cast_util.h"
+#include "util/string_util.h"
 
 // NOTE: in this file, many option flags that were deprecated
 // and removed from the rest of the code have to be kept here
@@ -301,7 +302,24 @@ static std::unordered_map<std::string, OptionTypeInfo>
              OptionTypeInfo::Struct("file_temperature_age_thresholds",
                                     &file_temperature_age_type_info, 0,
                                     OptionVerificationType::kNormal,
-                                    OptionTypeFlags::kMutable))}};
+                                    OptionTypeFlags::kMutable))},
+        {"allow_trivial_copy_when_change_temperature",
+         {offsetof(struct CompactionOptionsFIFO,
+                   allow_trivial_copy_when_change_temperature),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"trivial_copy_buffer_size",
+         {offsetof(struct CompactionOptionsFIFO, trivial_copy_buffer_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_data_files_size",
+         {offsetof(struct CompactionOptionsFIFO, max_data_files_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"use_kv_ratio_compaction",
+         {offsetof(struct CompactionOptionsFIFO, use_kv_ratio_compaction),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}}};
 
 static std::unordered_map<std::string, OptionTypeInfo>
     universal_compaction_options_type_info = {
@@ -340,6 +358,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
           OptionTypeFlags::kMutable}},
         {"allow_trivial_move",
          {offsetof(class CompactionOptionsUniversal, allow_trivial_move),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"reduce_file_locking",
+         {offsetof(class CompactionOptionsUniversal, reduce_file_locking),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}}};
 
@@ -382,6 +404,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct MutableCFOptions, paranoid_file_checks),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
+        {"verify_output_flags",
+         {offsetof(struct MutableCFOptions, verify_output_flags),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
         {"verify_checksums_in_compaction",
          {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
           OptionTypeFlags::kMutable}},
@@ -437,6 +463,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct MutableCFOptions, target_file_size_multiplier),
           OptionType::kInt, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
+        {"target_file_size_is_upper_bound",
+         {offsetof(struct MutableCFOptions, target_file_size_is_upper_bound),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
         {"arena_block_size",
          {offsetof(struct MutableCFOptions, arena_block_size),
           OptionType::kSizeT, OptionVerificationType::kNormal,
@@ -649,6 +679,11 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct MutableCFOptions, paranoid_memory_checks),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
+        {"memtable_veirfy_per_key_checksum_on_seek",
+         {offsetof(struct MutableCFOptions,
+                   memtable_veirfy_per_key_checksum_on_seek),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
         {kOptNameCompOpts,
          OptionTypeInfo::Struct(
              kOptNameCompOpts, &compression_options_type_info,
@@ -689,12 +724,24 @@ static std::unordered_map<std::string, OptionTypeInfo>
                      name, value, addr);
                }
              })},
+        {"compression_manager",
+         OptionTypeInfo::AsCustomSharedPtr<CompressionManager>(
+             offsetof(struct MutableCFOptions, compression_manager),
+             OptionVerificationType::kByNameAllowNull,
+             (OptionTypeFlags::kMutable | OptionTypeFlags::kAllowNull))},
         // End special case properties
         {"memtable_max_range_deletions",
          {offsetof(struct MutableCFOptions, memtable_max_range_deletions),
           OptionType::kUInt32T, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
-
+        {"memtable_op_scan_flush_trigger",
+         {offsetof(struct MutableCFOptions, memtable_op_scan_flush_trigger),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"memtable_avg_op_scan_flush_trigger",
+         {offsetof(struct MutableCFOptions, memtable_avg_op_scan_flush_trigger),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
 };
 
 static std::unordered_map<std::string, OptionTypeInfo>
@@ -736,6 +783,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct ImmutableCFOptions, force_consistency_checks),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
+        {"disallow_memtable_writes",
+         {offsetof(struct ImmutableCFOptions, disallow_memtable_writes),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
         {"default_temperature",
          {offsetof(struct ImmutableCFOptions, default_temperature),
           OptionType::kTemperature, OptionVerificationType::kNormal,
@@ -745,9 +796,7 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {0, OptionType::kInt, OptionVerificationType::kDeprecated,
           OptionTypeFlags::kNone}},
         {"max_write_buffer_number_to_maintain",
-         {offsetof(struct ImmutableCFOptions,
-                   max_write_buffer_number_to_maintain),
-          OptionType::kInt, OptionVerificationType::kNormal,
+         {0, OptionType::kInt, OptionVerificationType::kDeprecated,
           OptionTypeFlags::kNone, nullptr}},
         {"max_write_buffer_size_to_maintain",
          {offsetof(struct ImmutableCFOptions,
@@ -866,6 +915,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct ImmutableCFOptions, persist_user_defined_timestamps),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kCompareLoose}},
+        {"cf_allow_ingest_behind",
+         {offsetof(struct ImmutableCFOptions, cf_allow_ingest_behind),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
 };
 
 const std::string OptionsHelper::kCFOptionsName = "ColumnFamilyOptions";
@@ -983,8 +1036,6 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options)
       compaction_filter_factory(cf_options.compaction_filter_factory),
       min_write_buffer_number_to_merge(
           cf_options.min_write_buffer_number_to_merge),
-      max_write_buffer_number_to_maintain(
-          cf_options.max_write_buffer_number_to_maintain),
       max_write_buffer_size_to_maintain(
           cf_options.max_write_buffer_size_to_maintain),
       inplace_update_support(cf_options.inplace_update_support),
@@ -998,6 +1049,7 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options)
       num_levels(cf_options.num_levels),
       optimize_filters_for_hits(cf_options.optimize_filters_for_hits),
       force_consistency_checks(cf_options.force_consistency_checks),
+      disallow_memtable_writes(cf_options.disallow_memtable_writes),
       default_temperature(cf_options.default_temperature),
       memtable_insert_with_hint_prefix_extractor(
           cf_options.memtable_insert_with_hint_prefix_extractor),
@@ -1006,7 +1058,8 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options)
       sst_partitioner_factory(cf_options.sst_partitioner_factory),
       blob_cache(cf_options.blob_cache),
       persist_user_defined_timestamps(
-          cf_options.persist_user_defined_timestamps) {}
+          cf_options.persist_user_defined_timestamps),
+      cf_allow_ingest_behind(cf_options.cf_allow_ingest_behind) {}
 
 ImmutableOptions::ImmutableOptions() : ImmutableOptions(Options()) {}
 
@@ -1034,10 +1087,12 @@ uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) {
   if (op1 == 0 || op2 <= 0) {
     return 0;
   }
-  if (std::numeric_limits<uint64_t>::max() / op1 < op2) {
-    return op1;
+
+  if (op1 * op2 < static_cast<double>(std::numeric_limits<uint64_t>::max())) {
+    return static_cast<uint64_t>(op1 * op2);
   }
-  return static_cast<uint64_t>(op1 * op2);
+
+  return op1;
 }
 
 // when level_compaction_dynamic_level_bytes is true and leveled compaction
@@ -1132,6 +1187,8 @@ void MutableCFOptions::Dump(Logger* log) const {
                  target_file_size_base);
   ROCKS_LOG_INFO(log, "              target_file_size_multiplier: %d",
                  target_file_size_multiplier);
+  ROCKS_LOG_INFO(log, "         target_file_size_is_upper_bound: %d",
+                 target_file_size_is_upper_bound);
   ROCKS_LOG_INFO(log, "                 max_bytes_for_level_base: %" PRIu64,
                  max_bytes_for_level_base);
   ROCKS_LOG_INFO(log, "           max_bytes_for_level_multiplier: %f",
@@ -1147,6 +1204,8 @@ void MutableCFOptions::Dump(Logger* log) const {
                  preserve_internal_time_seconds);
   ROCKS_LOG_INFO(log, "                   paranoid_memory_checks: %d",
                  paranoid_memory_checks);
+  ROCKS_LOG_INFO(log, "memtable_veirfy_per_key_checksum_on_seek: %d",
+                 memtable_veirfy_per_key_checksum_on_seek);
   std::string result;
   char buf[10];
   for (const auto m : max_bytes_for_level_multiplier_additional) {
@@ -1175,7 +1234,10 @@ void MutableCFOptions::Dump(Logger* log) const {
                  bottommost_file_compaction_delay);
   ROCKS_LOG_INFO(log, "                   uncache_aggressiveness: %" PRIu32,
                  uncache_aggressiveness);
-
+  ROCKS_LOG_INFO(log, "             memtable_op_scan_flush_trigger: %" PRIu32,
+                 memtable_op_scan_flush_trigger);
+  ROCKS_LOG_INFO(log, "         memtable_avg_op_scan_flush_trigger: %" PRIu32,
+                 memtable_avg_op_scan_flush_trigger);
   // Universal Compaction Options
   ROCKS_LOG_INFO(log, "compaction_options_universal.size_ratio : %d",
                  compaction_options_universal.size_ratio);
@@ -1198,12 +1260,18 @@ void MutableCFOptions::Dump(Logger* log) const {
       static_cast<int>(compaction_options_universal.allow_trivial_move));
   ROCKS_LOG_INFO(log, "compaction_options_universal.incremental        : %d",
                  static_cast<int>(compaction_options_universal.incremental));
+  ROCKS_LOG_INFO(log, "compaction_options_universal.reduce_file_locking : %d",
+                 compaction_options_universal.reduce_file_locking);
 
   // FIFO Compaction Options
   ROCKS_LOG_INFO(log, "compaction_options_fifo.max_table_files_size : %" PRIu64,
                  compaction_options_fifo.max_table_files_size);
   ROCKS_LOG_INFO(log, "compaction_options_fifo.allow_compaction : %d",
                  compaction_options_fifo.allow_compaction);
+  ROCKS_LOG_INFO(log, "compaction_options_fifo.max_data_files_size : %" PRIu64,
+                 compaction_options_fifo.max_data_files_size);
+  ROCKS_LOG_INFO(log, "compaction_options_fifo.use_kv_ratio_compaction : %d",
+                 compaction_options_fifo.use_kv_ratio_compaction);
 
   // Blob file related options
   ROCKS_LOG_INFO(log, "                        enable_blob_files: %s",
diff --git a/options/cf_options.h b/options/cf_options.h
index 751e7b46d52b..3f5804445142 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -40,8 +40,6 @@ struct ImmutableCFOptions {
 
   int min_write_buffer_number_to_merge;
 
-  int max_write_buffer_number_to_maintain;
-
   int64_t max_write_buffer_size_to_maintain;
 
   bool inplace_update_support;
@@ -68,6 +66,8 @@ struct ImmutableCFOptions {
 
   bool force_consistency_checks;
 
+  bool disallow_memtable_writes;
+
   Temperature default_temperature;
 
   std::shared_ptr<const SliceTransform>
@@ -82,6 +82,8 @@ struct ImmutableCFOptions {
   std::shared_ptr<Cache> blob_cache;
 
   bool persist_user_defined_timestamps;
+
+  bool cf_allow_ingest_behind;
 };
 
 struct ImmutableOptions : public ImmutableDBOptions, public ImmutableCFOptions {
@@ -130,6 +132,8 @@ struct MutableCFOptions {
         max_compaction_bytes(options.max_compaction_bytes),
         target_file_size_base(options.target_file_size_base),
         target_file_size_multiplier(options.target_file_size_multiplier),
+        target_file_size_is_upper_bound(
+            options.target_file_size_is_upper_bound),
         max_bytes_for_level_base(options.max_bytes_for_level_base),
         max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
         ttl(options.ttl),
@@ -141,6 +145,7 @@ struct MutableCFOptions {
         preclude_last_level_data_seconds(
             options.preclude_last_level_data_seconds),
         preserve_internal_time_seconds(options.preserve_internal_time_seconds),
+        verify_output_flags(options.verify_output_flags),
         enable_blob_files(options.enable_blob_files),
         min_blob_size(options.min_blob_size),
         blob_file_size(options.blob_file_size),
@@ -161,19 +166,25 @@ struct MutableCFOptions {
         bottommost_compression(options.bottommost_compression),
         compression_opts(options.compression_opts),
         bottommost_compression_opts(options.bottommost_compression_opts),
+        compression_manager(options.compression_manager),
         last_level_temperature(options.last_level_temperature),
         default_write_temperature(options.default_write_temperature),
         memtable_protection_bytes_per_key(
             options.memtable_protection_bytes_per_key),
         block_protection_bytes_per_key(options.block_protection_bytes_per_key),
         paranoid_memory_checks(options.paranoid_memory_checks),
+        memtable_veirfy_per_key_checksum_on_seek(
+            options.memtable_veirfy_per_key_checksum_on_seek),
         sample_for_compression(
             options.sample_for_compression),  // TODO: is 0 fine here?
         compression_per_level(options.compression_per_level),
         memtable_max_range_deletions(options.memtable_max_range_deletions),
         bottommost_file_compaction_delay(
             options.bottommost_file_compaction_delay),
-        uncache_aggressiveness(options.uncache_aggressiveness) {
+        uncache_aggressiveness(options.uncache_aggressiveness),
+        memtable_op_scan_flush_trigger(options.memtable_op_scan_flush_trigger),
+        memtable_avg_op_scan_flush_trigger(
+            options.memtable_avg_op_scan_flush_trigger) {
     RefreshDerivedOptions(options.num_levels, options.compaction_style);
   }
 
@@ -198,6 +209,7 @@ struct MutableCFOptions {
         max_compaction_bytes(0),
         target_file_size_base(0),
         target_file_size_multiplier(0),
+        target_file_size_is_upper_bound(false),
         max_bytes_for_level_base(0),
         max_bytes_for_level_multiplier(0),
         ttl(0),
@@ -205,6 +217,7 @@ struct MutableCFOptions {
         compaction_options_fifo(),
         preclude_last_level_data_seconds(0),
         preserve_internal_time_seconds(0),
+        verify_output_flags(VerifyOutputFlags::kVerifyNone),
         enable_blob_files(false),
         min_blob_size(0),
         blob_file_size(0),
@@ -225,10 +238,13 @@ struct MutableCFOptions {
         memtable_protection_bytes_per_key(0),
         block_protection_bytes_per_key(0),
         paranoid_memory_checks(false),
+        memtable_veirfy_per_key_checksum_on_seek(false),
         sample_for_compression(0),
         memtable_max_range_deletions(0),
         bottommost_file_compaction_delay(0),
-        uncache_aggressiveness(0) {}
+        uncache_aggressiveness(0),
+        memtable_op_scan_flush_trigger(0),
+        memtable_avg_op_scan_flush_trigger(0) {}
 
   explicit MutableCFOptions(const Options& options);
 
@@ -249,9 +265,7 @@ struct MutableCFOptions {
 
   void Dump(Logger* log) const;
 
-#if __cplusplus >= 202002L
   bool operator==(const MutableCFOptions& rhs) const = default;
-#endif
 
   // Memtable related options
   size_t write_buffer_size;
@@ -295,6 +309,7 @@ struct MutableCFOptions {
   uint64_t max_compaction_bytes;
   uint64_t target_file_size_base;
   int target_file_size_multiplier;
+  bool target_file_size_is_upper_bound;
   uint64_t max_bytes_for_level_base;
   double max_bytes_for_level_multiplier;
   uint64_t ttl;
@@ -304,6 +319,7 @@ struct MutableCFOptions {
   CompactionOptionsUniversal compaction_options_universal;
   uint64_t preclude_last_level_data_seconds;
   uint64_t preserve_internal_time_seconds;
+  VerifyOutputFlags verify_output_flags;
 
   // Blob file related options
   bool enable_blob_files;
@@ -325,17 +341,21 @@ struct MutableCFOptions {
   CompressionType bottommost_compression;
   CompressionOptions compression_opts;
   CompressionOptions bottommost_compression_opts;
+  std::shared_ptr<CompressionManager> compression_manager;
   Temperature last_level_temperature;
   Temperature default_write_temperature;
   uint32_t memtable_protection_bytes_per_key;
   uint8_t block_protection_bytes_per_key;
   bool paranoid_memory_checks;
+  bool memtable_veirfy_per_key_checksum_on_seek;
 
   uint64_t sample_for_compression;
   std::vector<CompressionType> compression_per_level;
   uint32_t memtable_max_range_deletions;
   uint32_t bottommost_file_compaction_delay;
   uint32_t uncache_aggressiveness;
+  uint32_t memtable_op_scan_flush_trigger;
+  uint32_t memtable_avg_op_scan_flush_trigger;
 
   // Derived options
   // Per-level target file size.
diff --git a/options/configurable.cc b/options/configurable.cc
index 76ea54116a23..fe1f7efc9ab7 100644
--- a/options/configurable.cc
+++ b/options/configurable.cc
@@ -272,7 +272,8 @@ Status ConfigurableHelper::ConfigureOptions(
   if (config_options.ignore_unknown_options) {
     s = Status::OK();
   } else if (s.ok() && unused == nullptr && !remaining.empty()) {
-    s = Status::NotFound("Could not find option: ", remaining.begin()->first);
+    s = Status::NotFound("Extra option not recognized",
+                         remaining.begin()->first);
   }
   return s;
 }
@@ -369,7 +370,7 @@ Status ConfigurableHelper::ConfigureSingleOption(
   const auto opt_info =
       FindOption(configurable, opt_name, &elem_name, &opt_ptr);
   if (opt_info == nullptr) {
-    return Status::NotFound("Could not find option: ", name);
+    return Status::NotFound("Could not find option", name);
   } else {
     return ConfigureOption(config_options, configurable, *opt_info, opt_name,
                            elem_name, value, opt_ptr);
@@ -465,7 +466,7 @@ Status ConfigurableHelper::ConfigureOption(
     return configurable.ParseOption(config_options, opt_info, name, value,
                                     opt_ptr);
   } else {
-    return Status::NotFound("Could not find option: ", name);
+    return Status::NotFound("Unknown how to configure option", name);
   }
 }
 
diff --git a/options/customizable_test.cc b/options/customizable_test.cc
index 8549e7947fa8..53eac3cec182 100644
--- a/options/customizable_test.cc
+++ b/options/customizable_test.cc
@@ -1281,8 +1281,6 @@ class MockSliceTransform : public SliceTransform {
   Slice Transform(const Slice& /*key*/) const override { return Slice(); }
 
   bool InDomain(const Slice& /*key*/) const override { return false; }
-
-  bool InRange(const Slice& /*key*/) const override { return false; }
 };
 
 class MockMemoryAllocator : public BaseMemoryAllocator {
diff --git a/options/db_options.cc b/options/db_options.cc
index ea8f4b22d7be..2384355264c2 100644
--- a/options/db_options.cc
+++ b/options/db_options.cc
@@ -124,6 +124,18 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct MutableDBOptions, max_background_flushes),
           OptionType::kInt, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
+        {"max_manifest_file_size",
+         {offsetof(struct MutableDBOptions, max_manifest_file_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_manifest_space_amp_pct",
+         {offsetof(struct MutableDBOptions, max_manifest_space_amp_pct),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"manifest_preallocation_size",
+         {offsetof(struct MutableDBOptions, manifest_preallocation_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
         {"daily_offpeak_time_utc",
          {offsetof(struct MutableDBOptions, daily_offpeak_time_utc),
           OptionType::kString, OptionVerificationType::kNormal,
@@ -141,6 +153,7 @@ static std::unordered_map<std::string, OptionTypeInfo>
           std::shared_ptr<Statistics> statistics;
           std::vector<DbPath> db_paths;
           FileTypeSet checksum_handoff_file_types;
+          CompactionStyleSet calculate_sst_write_lifetime_hint_set;
          */
         {"advise_random_on_open",
          {offsetof(struct ImmutableDBOptions, advise_random_on_open),
@@ -246,9 +259,7 @@ static std::unordered_map<std::string, OptionTypeInfo>
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
         {"skip_checking_sst_file_sizes_on_db_open",
-         {offsetof(struct ImmutableDBOptions,
-                   skip_checking_sst_file_sizes_on_db_open),
-          OptionType::kBoolean, OptionVerificationType::kNormal,
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
           OptionTypeFlags::kNone}},
         {"new_table_reader_for_compaction_inputs",
          {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
@@ -287,10 +298,6 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct ImmutableDBOptions, log_file_time_to_roll),
           OptionType::kSizeT, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
-        {"manifest_preallocation_size",
-         {offsetof(struct ImmutableDBOptions, manifest_preallocation_size),
-          OptionType::kSizeT, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
         {"max_log_file_size",
          {offsetof(struct ImmutableDBOptions, max_log_file_size),
           OptionType::kSizeT, OptionVerificationType::kNormal,
@@ -309,17 +316,12 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct ImmutableDBOptions, WAL_ttl_seconds),
           OptionType::kUInt64T, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
-        {"max_manifest_file_size",
-         {offsetof(struct ImmutableDBOptions, max_manifest_file_size),
-          OptionType::kUInt64T, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
         {"persist_stats_to_disk",
          {offsetof(struct ImmutableDBOptions, persist_stats_to_disk),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
         {"fail_if_options_file_error",
-         {offsetof(struct ImmutableDBOptions, fail_if_options_file_error),
-          OptionType::kBoolean, OptionVerificationType::kNormal,
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
           OptionTypeFlags::kNone}},
         {"enable_pipelined_write",
          {offsetof(struct ImmutableDBOptions, enable_pipelined_write),
@@ -657,7 +659,7 @@ class DBOptionsConfigurable : public MutableDBConfigurable {
   explicit DBOptionsConfigurable(
       const DBOptions& opts,
       const std::unordered_map<std::string, std::string>* map = nullptr)
-      : MutableDBConfigurable(MutableDBOptions(opts), map), db_options_(opts) {
+      : MutableDBConfigurable(MutableDBOptions{opts}, map), db_options_(opts) {
     // The ImmutableDBOptions currently requires the env to be non-null.  Make
     // sure it is
     if (opts.env != nullptr) {
@@ -708,7 +710,7 @@ std::unique_ptr<Configurable> DBOptionsAsConfigurable(
   return ptr;
 }
 
-ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(Options()) {}
+ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(DBOptions{}) {}
 
 ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
     : create_if_missing(options.create_if_missing),
@@ -737,13 +739,11 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
       log_file_time_to_roll(options.log_file_time_to_roll),
       keep_log_file_num(options.keep_log_file_num),
       recycle_log_file_num(options.recycle_log_file_num),
-      max_manifest_file_size(options.max_manifest_file_size),
       table_cache_numshardbits(options.table_cache_numshardbits),
       WAL_ttl_seconds(options.WAL_ttl_seconds),
       WAL_size_limit_MB(options.WAL_size_limit_MB),
       max_write_batch_group_size_bytes(
           options.max_write_batch_group_size_bytes),
-      manifest_preallocation_size(options.manifest_preallocation_size),
       allow_mmap_reads(options.allow_mmap_reads),
       allow_mmap_writes(options.allow_mmap_writes),
       use_direct_reads(options.use_direct_reads),
@@ -765,13 +765,10 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
       write_thread_max_yield_usec(options.write_thread_max_yield_usec),
       write_thread_slow_yield_usec(options.write_thread_slow_yield_usec),
       skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
-      skip_checking_sst_file_sizes_on_db_open(
-          options.skip_checking_sst_file_sizes_on_db_open),
       wal_recovery_mode(options.wal_recovery_mode),
       allow_2pc(options.allow_2pc),
       row_cache(options.row_cache),
       wal_filter(options.wal_filter),
-      fail_if_options_file_error(options.fail_if_options_file_error),
       dump_malloc_stats(options.dump_malloc_stats),
       avoid_flush_during_recovery(options.avoid_flush_during_recovery),
       allow_ingest_behind(options.allow_ingest_behind),
@@ -801,7 +798,9 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
       follower_catchup_retry_count(options.follower_catchup_retry_count),
       follower_catchup_retry_wait_ms(options.follower_catchup_retry_wait_ms),
       metadata_write_temperature(options.metadata_write_temperature),
-      wal_write_temperature(options.wal_write_temperature) {
+      wal_write_temperature(options.wal_write_temperature),
+      calculate_sst_write_lifetime_hint_set(
+          options.calculate_sst_write_lifetime_hint_set) {
   fs = env->GetFileSystem();
   clock = env->GetSystemClock().get();
   logger = info_log.get();
@@ -849,9 +848,6 @@ void ImmutableDBOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(
       log, "                      Options.max_log_file_size: %" ROCKSDB_PRIszt,
       max_log_file_size);
-  ROCKS_LOG_HEADER(log,
-                   "                 Options.max_manifest_file_size: %" PRIu64,
-                   max_manifest_file_size);
   ROCKS_LOG_HEADER(
       log, "                  Options.log_file_time_to_roll: %" ROCKSDB_PRIszt,
       log_file_time_to_roll);
@@ -891,9 +887,6 @@ void ImmutableDBOptions::Dump(Logger* log) const {
                    "                       "
                    "Options.max_write_batch_group_size_bytes: %" PRIu64,
                    max_write_batch_group_size_bytes);
-  ROCKS_LOG_HEADER(
-      log, "            Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
-      manifest_preallocation_size);
   ROCKS_LOG_HEADER(log, "                    Options.is_fd_close_on_exec: %d",
                    is_fd_close_on_exec);
   ROCKS_LOG_HEADER(log, "                  Options.advise_random_on_open: %d",
@@ -1024,24 +1017,7 @@ const std::string& ImmutableDBOptions::GetWalDir(
   }
 }
 
-MutableDBOptions::MutableDBOptions()
-    : max_background_jobs(2),
-      max_background_compactions(-1),
-      max_subcompactions(0),
-      avoid_flush_during_shutdown(false),
-      writable_file_max_buffer_size(1024 * 1024),
-      delayed_write_rate(2 * 1024U * 1024U),
-      max_total_wal_size(0),
-      delete_obsolete_files_period_micros(6ULL * 60 * 60 * 1000000),
-      stats_dump_period_sec(600),
-      stats_persist_period_sec(600),
-      stats_history_buffer_size(1024 * 1024),
-      max_open_files(-1),
-      bytes_per_sync(0),
-      wal_bytes_per_sync(0),
-      strict_bytes_per_sync(false),
-      compaction_readahead_size(0),
-      max_background_flushes(-1) {}
+MutableDBOptions::MutableDBOptions() : MutableDBOptions(DBOptions{}) {}
 
 MutableDBOptions::MutableDBOptions(const DBOptions& options)
     : max_background_jobs(options.max_background_jobs),
@@ -1062,6 +1038,9 @@ MutableDBOptions::MutableDBOptions(const DBOptions& options)
       strict_bytes_per_sync(options.strict_bytes_per_sync),
       compaction_readahead_size(options.compaction_readahead_size),
       max_background_flushes(options.max_background_flushes),
+      max_manifest_file_size(options.max_manifest_file_size),
+      max_manifest_space_amp_pct(options.max_manifest_space_amp_pct),
+      manifest_preallocation_size(options.manifest_preallocation_size),
       daily_offpeak_time_utc(options.daily_offpeak_time_utc) {}
 
 void MutableDBOptions::Dump(Logger* log) const {
@@ -1106,6 +1085,15 @@ void MutableDBOptions::Dump(Logger* log) const {
                    compaction_readahead_size);
   ROCKS_LOG_HEADER(log, "                 Options.max_background_flushes: %d",
                    max_background_flushes);
+  ROCKS_LOG_HEADER(log,
+                   "                 Options.max_manifest_file_size: %" PRIu64,
+                   max_manifest_file_size);
+  ROCKS_LOG_HEADER(log,
+                   "                 Options.max_manifest_space_amp_pct: %d",
+                   max_manifest_space_amp_pct);
+  ROCKS_LOG_HEADER(
+      log, "            Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
+      manifest_preallocation_size);
   ROCKS_LOG_HEADER(log, "Options.daily_offpeak_time_utc: %s",
                    daily_offpeak_time_utc.c_str());
 }
diff --git a/options/db_options.h b/options/db_options.h
index df0854f1dd61..cc978d907dbb 100644
--- a/options/db_options.h
+++ b/options/db_options.h
@@ -47,12 +47,10 @@ struct ImmutableDBOptions {
   size_t log_file_time_to_roll;
   size_t keep_log_file_num;
   size_t recycle_log_file_num;
-  uint64_t max_manifest_file_size;
   int table_cache_numshardbits;
   uint64_t WAL_ttl_seconds;
   uint64_t WAL_size_limit_MB;
   uint64_t max_write_batch_group_size_bytes;
-  size_t manifest_preallocation_size;
   bool allow_mmap_reads;
   bool allow_mmap_writes;
   bool use_direct_reads;
@@ -72,12 +70,10 @@ struct ImmutableDBOptions {
   uint64_t write_thread_max_yield_usec;
   uint64_t write_thread_slow_yield_usec;
   bool skip_stats_update_on_db_open;
-  bool skip_checking_sst_file_sizes_on_db_open;
   WALRecoveryMode wal_recovery_mode;
   bool allow_2pc;
   std::shared_ptr<Cache> row_cache;
   WalFilter* wal_filter;
-  bool fail_if_options_file_error;
   bool dump_malloc_stats;
   bool avoid_flush_during_recovery;
   bool allow_ingest_behind;
@@ -107,6 +103,7 @@ struct ImmutableDBOptions {
   uint64_t follower_catchup_retry_wait_ms;
   Temperature metadata_write_temperature;
   Temperature wal_write_temperature;
+  CompactionStyleSet calculate_sst_write_lifetime_hint_set;
 
   // Beginning convenience/helper objects that are not part of the base
   // DBOptions
@@ -146,6 +143,9 @@ struct MutableDBOptions {
   bool strict_bytes_per_sync;
   size_t compaction_readahead_size;
   int max_background_flushes;
+  uint64_t max_manifest_file_size;
+  int max_manifest_space_amp_pct;
+  size_t manifest_preallocation_size;
   std::string daily_offpeak_time_utc;
 };
 
diff --git a/options/options.cc b/options/options.cc
index c1e68260a14d..d9f64f93d235 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -43,8 +43,6 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
     : max_write_buffer_number(options.max_write_buffer_number),
       min_write_buffer_number_to_merge(
           options.min_write_buffer_number_to_merge),
-      max_write_buffer_number_to_maintain(
-          options.max_write_buffer_number_to_maintain),
       max_write_buffer_size_to_maintain(
           options.max_write_buffer_size_to_maintain),
       inplace_update_support(options.inplace_update_support),
@@ -65,6 +63,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       level0_stop_writes_trigger(options.level0_stop_writes_trigger),
       target_file_size_base(options.target_file_size_base),
       target_file_size_multiplier(options.target_file_size_multiplier),
+      target_file_size_is_upper_bound(options.target_file_size_is_upper_bound),
       level_compaction_dynamic_level_bytes(
           options.level_compaction_dynamic_level_bytes),
       max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
@@ -90,6 +89,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       paranoid_file_checks(options.paranoid_file_checks),
       force_consistency_checks(options.force_consistency_checks),
       report_bg_io_stats(options.report_bg_io_stats),
+      disallow_memtable_writes(options.disallow_memtable_writes),
       ttl(options.ttl),
       periodic_compaction_seconds(options.periodic_compaction_seconds),
       sample_for_compression(options.sample_for_compression),
@@ -112,7 +112,10 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       blob_file_starting_level(options.blob_file_starting_level),
       blob_cache(options.blob_cache),
       prepopulate_blob_cache(options.prepopulate_blob_cache),
-      persist_user_defined_timestamps(options.persist_user_defined_timestamps) {
+      persist_user_defined_timestamps(options.persist_user_defined_timestamps),
+      memtable_op_scan_flush_trigger(options.memtable_op_scan_flush_trigger),
+      memtable_avg_op_scan_flush_trigger(
+          options.memtable_avg_op_scan_flush_trigger) {
   assert(memtable_factory.get() != nullptr);
   if (max_bytes_for_level_multiplier_additional.size() <
       static_cast<unsigned int>(num_levels)) {
@@ -191,8 +194,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(log, "            Options.num_levels: %d", num_levels);
   ROCKS_LOG_HEADER(log, "       Options.min_write_buffer_number_to_merge: %d",
                    min_write_buffer_number_to_merge);
-  ROCKS_LOG_HEADER(log, "    Options.max_write_buffer_number_to_maintain: %d",
-                   max_write_buffer_number_to_maintain);
   ROCKS_LOG_HEADER(log,
                    "    Options.max_write_buffer_size_to_maintain: %" PRIu64,
                    max_write_buffer_size_to_maintain);
@@ -269,6 +270,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
                    target_file_size_base);
   ROCKS_LOG_HEADER(log, "            Options.target_file_size_multiplier: %d",
                    target_file_size_multiplier);
+  ROCKS_LOG_HEADER(log,
+                   "           Options.target_file_size_is_upper_bound: %d",
+                   target_file_size_is_upper_bound);
   ROCKS_LOG_HEADER(log,
                    "               Options.max_bytes_for_level_base: %" PRIu64,
                    max_bytes_for_level_base);
@@ -286,6 +290,12 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(log,
                    "      Options.max_sequential_skip_in_iterations: %" PRIu64,
                    max_sequential_skip_in_iterations);
+  ROCKS_LOG_HEADER(log,
+                   "         Options.memtable_op_scan_flush_trigger: %" PRIu32,
+                   memtable_op_scan_flush_trigger);
+  ROCKS_LOG_HEADER(log,
+                   "     Options.memtable_avg_op_scan_flush_trigger: %" PRIu32,
+                   memtable_avg_op_scan_flush_trigger);
   ROCKS_LOG_HEADER(log,
                    "                   Options.max_compaction_bytes: %" PRIu64,
                    max_compaction_bytes);
@@ -352,6 +362,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
                    str_compaction_stop_style.c_str());
   ROCKS_LOG_HEADER(log, "Options.compaction_options_universal.max_read_amp: %d",
                    compaction_options_universal.max_read_amp);
+  ROCKS_LOG_HEADER(
+      log, "Options.compaction_options_universal.reduce_file_locking: %d",
+      compaction_options_universal.reduce_file_locking);
   ROCKS_LOG_HEADER(
       log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64,
       compaction_options_fifo.max_table_files_size);
@@ -395,6 +408,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
                    force_consistency_checks);
   ROCKS_LOG_HEADER(log, "               Options.report_bg_io_stats: %d",
                    report_bg_io_stats);
+  ROCKS_LOG_HEADER(log, "               Options.disallow_memtable_writes: %d",
+                   disallow_memtable_writes);
   ROCKS_LOG_HEADER(log, "                              Options.ttl: %" PRIu64,
                    ttl);
   ROCKS_LOG_HEADER(log,
@@ -451,6 +466,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
                    experimental_mempurge_threshold);
   ROCKS_LOG_HEADER(log, "           Options.memtable_max_range_deletions: %d",
                    memtable_max_range_deletions);
+  ROCKS_LOG_HEADER(log, "                 Options.cf_allow_ingest_behind: %s",
+                   cf_allow_ingest_behind ? "true" : "false");
 }  // ColumnFamilyOptions::Dump
 
 void Options::Dump(Logger* log) const {
diff --git a/options/options_helper.cc b/options/options_helper.cc
index fad122166a0a..addada94f927 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -99,13 +99,15 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
   options.log_file_time_to_roll = immutable_db_options.log_file_time_to_roll;
   options.keep_log_file_num = immutable_db_options.keep_log_file_num;
   options.recycle_log_file_num = immutable_db_options.recycle_log_file_num;
-  options.max_manifest_file_size = immutable_db_options.max_manifest_file_size;
+  options.max_manifest_file_size = mutable_db_options.max_manifest_file_size;
+  options.max_manifest_space_amp_pct =
+      mutable_db_options.max_manifest_space_amp_pct;
   options.table_cache_numshardbits =
       immutable_db_options.table_cache_numshardbits;
   options.WAL_ttl_seconds = immutable_db_options.WAL_ttl_seconds;
   options.WAL_size_limit_MB = immutable_db_options.WAL_size_limit_MB;
   options.manifest_preallocation_size =
-      immutable_db_options.manifest_preallocation_size;
+      mutable_db_options.manifest_preallocation_size;
   options.allow_mmap_reads = immutable_db_options.allow_mmap_reads;
   options.allow_mmap_writes = immutable_db_options.allow_mmap_writes;
   options.use_direct_reads = immutable_db_options.use_direct_reads;
@@ -147,14 +149,10 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
       immutable_db_options.write_thread_slow_yield_usec;
   options.skip_stats_update_on_db_open =
       immutable_db_options.skip_stats_update_on_db_open;
-  options.skip_checking_sst_file_sizes_on_db_open =
-      immutable_db_options.skip_checking_sst_file_sizes_on_db_open;
   options.wal_recovery_mode = immutable_db_options.wal_recovery_mode;
   options.allow_2pc = immutable_db_options.allow_2pc;
   options.row_cache = immutable_db_options.row_cache;
   options.wal_filter = immutable_db_options.wal_filter;
-  options.fail_if_options_file_error =
-      immutable_db_options.fail_if_options_file_error;
   options.dump_malloc_stats = immutable_db_options.dump_malloc_stats;
   options.avoid_flush_during_recovery =
       immutable_db_options.avoid_flush_during_recovery;
@@ -199,6 +197,8 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
       immutable_db_options.metadata_write_temperature;
   options.wal_write_temperature = immutable_db_options.wal_write_temperature;
   options.compaction_service = immutable_db_options.compaction_service;
+  options.calculate_sst_write_lifetime_hint_set =
+      immutable_db_options.calculate_sst_write_lifetime_hint_set;
 }
 
 ColumnFamilyOptions BuildColumnFamilyOptions(
@@ -232,6 +232,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->block_protection_bytes_per_key =
       moptions.block_protection_bytes_per_key;
   cf_opts->paranoid_memory_checks = moptions.paranoid_memory_checks;
+  cf_opts->memtable_veirfy_per_key_checksum_on_seek =
+      moptions.memtable_veirfy_per_key_checksum_on_seek;
   cf_opts->bottommost_file_compaction_delay =
       moptions.bottommost_file_compaction_delay;
 
@@ -250,6 +252,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->max_compaction_bytes = moptions.max_compaction_bytes;
   cf_opts->target_file_size_base = moptions.target_file_size_base;
   cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier;
+  cf_opts->target_file_size_is_upper_bound =
+      moptions.target_file_size_is_upper_bound;
   cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base;
   cf_opts->max_bytes_for_level_multiplier =
       moptions.max_bytes_for_level_multiplier;
@@ -268,6 +272,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->compaction_options_fifo = moptions.compaction_options_fifo;
   cf_opts->compaction_options_universal = moptions.compaction_options_universal;
 
+  cf_opts->verify_output_flags = moptions.verify_output_flags;
+
   // Blob file related options
   cf_opts->enable_blob_files = moptions.enable_blob_files;
   cf_opts->min_blob_size = moptions.min_blob_size;
@@ -293,12 +299,17 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->compression_opts = moptions.compression_opts;
   cf_opts->bottommost_compression = moptions.bottommost_compression;
   cf_opts->bottommost_compression_opts = moptions.bottommost_compression_opts;
+  cf_opts->compression_manager = moptions.compression_manager;
   cf_opts->sample_for_compression = moptions.sample_for_compression;
   cf_opts->compression_per_level = moptions.compression_per_level;
   cf_opts->last_level_temperature = moptions.last_level_temperature;
   cf_opts->default_write_temperature = moptions.default_write_temperature;
   cf_opts->memtable_max_range_deletions = moptions.memtable_max_range_deletions;
   cf_opts->uncache_aggressiveness = moptions.uncache_aggressiveness;
+  cf_opts->memtable_op_scan_flush_trigger =
+      moptions.memtable_op_scan_flush_trigger;
+  cf_opts->memtable_avg_op_scan_flush_trigger =
+      moptions.memtable_avg_op_scan_flush_trigger;
 }
 
 void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
@@ -311,8 +322,6 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
   cf_opts->compaction_filter_factory = ioptions.compaction_filter_factory;
   cf_opts->min_write_buffer_number_to_merge =
       ioptions.min_write_buffer_number_to_merge;
-  cf_opts->max_write_buffer_number_to_maintain =
-      ioptions.max_write_buffer_number_to_maintain;
   cf_opts->max_write_buffer_size_to_maintain =
       ioptions.max_write_buffer_size_to_maintain;
   cf_opts->inplace_update_support = ioptions.inplace_update_support;
@@ -326,6 +335,7 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
   cf_opts->num_levels = ioptions.num_levels;
   cf_opts->optimize_filters_for_hits = ioptions.optimize_filters_for_hits;
   cf_opts->force_consistency_checks = ioptions.force_consistency_checks;
+  cf_opts->disallow_memtable_writes = ioptions.disallow_memtable_writes;
   cf_opts->memtable_insert_with_hint_prefix_extractor =
       ioptions.memtable_insert_with_hint_prefix_extractor;
   cf_opts->cf_paths = ioptions.cf_paths;
@@ -335,6 +345,7 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
   cf_opts->persist_user_defined_timestamps =
       ioptions.persist_user_defined_timestamps;
   cf_opts->default_temperature = ioptions.default_temperature;
+  cf_opts->cf_allow_ingest_behind = ioptions.cf_allow_ingest_behind;
 
   // TODO(yhchiang): find some way to handle the following derived options
   // * max_file_size
@@ -360,10 +371,9 @@ std::map<CompactionStopStyle, std::string>
         {kCompactionStopStyleTotalSize, "kCompactionStopStyleTotalSize"}};
 
 std::map<Temperature, std::string> OptionsHelper::temperature_to_string = {
-    {Temperature::kUnknown, "kUnknown"},
-    {Temperature::kHot, "kHot"},
-    {Temperature::kWarm, "kWarm"},
-    {Temperature::kCold, "kCold"}};
+    {Temperature::kUnknown, "kUnknown"}, {Temperature::kHot, "kHot"},
+    {Temperature::kWarm, "kWarm"},       {Temperature::kCool, "kCool"},
+    {Temperature::kCold, "kCold"},       {Temperature::kIce, "kIce"}};
 
 std::unordered_map<std::string, ChecksumType>
     OptionsHelper::checksum_type_string_map = {{"kNoChecksum", kNoChecksum},
@@ -382,6 +392,133 @@ std::unordered_map<std::string, CompressionType>
         {"kLZ4HCCompression", kLZ4HCCompression},
         {"kXpressCompression", kXpressCompression},
         {"kZSTD", kZSTD},
+        {"kCustomCompression80", kCustomCompression80},
+        {"kCustomCompression81", kCustomCompression81},
+        {"kCustomCompression82", kCustomCompression82},
+        {"kCustomCompression83", kCustomCompression83},
+        {"kCustomCompression84", kCustomCompression84},
+        {"kCustomCompression85", kCustomCompression85},
+        {"kCustomCompression86", kCustomCompression86},
+        {"kCustomCompression87", kCustomCompression87},
+        {"kCustomCompression88", kCustomCompression88},
+        {"kCustomCompression89", kCustomCompression89},
+        {"kCustomCompression8A", kCustomCompression8A},
+        {"kCustomCompression8B", kCustomCompression8B},
+        {"kCustomCompression8C", kCustomCompression8C},
+        {"kCustomCompression8D", kCustomCompression8D},
+        {"kCustomCompression8E", kCustomCompression8E},
+        {"kCustomCompression8F", kCustomCompression8F},
+        {"kCustomCompression90", kCustomCompression90},
+        {"kCustomCompression91", kCustomCompression91},
+        {"kCustomCompression92", kCustomCompression92},
+        {"kCustomCompression93", kCustomCompression93},
+        {"kCustomCompression94", kCustomCompression94},
+        {"kCustomCompression95", kCustomCompression95},
+        {"kCustomCompression96", kCustomCompression96},
+        {"kCustomCompression97", kCustomCompression97},
+        {"kCustomCompression98", kCustomCompression98},
+        {"kCustomCompression99", kCustomCompression99},
+        {"kCustomCompression9A", kCustomCompression9A},
+        {"kCustomCompression9B", kCustomCompression9B},
+        {"kCustomCompression9C", kCustomCompression9C},
+        {"kCustomCompression9D", kCustomCompression9D},
+        {"kCustomCompression9E", kCustomCompression9E},
+        {"kCustomCompression9F", kCustomCompression9F},
+        {"kCustomCompressionA0", kCustomCompressionA0},
+        {"kCustomCompressionA1", kCustomCompressionA1},
+        {"kCustomCompressionA2", kCustomCompressionA2},
+        {"kCustomCompressionA3", kCustomCompressionA3},
+        {"kCustomCompressionA4", kCustomCompressionA4},
+        {"kCustomCompressionA5", kCustomCompressionA5},
+        {"kCustomCompressionA6", kCustomCompressionA6},
+        {"kCustomCompressionA7", kCustomCompressionA7},
+        {"kCustomCompressionA8", kCustomCompressionA8},
+        {"kCustomCompressionA9", kCustomCompressionA9},
+        {"kCustomCompressionAA", kCustomCompressionAA},
+        {"kCustomCompressionAB", kCustomCompressionAB},
+        {"kCustomCompressionAC", kCustomCompressionAC},
+        {"kCustomCompressionAD", kCustomCompressionAD},
+        {"kCustomCompressionAE", kCustomCompressionAE},
+        {"kCustomCompressionAF", kCustomCompressionAF},
+        {"kCustomCompressionB0", kCustomCompressionB0},
+        {"kCustomCompressionB1", kCustomCompressionB1},
+        {"kCustomCompressionB2", kCustomCompressionB2},
+        {"kCustomCompressionB3", kCustomCompressionB3},
+        {"kCustomCompressionB4", kCustomCompressionB4},
+        {"kCustomCompressionB5", kCustomCompressionB5},
+        {"kCustomCompressionB6", kCustomCompressionB6},
+        {"kCustomCompressionB7", kCustomCompressionB7},
+        {"kCustomCompressionB8", kCustomCompressionB8},
+        {"kCustomCompressionB9", kCustomCompressionB9},
+        {"kCustomCompressionBA", kCustomCompressionBA},
+        {"kCustomCompressionBB", kCustomCompressionBB},
+        {"kCustomCompressionBC", kCustomCompressionBC},
+        {"kCustomCompressionBD", kCustomCompressionBD},
+        {"kCustomCompressionBE", kCustomCompressionBE},
+        {"kCustomCompressionBF", kCustomCompressionBF},
+        {"kCustomCompressionC0", kCustomCompressionC0},
+        {"kCustomCompressionC1", kCustomCompressionC1},
+        {"kCustomCompressionC2", kCustomCompressionC2},
+        {"kCustomCompressionC3", kCustomCompressionC3},
+        {"kCustomCompressionC4", kCustomCompressionC4},
+        {"kCustomCompressionC5", kCustomCompressionC5},
+        {"kCustomCompressionC6", kCustomCompressionC6},
+        {"kCustomCompressionC7", kCustomCompressionC7},
+        {"kCustomCompressionC8", kCustomCompressionC8},
+        {"kCustomCompressionC9", kCustomCompressionC9},
+        {"kCustomCompressionCA", kCustomCompressionCA},
+        {"kCustomCompressionCB", kCustomCompressionCB},
+        {"kCustomCompressionCC", kCustomCompressionCC},
+        {"kCustomCompressionCD", kCustomCompressionCD},
+        {"kCustomCompressionCE", kCustomCompressionCE},
+        {"kCustomCompressionCF", kCustomCompressionCF},
+        {"kCustomCompressionD0", kCustomCompressionD0},
+        {"kCustomCompressionD1", kCustomCompressionD1},
+        {"kCustomCompressionD2", kCustomCompressionD2},
+        {"kCustomCompressionD3", kCustomCompressionD3},
+        {"kCustomCompressionD4", kCustomCompressionD4},
+        {"kCustomCompressionD5", kCustomCompressionD5},
+        {"kCustomCompressionD6", kCustomCompressionD6},
+        {"kCustomCompressionD7", kCustomCompressionD7},
+        {"kCustomCompressionD8", kCustomCompressionD8},
+        {"kCustomCompressionD9", kCustomCompressionD9},
+        {"kCustomCompressionDA", kCustomCompressionDA},
+        {"kCustomCompressionDB", kCustomCompressionDB},
+        {"kCustomCompressionDC", kCustomCompressionDC},
+        {"kCustomCompressionDD", kCustomCompressionDD},
+        {"kCustomCompressionDE", kCustomCompressionDE},
+        {"kCustomCompressionDF", kCustomCompressionDF},
+        {"kCustomCompressionE0", kCustomCompressionE0},
+        {"kCustomCompressionE1", kCustomCompressionE1},
+        {"kCustomCompressionE2", kCustomCompressionE2},
+        {"kCustomCompressionE3", kCustomCompressionE3},
+        {"kCustomCompressionE4", kCustomCompressionE4},
+        {"kCustomCompressionE5", kCustomCompressionE5},
+        {"kCustomCompressionE6", kCustomCompressionE6},
+        {"kCustomCompressionE7", kCustomCompressionE7},
+        {"kCustomCompressionE8", kCustomCompressionE8},
+        {"kCustomCompressionE9", kCustomCompressionE9},
+        {"kCustomCompressionEA", kCustomCompressionEA},
+        {"kCustomCompressionEB", kCustomCompressionEB},
+        {"kCustomCompressionEC", kCustomCompressionEC},
+        {"kCustomCompressionED", kCustomCompressionED},
+        {"kCustomCompressionEE", kCustomCompressionEE},
+        {"kCustomCompressionEF", kCustomCompressionEF},
+        {"kCustomCompressionF0", kCustomCompressionF0},
+        {"kCustomCompressionF1", kCustomCompressionF1},
+        {"kCustomCompressionF2", kCustomCompressionF2},
+        {"kCustomCompressionF3", kCustomCompressionF3},
+        {"kCustomCompressionF4", kCustomCompressionF4},
+        {"kCustomCompressionF5", kCustomCompressionF5},
+        {"kCustomCompressionF6", kCustomCompressionF6},
+        {"kCustomCompressionF7", kCustomCompressionF7},
+        {"kCustomCompressionF8", kCustomCompressionF8},
+        {"kCustomCompressionF9", kCustomCompressionF9},
+        {"kCustomCompressionFA", kCustomCompressionFA},
+        {"kCustomCompressionFB", kCustomCompressionFB},
+        {"kCustomCompressionFC", kCustomCompressionFC},
+        {"kCustomCompressionFD", kCustomCompressionFD},
+        {"kCustomCompressionFE", kCustomCompressionFE},
         {"kDisableCompressionOption", kDisableCompressionOption}};
 
 const std::vector<CompressionType>& GetSupportedCompressions() {
@@ -564,7 +701,6 @@ bool SerializeSingleOptionHelper(const void* opt_address,
       return SerializeEnum<CompressionType>(
           compression_type_string_map,
           *(static_cast<const CompressionType*>(opt_address)), value);
-      break;
     case OptionType::kChecksumType:
       return SerializeEnum<ChecksumType>(
           checksum_type_string_map,
@@ -832,10 +968,9 @@ std::unordered_map<std::string, CompactionStopStyle>
 
 std::unordered_map<std::string, Temperature>
     OptionsHelper::temperature_string_map = {
-        {"kUnknown", Temperature::kUnknown},
-        {"kHot", Temperature::kHot},
-        {"kWarm", Temperature::kWarm},
-        {"kCold", Temperature::kCold}};
+        {"kUnknown", Temperature::kUnknown}, {"kHot", Temperature::kHot},
+        {"kWarm", Temperature::kWarm},       {"kCool", Temperature::kCool},
+        {"kCold", Temperature::kCold},       {"kIce", Temperature::kIce}};
 
 std::unordered_map<std::string, PrepopulateBlobCache>
     OptionsHelper::prepopulate_blob_cache_string_map = {
@@ -907,7 +1042,7 @@ Status OptionTypeInfo::Parse(const ConfigOptions& config_options,
                                        : value;
 
     if (opt_ptr == nullptr) {
-      return Status::NotFound("Could not find option", opt_name);
+      return Status::NotFound("Nullptr option", opt_name);
     } else if (parse_func_ != nullptr) {
       ConfigOptions copy = config_options;
       copy.invoke_prepare_options = false;
diff --git a/options/options_helper.h b/options/options_helper.h
index f03179066eaf..74e953b9f507 100644
--- a/options/options_helper.h
+++ b/options/options_helper.h
@@ -72,6 +72,9 @@ std::unique_ptr<Configurable> CFOptionsAsConfigurable(
 Status StringToMap(const std::string& opts_str,
                    std::unordered_map<std::string, std::string>* opts_map);
 
+Status GetStringFromCompressionType(std::string* compression_str,
+                                    CompressionType compression_type);
+
 struct OptionsHelper {
   static const std::string kCFOptionsName /*= "ColumnFamilyOptions"*/;
   static const std::string kDBOptionsName /*= "DBOptions" */;
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index d6660908d8b8..3c12a9e859a9 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -129,6 +129,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
        sizeof(CacheUsageOptions)},
       {offsetof(struct BlockBasedTableOptions, filter_policy),
        sizeof(std::shared_ptr<const FilterPolicy>)},
+      {offsetof(struct BlockBasedTableOptions, user_defined_index_factory),
+       sizeof(std::shared_ptr<UserDefinedIndexFactory>)},
   };
 
   // In this test, we catch a new option of BlockBasedTableOptions that is not
@@ -180,6 +182,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
       "pin_l0_filter_and_index_blocks_in_cache=1;"
       "pin_top_level_index_and_filter=1;"
       "index_type=kHashSearch;"
+      "index_block_search_type=kBinary;"
       "data_block_index_type=kDataBlockBinaryAndHash;"
       "index_shortening=kNoShortening;"
       "data_block_hash_table_util_ratio=0.75;"
@@ -198,10 +201,13 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
       "verify_compression=true;read_amp_bytes_per_bit=0;"
       "enable_index_compression=false;"
       "block_align=true;"
+      "super_block_alignment_size=65536;"
+      "super_block_alignment_space_overhead_ratio=4096;"
       "max_auto_readahead_size=0;"
       "prepopulate_block_cache=kDisable;"
       "initial_auto_readahead_size=0;"
-      "num_file_reads_for_auto_readahead=0",
+      "num_file_reads_for_auto_readahead=0;"
+      "fail_if_no_udi_on_open=true",
       new_bbto));
 
   ASSERT_EQ(unset_bytes_base,
@@ -272,8 +278,8 @@ TEST_F(OptionsSettableTest, TablePropertiesAllFieldsSettable) {
       "property_collectors_names=;prefix_extractor_name=;db_host_id="
       "64625F686F73745F6964;db_session_id=64625F73657373696F6E5F6964;creation_"
       "time=0;num_data_blocks=123;index_value_is_delta_encoded=0;top_level_"
-      "index_"
-      "size=0;data_size=100;merge_operator_name=;index_partitions=0;file_"
+      "index_size=0;data_size=100;uncompressed_data_size=1234;"
+      "merge_operator_name=;index_partitions=0;file_"
       "creation_time=0;raw_value_size=0;index_size=200;user_collected_"
       "properties={757365725F6B6579=757365725F76616C7565;};tail_start_offset=0;"
       "seqno_to_time_mapping=;raw_key_size=0;slow_compression_estimated_data_"
@@ -286,7 +292,8 @@ TEST_F(OptionsSettableTest, TablePropertiesAllFieldsSettable) {
       "0;column_family_"
       "name=64656661756C74;user_defined_timestamps_persisted=1;num_entries=100;"
       "external_sst_file_global_seqno_offset=0;num_merge_operands=0;index_key_"
-      "is_user_key=0;key_largest_seqno=18446744073709551615;",
+      "is_user_key=0;key_largest_seqno=18446744073709551615;key_smallest_seqno="
+      "18;",
       new_tp));
 
   // All bytes are set from the parse
@@ -342,6 +349,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
       {offsetof(struct DBOptions, compaction_service),
        sizeof(std::shared_ptr<CompactionService>)},
       {offsetof(struct DBOptions, daily_offpeak_time_utc), sizeof(std::string)},
+      {offsetof(struct DBOptions, calculate_sst_write_lifetime_hint_set),
+       sizeof(CompactionStyleSet)},
   };
 
   char* options_ptr = new char[sizeof(DBOptions)];
@@ -398,8 +407,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
                              "compaction_readahead_size=0;"
                              "keep_log_file_num=4890;"
                              "skip_stats_update_on_db_open=false;"
-                             "skip_checking_sst_file_sizes_on_db_open=false;"
                              "max_manifest_file_size=4295009941;"
+                             "max_manifest_space_amp_pct=321;"
                              "db_log_dir=path/to/db_log_dir;"
                              "writable_file_max_buffer_size=1048576;"
                              "paranoid_checks=true;"
@@ -431,7 +440,6 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
                              "use_direct_io_for_flush_and_compaction=false;"
                              "max_log_file_size=4607;"
                              "advise_random_on_open=true;"
-                             "fail_if_options_file_error=false;"
                              "enable_pipelined_write=false;"
                              "unordered_write=false;"
                              "allow_concurrent_memtable_write=true;"
@@ -500,7 +508,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
   // ColumnFamilyOptions.
   const OffsetGap kColumnFamilyOptionsExcluded = {
       {offsetof(struct ColumnFamilyOptions, inplace_callback),
-       sizeof(UpdateStatus(*)(char*, uint32_t*, Slice, std::string*))},
+       sizeof(UpdateStatus (*)(char*, uint32_t*, Slice, std::string*))},
       {offsetof(struct ColumnFamilyOptions,
                 memtable_insert_with_hint_prefix_extractor),
        sizeof(std::shared_ptr<const SliceTransform>)},
@@ -529,10 +537,10 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
        sizeof(const CompactionFilter*)},
       {offsetof(struct ColumnFamilyOptions, compaction_filter_factory),
        sizeof(std::shared_ptr<CompactionFilterFactory>)},
+      {offsetof(struct ColumnFamilyOptions, compression_manager),
+       sizeof(std::shared_ptr<CompressionManager>)},
       {offsetof(struct ColumnFamilyOptions, prefix_extractor),
        sizeof(std::shared_ptr<const SliceTransform>)},
-      {offsetof(struct ColumnFamilyOptions, snap_refresh_nanos),
-       sizeof(uint64_t)},
       {offsetof(struct ColumnFamilyOptions, table_factory),
        sizeof(std::shared_ptr<TableFactory>)},
       {offsetof(struct ColumnFamilyOptions, cf_paths),
@@ -601,6 +609,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "max_sequential_skip_in_iterations=4294971408;"
       "arena_block_size=1893;"
       "target_file_size_multiplier=35;"
+      "target_file_size_is_upper_bound=false;"
       "min_write_buffer_number_to_merge=9;"
       "max_write_buffer_number=84;"
       "write_buffer_size=1653;"
@@ -618,13 +627,13 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "strategy=7;max_dict_bytes=8;level=9;window_bits=10;max_compressed_bytes_"
       "per_kb=876;checksum=true};"
       "bottommost_compression=kDisableCompressionOption;"
+      "compression_manager=BuiltinV2;"
       "level0_stop_writes_trigger=33;"
       "num_levels=99;"
       "level0_slowdown_writes_trigger=22;"
       "level0_file_num_compaction_trigger=14;"
       "compaction_filter=urxcqstuwnCompactionFilter;"
       "soft_pending_compaction_bytes_limit=0;"
-      "max_write_buffer_number_to_maintain=84;"
       "max_write_buffer_size_to_maintain=2147483648;"
       "merge_operator=aabcxehazrMergeOperator;"
       "memtable_prefix_bloom_size_ratio=0.4642;"
@@ -644,6 +653,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "hard_pending_compaction_bytes_limit=0;"
       "disable_auto_compactions=false;"
       "report_bg_io_stats=true;"
+      "disallow_memtable_writes=true;"
       "ttl=60;"
       "periodic_compaction_seconds=3600;"
       "sample_for_compression=0;"
@@ -665,7 +675,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "preserve_internal_time_seconds=86400;"
       "compaction_options_fifo={max_table_files_size=3;allow_"
       "compaction=true;age_for_warm=0;file_temperature_age_thresholds={{"
-      "temperature=kCold;age=12345}};};"
+      "temperature=kCold;age=12345}};max_data_files_size=1073741824;"
+      "use_kv_ratio_compaction=false;};"
       "blob_cache=1M;"
       "memtable_protection_bytes_per_key=2;"
       "persist_user_defined_timestamps=true;"
@@ -673,7 +684,12 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "memtable_max_range_deletions=999999;"
       "bottommost_file_compaction_delay=7200;"
       "uncache_aggressiveness=1234;"
-      "paranoid_memory_checks=1;",
+      "paranoid_memory_checks=1;"
+      "memtable_veirfy_per_key_checksum_on_seek=1;"
+      "memtable_op_scan_flush_trigger=123;"
+      "memtable_avg_op_scan_flush_trigger=12;"
+      "cf_allow_ingest_behind=1;"
+      "verify_output_flags=2049;",
       new_options));
 
   ASSERT_NE(new_options->blob_cache.get(), nullptr);
@@ -697,6 +713,11 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       new_options->compaction_options_fifo.file_temperature_age_thresholds[0]
           .age,
       12345);
+  // TODO: try to enhance ObjectLibrary to support singletons
+  // ASSERT_EQ(new_options->compression_manager,
+  //           GetBuiltinV2CompressionManager());
+  ASSERT_STREQ(new_options->compression_manager->Name(),
+               GetBuiltinV2CompressionManager()->Name());
 
   ColumnFamilyOptions rnd_filled_options = *new_options;
 
@@ -716,6 +737,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
        sizeof(std::vector<int>)},
       {offsetof(struct MutableCFOptions, compaction_options_fifo),
        sizeof(struct CompactionOptionsFIFO)},
+      {offsetof(struct MutableCFOptions, compression_manager),
+       sizeof(std::shared_ptr<CompressionManager>)},
       {offsetof(struct MutableCFOptions, compression_per_level),
        sizeof(std::vector<CompressionType>)},
       {offsetof(struct MutableCFOptions, max_file_size),
diff --git a/options/options_test.cc b/options/options_test.cc
index 159cfec85570..1828dc9d86a3 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -160,6 +160,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
       {"keep_log_file_num", "39"},
       {"recycle_log_file_num", "5"},
       {"max_manifest_file_size", "40"},
+      {"max_manifest_space_amp_pct", "42"},
       {"table_cache_numshardbits", "41"},
       {"WAL_ttl_seconds", "43"},
       {"WAL_size_limit_MB", "44"},
@@ -200,7 +201,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.write_buffer_size, 1U);
   ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2);
   ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3);
-  ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99);
   ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999);
   ASSERT_EQ(new_cf_opt.compression, kSnappyCompression);
   ASSERT_EQ(new_cf_opt.compression_per_level.size(), 8U);
@@ -342,7 +342,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U);
   ASSERT_EQ(new_db_opt.keep_log_file_num, 39U);
   ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U);
-  ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast<uint64_t>(40));
+  ASSERT_EQ(new_db_opt.max_manifest_file_size, uint64_t{40});
+  ASSERT_EQ(new_db_opt.max_manifest_space_amp_pct, 42);
   ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41);
   ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
   ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
@@ -1721,15 +1722,31 @@ TEST_F(OptionsTest, MutableCFOptions) {
 
   ASSERT_OK(GetColumnFamilyOptionsFromString(
       config_options, cf_opts,
-      "paranoid_file_checks=true; block_based_table_factory.block_align=false; "
+      "paranoid_file_checks=true; "
+      "verify_output_flags=2049; "
+      "block_based_table_factory.block_align=false; "
+      "block_based_table_factory.super_block_alignment_size=65536; "
+      "block_based_table_factory.super_block_alignment_space_overhead_ratio="
+      "4096; "
       "block_based_table_factory.block_size=8192;",
       &cf_opts));
   ASSERT_TRUE(cf_opts.paranoid_file_checks);
+  ASSERT_NE(
+      (cf_opts.verify_output_flags & VerifyOutputFlags::kVerifyBlockChecksum),
+      VerifyOutputFlags::kVerifyNone);
+  ASSERT_NE((cf_opts.verify_output_flags &
+             VerifyOutputFlags::kEnableForRemoteCompaction),
+            VerifyOutputFlags::kVerifyNone);
+  ASSERT_EQ((cf_opts.verify_output_flags &
+             VerifyOutputFlags::kEnableForLocalCompaction),
+            VerifyOutputFlags::kVerifyNone);
   ASSERT_NE(cf_opts.table_factory.get(), nullptr);
   auto* bbto = cf_opts.table_factory->GetOptions<BlockBasedTableOptions>();
   ASSERT_NE(bbto, nullptr);
   ASSERT_EQ(bbto->block_size, 8192);
   ASSERT_EQ(bbto->block_align, false);
+  ASSERT_EQ(bbto->super_block_alignment_size, 65536);
+  ASSERT_EQ(bbto->super_block_alignment_space_overhead_ratio, 4096);
   std::unordered_map<std::string, std::string> unused_opts;
   ASSERT_OK(GetColumnFamilyOptionsFromMap(
       config_options, cf_opts, {{"paranoid_file_checks", "false"}}, &cf_opts));
@@ -2032,7 +2049,7 @@ TEST_F(OptionsTest, GetStringFromCompressionType) {
   ASSERT_EQ(res, "kZlibCompression");
 
   ASSERT_NOK(
-      GetStringFromCompressionType(&res, static_cast<CompressionType>(-10)));
+      GetStringFromCompressionType(&res, static_cast<CompressionType>(0x7F)));
 }
 
 TEST_F(OptionsTest, OnlyMutableDBOptions) {
@@ -2400,6 +2417,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
       {"max_compaction_bytes", "21"},
       {"soft_rate_limit", "1.1"},
       {"hard_rate_limit", "2.1"},
+      {"snap_refresh_nanos", "1000000"},
       {"rate_limit_delay_max_milliseconds", "100"},
       {"hard_pending_compaction_bytes_limit", "211"},
       {"arena_block_size", "22"},
@@ -2464,6 +2482,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
       {"keep_log_file_num", "39"},
       {"recycle_log_file_num", "5"},
       {"max_manifest_file_size", "40"},
+      {"max_manifest_space_amp_pct", "42"},
       {"table_cache_numshardbits", "41"},
       {"WAL_ttl_seconds", "43"},
       {"WAL_size_limit_MB", "44"},
@@ -2498,7 +2517,6 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.write_buffer_size, 1U);
   ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2);
   ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3);
-  ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99);
   ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999);
   ASSERT_EQ(new_cf_opt.compression, kSnappyCompression);
   ASSERT_EQ(new_cf_opt.compression_per_level.size(), 8U);
@@ -2578,6 +2596,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true);
   ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31");
   ASSERT_EQ(new_cf_opt.experimental_mempurge_threshold, 0.003);
+  ASSERT_EQ(new_cf_opt.verify_output_flags, VerifyOutputFlags::kVerifyNone);
   ASSERT_EQ(new_cf_opt.enable_blob_files, true);
   ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10);
   ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30);
@@ -2650,7 +2669,8 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U);
   ASSERT_EQ(new_db_opt.keep_log_file_num, 39U);
   ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U);
-  ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast<uint64_t>(40));
+  ASSERT_EQ(new_db_opt.max_manifest_file_size, uint64_t{40});
+  ASSERT_EQ(new_db_opt.max_manifest_space_amp_pct, 42);
   ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41);
   ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
   ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h
index d89d0b8c38f2..1fca386c01c3 100644
--- a/port/jemalloc_helper.h
+++ b/port/jemalloc_helper.h
@@ -59,33 +59,31 @@ static inline bool HasJemalloc() { return true; }
 
 // Declare non-standard jemalloc APIs as weak symbols. We can null-check these
 // symbols to detect whether jemalloc is linked with the binary.
-extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *
+extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW*
 mallocx(size_t, int) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
     __attribute__((__weak__));
-extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *
-rallocx(void *, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__));
-extern "C" size_t JEMALLOC_NOTHROW xallocx(void *, size_t, size_t, int)
+extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW*
+rallocx(void*, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__));
+extern "C" size_t JEMALLOC_NOTHROW xallocx(void*, size_t, size_t, int)
     __attribute__((__weak__));
-extern "C" size_t JEMALLOC_NOTHROW sallocx(const void *, int)
-    JEMALLOC_ATTR(pure) __attribute__((__weak__));
-extern "C" void JEMALLOC_NOTHROW dallocx(void *, int) __attribute__((__weak__));
-extern "C" void JEMALLOC_NOTHROW sdallocx(void *, size_t, int)
+extern "C" size_t JEMALLOC_NOTHROW sallocx(const void*, int) JEMALLOC_ATTR(pure)
+    __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW dallocx(void*, int) __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW sdallocx(void*, size_t, int)
     __attribute__((__weak__));
 extern "C" size_t JEMALLOC_NOTHROW nallocx(size_t, int) JEMALLOC_ATTR(pure)
     __attribute__((__weak__));
-extern "C" int JEMALLOC_NOTHROW mallctl(const char *, void *, size_t *, void *,
+extern "C" int JEMALLOC_NOTHROW mallctl(const char*, void*, size_t*, void*,
                                         size_t) __attribute__((__weak__));
-extern "C" int JEMALLOC_NOTHROW mallctlnametomib(const char *, size_t *,
-                                                 size_t *)
-    __attribute__((__weak__));
-extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t *, size_t, void *,
-                                             size_t *, void *, size_t)
+extern "C" int JEMALLOC_NOTHROW mallctlnametomib(const char*, size_t*, size_t*)
     __attribute__((__weak__));
-extern "C" void JEMALLOC_NOTHROW
-malloc_stats_print(void (*)(void *, const char *), void *, const char *)
+extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t*, size_t, void*,
+                                             size_t*, void*, size_t)
     __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW malloc_stats_print(
+    void (*)(void*, const char*), void*, const char*) __attribute__((__weak__));
 extern "C" size_t JEMALLOC_NOTHROW
-malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *) JEMALLOC_CXX_THROW
+malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*) JEMALLOC_CXX_THROW
     __attribute__((__weak__));
 
 // Check if Jemalloc is linked with the binary. Note the main program might be
diff --git a/port/lang.h b/port/lang.h
index ab79f9d22a75..f0418cedaeda 100644
--- a/port/lang.h
+++ b/port/lang.h
@@ -69,6 +69,10 @@ constexpr bool kMustFreeHeapAllocations = false;
 #define TSAN_SUPPRESSION
 #endif  // TSAN_SUPPRESSION
 
+// Fail in debug build with a useful message, for automatically grouping
+// related failures
+#define DEBUG_FAIL(msg) assert(false && msg)
+
 // Compile-time CPU feature testing compatibility
 //
 // A way to be extra sure these defines have been included.
diff --git a/port/mmap.cc b/port/mmap.cc
index 36e8f32617fb..36977f17b9f4 100644
--- a/port/mmap.cc
+++ b/port/mmap.cc
@@ -43,7 +43,7 @@ MemMapping& MemMapping::operator=(MemMapping&& other) noexcept {
     return *this;
   }
   this->~MemMapping();
-  std::memcpy(this, &other, sizeof(*this));
+  std::memcpy(static_cast<void*>(this), &other, sizeof(*this));
   new (&other) MemMapping();
   return *this;
 }
diff --git a/port/port_example.h b/port/port_example.h
index f9e94d00f865..6bbb5b2e330b 100644
--- a/port/port_example.h
+++ b/port/port_example.h
@@ -74,28 +74,5 @@ using OnceType = intptr_t;
 #define LEVELDB_ONCE_INIT 0
 void InitOnce(port::OnceType*, void (*initializer)());
 
-// ------------------ Compression -------------------
-
-// Store the snappy compression of "input[0,input_length-1]" in *output.
-// Returns false if snappy is not supported by this port.
-bool Snappy_Compress(const char* input, size_t input_length,
-                     std::string* output);
-
-// If input[0,input_length-1] looks like a valid snappy compressed
-// buffer, store the size of the uncompressed data in *result and
-// return true.  Else return false.
-bool Snappy_GetUncompressedLength(const char* input, size_t length,
-                                  size_t* result);
-
-// Attempt to snappy uncompress input[0,input_length-1] into *output.
-// Returns true if successful, false if the input is invalid lightweight
-// compressed data.
-//
-// REQUIRES: at least the first "n" bytes of output[] must be writable
-// where "n" is the result of a successful call to
-// Snappy_GetUncompressedLength.
-bool Snappy_Uncompress(const char* input_data, size_t input_length,
-                       char* output);
-
 }  // namespace port
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/port/port_posix.cc b/port/port_posix.cc
index 7042a710dc84..1159d0bf8a63 100644
--- a/port/port_posix.cc
+++ b/port/port_posix.cc
@@ -220,8 +220,9 @@ int GetMaxOpenFiles() {
     return std::numeric_limits<int>::max();
   }
   return static_cast<int>(no_files_limit.rlim_cur);
-#endif
+#else
   return -1;
+#endif
 }
 
 void* cacheline_aligned_alloc(size_t size) {
diff --git a/port/win/io_win.cc b/port/win/io_win.cc
index 2ba64b326554..63e5d6a7e16e 100644
--- a/port/win/io_win.cc
+++ b/port/win/io_win.cc
@@ -242,6 +242,16 @@ size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
   return GetUniqueIdFromFile(hFile_, id, max_size);
 }
 
+IOStatus WinMmapReadableFile::GetFileSize(uint64_t* size) {
+  LARGE_INTEGER fileSize;
+  if (GetFileSizeEx(hFile_, &fileSize)) {
+    *size = fileSize.QuadPart;
+    return IOStatus::OK();
+  } else {
+    return IOStatus::IOError("Failed to get file size", filename_);
+  }
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// WinMmapFile
 
@@ -735,6 +745,16 @@ size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
   return GetAlignment();
 }
 
+IOStatus WinRandomAccessFile::GetFileSize(uint64_t* size) {
+  LARGE_INTEGER fileSize;
+  if (GetFileSizeEx(hFile_, &fileSize)) {
+    *size = fileSize.QuadPart;
+    return IOStatus::OK();
+  } else {
+    return IOStatus::IOError("Failed to get file size", filename_);
+  }
+}
+
 /////////////////////////////////////////////////////////////////////////////
 // WinWritableImpl
 //
diff --git a/port/win/io_win.h b/port/win/io_win.h
index e1a6197ce86b..29511d47ee68 100644
--- a/port/win/io_win.h
+++ b/port/win/io_win.h
@@ -152,6 +152,8 @@ class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile {
   IOStatus InvalidateCache(size_t offset, size_t length) override;
 
   size_t GetUniqueId(char* id, size_t max_size) const override;
+
+  IOStatus GetFileSize(uint64_t* file_size) override;
 };
 
 // We preallocate and use memcpy to append new
@@ -292,6 +294,8 @@ class WinRandomAccessFile
   IOStatus InvalidateCache(size_t offset, size_t length) override;
 
   size_t GetRequiredBufferAlignment() const override;
+
+  IOStatus GetFileSize(uint64_t* file_size) override;
 };
 
 // This is a sequential write class. It has been mimicked (as others) after
diff --git a/port/win/xpress_win.cc b/port/win/xpress_win.cc
index 21904d502674..a90179bc1283 100644
--- a/port/win/xpress_win.cc
+++ b/port/win/xpress_win.cc
@@ -125,6 +125,57 @@ bool Compress(const char* input, size_t length, std::string* output) {
   return true;
 }
 
+size_t CompressWithMaxSize(const char* input, size_t length, char* output,
+                           size_t max_output_size) {
+  assert(input != nullptr);
+  if (max_output_size == 0) {
+    return 0;
+  }
+  assert(output != nullptr);
+
+  COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
+
+  COMPRESSOR_HANDLE compressor = NULL;
+
+  BOOL success =
+      CreateCompressor(COMPRESS_ALGORITHM_XPRESS,  //  Compression Algorithm
+                       allocRoutinesPtr,  //  Optional allocation routine
+                       &compressor);      //  Handle
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to create Compressor LastError: "
+              << GetLastError() << std::endl;
+#endif
+    return 0;
+  }
+
+  std::unique_ptr<void, decltype(CloseCompressorFun)> compressorGuard(
+      compressor, CloseCompressorFun);
+
+  SIZE_T compressed_size = 0;
+  //  Compress
+  success = ::Compress(compressor,                //  Compressor Handle
+                       const_cast<char*>(input),  //  Input buffer
+                       length,                    //  Uncompressed data size
+                       output,                    //  Compressed Buffer
+                       max_output_size,           //  Compressed Buffer size
+                       &compressed_size);         //  Compressed Data size
+
+  if (!success) {
+#ifdef _DEBUG
+    auto error = GetLastError();
+    if (error != ERROR_INSUFFICIENT_BUFFER) {
+      std::cerr << "XPRESS: Failed to compress LastError " << error
+                << std::endl;
+    }
+#endif
+    return 0;
+  } else {
+    return compressed_size;
+  }
+}
+
 char* Decompress(const char* input_data, size_t input_length,
                  size_t* uncompressed_size) {
   assert(input_data != nullptr);
@@ -151,7 +202,7 @@ char* Decompress(const char* input_data, size_t input_length,
     return nullptr;
   }
 
-  std::unique_ptr<void, decltype(CloseDecompressorFun)> compressorGuard(
+  std::unique_ptr<void, decltype(CloseDecompressorFun)> decompressorGuard(
       decompressor, CloseDecompressorFun);
 
   SIZE_T decompressedBufferSize = 0;
@@ -201,6 +252,104 @@ char* Decompress(const char* input_data, size_t input_length,
   // Return the raw buffer to the caller supporting the tradition
   return outputBuffer.release();
 }
+
+int64_t GetDecompressedSize(const char* input_data, size_t input_length) {
+  assert(input_data != nullptr);
+
+  if (input_length == 0) {
+    return 0;
+  }
+
+  COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
+
+  DECOMPRESSOR_HANDLE decompressor = NULL;
+
+  BOOL success =
+      CreateDecompressor(COMPRESS_ALGORITHM_XPRESS,  //  Compression Algorithm
+                         allocRoutinesPtr,  //  Optional allocation routine
+                         &decompressor);    //  Handle
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to create Decompressor LastError "
+              << GetLastError() << std::endl;
+#endif
+    return -1;
+  }
+
+  std::unique_ptr<void, decltype(CloseDecompressorFun)> decompressorGuard(
+      decompressor, CloseDecompressorFun);
+
+  SIZE_T decompressedBufferSize = 0;
+
+  success = ::Decompress(decompressor,                   //  Compressor Handle
+                         const_cast<char*>(input_data),  //  Compressed data
+                         input_length,              //  Compressed data size
+                         NULL,                      //  Buffer set to NULL
+                         0,                         //  Buffer size set to 0
+                         &decompressedBufferSize);  //  Decompressed Data size
+
+  assert(!success);
+  auto lastError = GetLastError();
+
+  if (lastError != ERROR_INSUFFICIENT_BUFFER) {
+#ifdef _DEBUG
+    std::cerr
+        << "XPRESS: Failed to estimate decompressed buffer size LastError "
+        << lastError << std::endl;
+#endif
+    return -1;
+  }
+
+  assert(decompressedBufferSize > 0);
+  return static_cast<int64_t>(decompressedBufferSize);
+}
+
+int64_t DecompressToBuffer(const char* input, size_t input_length, char* output,
+                           size_t output_length) {
+  assert(input != nullptr);
+  assert(output != nullptr);
+
+  if (input_length == 0) {
+    return 0;
+  }
+
+  COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
+
+  DECOMPRESSOR_HANDLE decompressor = NULL;
+
+  BOOL success =
+      CreateDecompressor(COMPRESS_ALGORITHM_XPRESS,  //  Compression Algorithm
+                         allocRoutinesPtr,  //  Optional allocation routine
+                         &decompressor);    //  Handle
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to create Decompressor LastError "
+              << GetLastError() << std::endl;
+#endif
+    return -1;
+  }
+
+  std::unique_ptr<void, decltype(CloseDecompressorFun)> decompressorGuard(
+      decompressor, CloseDecompressorFun);
+
+  SIZE_T decompressedDataSize = 0;
+
+  success = ::Decompress(decompressor, const_cast<char*>(input), input_length,
+                         output, output_length, &decompressedDataSize);
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to decompress LastError " << GetLastError()
+              << std::endl;
+#endif
+    return -1;
+  }
+
+  return static_cast<int64_t>(decompressedDataSize);
+}
+
 }  // namespace xpress
 }  // namespace port
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/port/win/xpress_win.h b/port/win/xpress_win.h
index 187adffa658a..00cc1b9fc3dc 100644
--- a/port/win/xpress_win.h
+++ b/port/win/xpress_win.h
@@ -19,8 +19,18 @@ namespace xpress {
 
 bool Compress(const char* input, size_t length, std::string* output);
 
+// Returns written size or 0 on failure including if buffer is too small.
+size_t CompressWithMaxSize(const char* input, size_t length, char* output,
+                           size_t max_output_size);
+
 char* Decompress(const char* input_data, size_t input_length,
                  size_t* uncompressed_size);
+
+int64_t GetDecompressedSize(const char* input, size_t input_length);
+
+int64_t DecompressToBuffer(const char* input, size_t input_length, char* output,
+                           size_t output_length);
+
 }  // namespace xpress
 }  // namespace port
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/src.mk b/src.mk
index 3f1de6688684..a77efc8f6123 100644
--- a/src.mk
+++ b/src.mk
@@ -80,6 +80,7 @@ LIB_SOURCES =                                                   \
   db/memtable_list.cc                                           \
   db/merge_helper.cc                                            \
   db/merge_operator.cc                                          \
+  db/multi_scan.cc						\
   db/output_validator.cc                                        \
   db/periodic_task_scheduler.cc                                 \
   db/range_del_aggregator.cc                                    \
@@ -205,7 +206,7 @@ LIB_SOURCES =                                                   \
   table/cuckoo/cuckoo_table_builder.cc                          \
   table/cuckoo/cuckoo_table_factory.cc                          \
   table/cuckoo/cuckoo_table_reader.cc                           \
-  table/external_table_reader.cc				\
+  table/external_table.cc					\
   table/format.cc                                               \
   table/get_context.cc                                          \
   table/iterator.cc                                             \
@@ -237,6 +238,7 @@ LIB_SOURCES =                                                   \
   trace_replay/block_cache_tracer.cc                            \
   trace_replay/io_tracer.cc                                     \
   util/async_file_reader.cc					                            \
+  util/auto_tune_compressor.cc                                           \
   util/build_version.cc                                         \
   util/cleanable.cc                                             \
   util/coding.cc                                                \
@@ -256,11 +258,13 @@ LIB_SOURCES =                                                   \
   util/ribbon_config.cc                                         \
   util/slice.cc                                                 \
   util/file_checksum_helper.cc                                  \
+  util/simple_mixed_compressor.cc                               \
   util/status.cc                                                \
   util/stderr_logger.cc                                         \
   util/string_util.cc                                           \
   util/thread_local.cc                                          \
   util/threadpool_imp.cc                                        \
+  util/io_dispatcher_imp.cc                                     \
   util/udt_util.cc                                              \
   util/write_batch_util.cc                                      \
   util/xxhash.cc                                                \
@@ -364,6 +368,7 @@ RANGE_TREE_SOURCES =\
   utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc
 
 TOOL_LIB_SOURCES =                                              \
+  db_stress_tool/db_stress_compression_manager.cc               \
   tools/io_tracer_parser_tool.cc                                \
   tools/ldb_cmd.cc                                              \
   tools/ldb_tool.cc                                             \
@@ -382,19 +387,23 @@ BENCH_LIB_SOURCES =                                             \
   tools/tool_hooks.cc                                           \
   tools/simulated_hybrid_file_system.cc                         \
 
-CACHE_BENCH_LIB_SOURCES =					                              \
+CACHE_BENCH_LIB_SOURCES =                                       \
   cache/cache_bench_tool.cc                                     \
 
+POINT_LOCK_BENCH_LIB_SOURCES =                                  \
+  utilities/transactions/lock/point/point_lock_bench_tool.cc    \
+
 STRESS_LIB_SOURCES =                                           \
   db_stress_tool/batched_ops_stress.cc                         \
   db_stress_tool/cf_consistency_stress.cc                      \
   db_stress_tool/db_stress_common.cc                           \
+  db_stress_tool/db_stress_compaction_service.cc               \
+  db_stress_tool/db_stress_compression_manager.cc              \
   db_stress_tool/db_stress_driver.cc                           \
   db_stress_tool/db_stress_filters.cc                          \
   db_stress_tool/db_stress_gflags.cc                           \
   db_stress_tool/db_stress_listener.cc                         \
   db_stress_tool/db_stress_shared_state.cc                     \
-  db_stress_tool/db_stress_stat.cc                             \
   db_stress_tool/db_stress_test_base.cc                        \
   db_stress_tool/db_stress_tool.cc                             \
   db_stress_tool/db_stress_wide_merge_operator.cc              \
@@ -481,11 +490,13 @@ TEST_MAIN_SOURCES =                                                     \
   db/db_basic_test.cc                                                   \
   db/db_block_cache_test.cc                                             \
   db/db_bloom_filter_test.cc                                            \
+  db/db_compaction_abort_test.cc                                        \
   db/db_compaction_filter_test.cc                                       \
   db/db_compaction_test.cc                                              \
   db/db_clip_test.cc                                                    \
   db/db_dynamic_level_test.cc                                           \
   db/db_encryption_test.cc                                              \
+  db/db_etc3_test.cc                                                    \
   db/db_flush_test.cc                                                   \
   db/db_follower_test.cc						                                    \
   db/db_readonly_with_timestamp_test.cc                                 \
@@ -591,6 +602,7 @@ TEST_MAIN_SOURCES =                                                     \
   table/table_test.cc                                                   \
   table/block_fetcher_test.cc                                           \
   test_util/testutil_test.cc                                            \
+  util/compression_test.cc                                            \
   tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc         \
   tools/io_tracer_parser_test.cc                                        \
   tools/ldb_cmd_test.cc                                                 \
@@ -609,6 +621,8 @@ TEST_MAIN_SOURCES =                                                     \
   util/file_reader_writer_test.cc                                       \
   util/hash_test.cc                                                     \
   util/heap_test.cc                                                     \
+  util/interval_test.cc                                                 \
+  util/io_dispatcher_test.cc                                            \
   util/random_test.cc                                                   \
   util/rate_limiter_test.cc                                             \
   util/repeatable_thread_test.cc                                        \
@@ -645,7 +659,9 @@ TEST_MAIN_SOURCES =                                                     \
   utilities/transactions/lock/range/range_locking_test.cc               \
   utilities/transactions/transaction_test.cc                            \
   utilities/transactions/lock/point/point_lock_manager_test.cc          \
+  utilities/transactions/lock/point/point_lock_manager_stress_test.cc   \
   utilities/transactions/write_prepared_transaction_test.cc             \
+  utilities/transactions/write_prepared_transaction_test_seqno.cc       \
   utilities/transactions/write_unprepared_transaction_test.cc           \
   utilities/transactions/write_committed_transaction_ts_test.cc         \
   utilities/transactions/timestamped_snapshot_test.cc                   \
diff --git a/table/adaptive/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc
index f06b265328f8..db3f7625a710 100644
--- a/table/adaptive/adaptive_table_factory.cc
+++ b/table/adaptive/adaptive_table_factory.cc
@@ -51,8 +51,7 @@ Status AdaptiveTableFactory::NewTableReader(
       footer.table_magic_number() == kLegacyPlainTableMagicNumber) {
     return plain_table_factory_->NewTableReader(
         table_reader_options, std::move(file), file_size, table);
-  } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber ||
-             footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
+  } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber) {
     return block_based_table_factory_->NewTableReader(
         ro, table_reader_options, std::move(file), file_size, table,
         prefetch_index_and_filter_in_cache);
diff --git a/table/block_based/binary_search_index_reader.cc b/table/block_based/binary_search_index_reader.cc
index abe09d86fb3a..940bb261db23 100644
--- a/table/block_based/binary_search_index_reader.cc
+++ b/table/block_based/binary_search_index_reader.cc
@@ -63,7 +63,8 @@ InternalIteratorBase<IndexValue>* BinarySearchIndexReader::NewIterator(
       internal_comparator()->user_comparator(),
       rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, true,
       index_has_first_key(), index_key_includes_seq(), index_value_is_full(),
-      false /* block_contents_pinned */, user_defined_timestamps_persisted());
+      false /* block_contents_pinned */, user_defined_timestamps_persisted(),
+      nullptr /* prefix_index */, rep->table_options.index_block_search_type);
 
   assert(it != nullptr);
   index_block.TransferTo(it);
diff --git a/table/block_based/block.cc b/table/block_based/block.cc
index ea4d559a2a40..fe316a37be72 100644
--- a/table/block_based/block.cc
+++ b/table/block_based/block.cc
@@ -24,6 +24,7 @@
 #include "table/block_based/data_block_footer.h"
 #include "table/format.h"
 #include "util/coding.h"
+#include "util/math.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -152,6 +153,39 @@ struct DecodeEntryV4 {
   }
 };
 
+// Read first 8 bytes (starting at offset) as big-endian uint64_t, padding
+// with zeros on the right if the key is shorter. This preserves
+// lexicographic ordering.
+//
+// If s.size() >= offset, then returns 0.
+static uint64_t ReadBe64FromKey(Slice s, bool is_user_key, size_t offset) {
+  if (!is_user_key) {
+    assert(s.size() >= kNumInternalBytes);
+    s = Slice(s.data(), s.size() - kNumInternalBytes);
+  }
+  offset = std::min(offset, s.size());
+  size_t remaining = s.size() - offset;
+
+  // fast path
+  if (remaining >= 8) {
+    uint64_t val;
+    memcpy(&val, s.data() + offset, sizeof(val));
+    if (port::kLittleEndian) {
+      return EndianSwapValue(val);
+    }
+    return val;
+  }
+
+  uint64_t val = 0;
+  for (size_t i = 0; i < remaining; i++) {
+    val = (val << 8) | static_cast<uint8_t>(s.data()[offset + i]);
+  }
+  if (remaining > 0) {
+    val <<= (8 - remaining) * 8;  // Pad zeros on the right
+  }
+  return val;
+}
+
 void DataBlockIter::NextImpl() {
 #ifndef NDEBUG
   if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) {
@@ -307,7 +341,8 @@ void DataBlockIter::SeekImpl(const Slice& target) {
   }
   uint32_t index = 0;
   bool skip_linear_scan = false;
-  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+  bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
+                                                   &skip_linear_scan);
 
   if (!ok) {
     return;
@@ -323,7 +358,8 @@ void MetaBlockIter::SeekImpl(const Slice& target) {
   }
   uint32_t index = 0;
   bool skip_linear_scan = false;
-  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+  bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
+                                                   &skip_linear_scan);
 
   if (!ok) {
     return;
@@ -440,8 +476,8 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
     return true;
   }
 
-  if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(),
-                                        target_user_key) != 0) {
+  if (icmp_.user_comparator()->Compare(raw_key_.GetUserKey(),
+                                       target_user_key) != 0) {
     // the key is not in this block and cannot be at the next block either.
     return false;
   }
@@ -494,10 +530,14 @@ void IndexBlockIter::SeekImpl(const Slice& target) {
     // restart interval must be one when hash search is enabled so the binary
     // search simply lands at the right place.
     skip_linear_scan = true;
-  } else if (value_delta_encoded_) {
-    ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
   } else {
-    ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+    if (value_delta_encoded_) {
+      ok = FindRestartPointForSeek<DecodeKeyV4>(seek_key, &index,
+                                                &skip_linear_scan);
+    } else {
+      ok = FindRestartPointForSeek<DecodeKey>(seek_key, &index,
+                                              &skip_linear_scan);
+    }
   }
 
   if (!ok) {
@@ -506,6 +546,18 @@ void IndexBlockIter::SeekImpl(const Slice& target) {
   FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
 }
 
+template <typename DecodeKeyFunc>
+bool IndexBlockIter::FindRestartPointForSeek(const Slice& seek_key,
+                                             uint32_t* index,
+                                             bool* skip_linear_scan) {
+  if (index_search_type_ == BlockBasedTableOptions::kBinary) {
+    return BinarySeekRestartPointIndex<DecodeKeyFunc>(seek_key, index,
+                                                      skip_linear_scan);
+  }
+  return InterpolationSeekRestartPointIndex<DecodeKeyFunc>(seek_key, index,
+                                                           skip_linear_scan);
+}
+
 void DataBlockIter::SeekForPrevImpl(const Slice& target) {
   PERF_TIMER_GUARD(block_seek_nanos);
   Slice seek_key = target;
@@ -514,7 +566,8 @@ void DataBlockIter::SeekForPrevImpl(const Slice& target) {
   }
   uint32_t index = 0;
   bool skip_linear_scan = false;
-  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+  bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
+                                                   &skip_linear_scan);
 
   if (!ok) {
     return;
@@ -540,7 +593,8 @@ void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
   }
   uint32_t index = 0;
   bool skip_linear_scan = false;
-  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+  bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
+                                                   &skip_linear_scan);
 
   if (!ok) {
     return;
@@ -816,9 +870,27 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
   }
 }
 
-// Binary searches in restart array to find the starting restart point for the
-// linear scan, and stores it in `*index`. Assumes restart array does not
-// contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
+// Get the key slice at a given restart point index.
+template <class TValue>
+template <typename DecodeKeyFunc>
+bool BlockIter<TValue>::GetRestartKey(uint32_t index, Slice* key) {
+  uint32_t region_offset = GetRestartPoint(index);
+  uint32_t shared, non_shared;
+  const char* key_ptr = DecodeKeyFunc()(
+      data_ + region_offset, data_ + restarts_, &shared, &non_shared);
+  if (key_ptr == nullptr || (shared != 0)) {
+    CorruptionError();
+    return false;
+  }
+  *key = Slice(key_ptr, non_shared);
+  return true;
+}
+
+// Searches in restart array using binary search to find the starting restart
+// point for the linear scan, and stores it in `*index`. Assumes restart array
+// does not contain duplicate keys.
+//
+// It is guaranteed that the restart key at `*index + 1`
 // is strictly greater than `target` or does not exist (this can be used to
 // elide a comparison when linear scan reaches all the way to the next restart
 // key). Furthermore, `*skip_linear_scan` is set to indicate whether the
@@ -826,15 +898,15 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
 // compared again later.
 template <class TValue>
 template <typename DecodeKeyFunc>
-bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
-                                   bool* skip_linear_scan) {
+bool BlockIter<TValue>::BinarySeekRestartPointIndex(const Slice& target,
+                                                    uint32_t* index,
+                                                    bool* skip_linear_scan) {
   if (restarts_ == 0) {
     // SST files dedicated to range tombstones are written with index blocks
     // that have no keys while also having `num_restarts_ == 1`. This would
-    // cause a problem for `BinarySeek()` as it'd try to access the first key
-    // which does not exist. We identify such blocks by the offset at which
-    // their restarts are stored, and return false to prevent any attempted
-    // key accesses.
+    // cause a problem as we'd try to access the first key which does not exist.
+    // We identify such blocks by the offset at which their restarts are stored,
+    // and return false to prevent any attempted key accesses.
     return false;
   }
 
@@ -842,23 +914,25 @@ bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
   // Loop invariants:
   // - Restart key at index `left` is less than or equal to the target key. The
   //   sentinel index `-1` is considered to have a key that is less than all
-  //   keys.
+  //   keys. Doing this allows us to avoid a bounds check on left.
   // - Any restart keys after index `right` are strictly greater than the target
   //   key.
-  int64_t left = -1, right = num_restarts_ - 1;
+  int64_t left = -1;
+  int64_t right = num_restarts_ - 1;
+
   while (left != right) {
     // The `mid` is computed by rounding up so it lands in (`left`, `right`].
     int64_t mid = left + (right - left + 1) / 2;
-    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
-    uint32_t shared, non_shared;
-    const char* key_ptr = DecodeKeyFunc()(
-        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
-    if (key_ptr == nullptr || (shared != 0)) {
-      CorruptionError();
+
+    assert(left < mid && mid <= right);
+
+    Slice mid_key;
+    if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(mid), &mid_key)) {
       return false;
     }
-    Slice mid_key(key_ptr, non_shared);
+
     UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
+
     int cmp = CompareCurrentKey(target);
     if (cmp < 0) {
       // Key at "mid" is smaller than "target". Therefore all
@@ -885,22 +959,317 @@ bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
   return true;
 }
 
+// Similar effects to BinarySeekRestartPointIndex, except it uses a different
+// algorithm to search for the restart point index (i.e. interpolation search).
+// Interpolation search is typically more efficient for uniformly distributed
+// datasets.
+//
+// Typically, interpolation search requires an integer "value". But because we
+// are searching through variable length binary slices, we must estimate an
+// integer value for each key. Currently, the value is set to be the first 8
+// bytes (read big-endian) that do not share a prefix with the start and end
+// key. As a side effect, this can really only be used with the
+// BytewiseComparator().
+template <class TValue>
+template <typename DecodeKeyFunc>
+bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
+    const Slice& target, uint32_t* index, bool* skip_linear_scan) {
+  static constexpr int64_t kGuardLen = 8;
+  static constexpr uint64_t kMaxPoorSearches = 8;
+
+  if (restarts_ == 0) {
+    return false;
+  }
+
+  *skip_linear_scan = false;
+  // Currently it is assumed that comparator is always bytewise comparator, but
+  // it may also be useful to to generalize to reverse bytewise in the future.
+  assert(icmp_.user_comparator() == BytewiseComparator());
+
+  int64_t left = -1;
+  int64_t right = num_restarts_ - 1;
+  size_t shared_user_prefix_len = 0;
+
+  Slice left_key;
+  Slice right_key;
+  Slice left_key_suffix;
+  Slice right_key_suffix;
+  Slice target_suffix = target;
+  bool seek_failed = false;
+  bool first_iter = true;
+  uint64_t left_val = 0;
+  uint64_t right_val = 0;
+  uint64_t target_val = 0;
+
+  // A poor search is when less than half the search space is reduced, because
+  // binary search would do better. When there are kMaxPoorSearches in a row,
+  // then fallback to binary search. This helps bound worse cast performance.
+  uint64_t continuous_poor_searches = 0;
+
+  // Loop invariants while not first iteration AND seek has not failed:
+  // - arr[usable_left] = left_key, arr[right] = right_key
+  // - left < mid <= right, and arr[left] < target < arr[right + 1]
+  //
+  // The first iteration is used as an early optimization to determine initial
+  // bounds, and whether target is within those bounds.
+  const bool is_user_key = raw_key_.IsUserKey();
+  const Slice target_user_key = is_user_key ? target : ExtractUserKey(target);
+  while (left != right) {
+    int64_t mid = 0;
+
+    // If either search window is small or we've bad numerous bad guesses, then
+    // fallback to binary search
+    seek_failed = (right - left <= kGuardLen) ||
+                  continuous_poor_searches >= kMaxPoorSearches;
+
+    if (!seek_failed) {
+      // Interpolation seek reads left and right boundaries anyways, so we can
+      // set left = 0. The invariant that left <= target is still held because
+      // we early exit if left > target for the first iteration.
+      const uint32_t usable_left =
+          static_cast<uint32_t>(std::max<int64_t>(left, 0));
+
+      // First iteration: decode both boundary keys and compute shared prefix.
+      if (first_iter) {
+        if (!GetRestartKey<DecodeKeyFunc>(usable_left, &left_key)) {
+          return false;
+        }
+
+        if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(right),
+                                          &right_key)) {
+          return false;
+        }
+
+        // Compute the shared prefix length between the user key portions of
+        // the boundary keys. This is used to "normalize" the values calculated
+        // during interpolation search.
+        shared_user_prefix_len = left_key.difference_offset(right_key);
+        if (!is_user_key) {
+          // Ensure shared_user_prefix_len is only limited to user key. Suppose
+          // that the shared prefix of both keys are extended into the internal
+          // footer. If they are not the same user keys, then it is guaranteed
+          // left is the shorter one due to bytewise comparator. For reverse
+          // bytewise, this would be flipped.
+          shared_user_prefix_len = std::min<size_t>(
+              shared_user_prefix_len, left_key.size() - kNumInternalBytes);
+          assert(shared_user_prefix_len <=
+                 right_key.size() - kNumInternalBytes);
+        }
+
+        left_val =
+            ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len);
+        right_val =
+            ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len);
+        target_val =
+            ReadBe64FromKey(target, is_user_key, shared_user_prefix_len);
+      }
+
+      assert(shared_user_prefix_len <= left_key.size() &&
+             shared_user_prefix_len <= right_key.size());
+
+      if (first_iter && shared_user_prefix_len > 0) {
+        // It is not guaranteed that the shared_prefix of the left and right
+        // boundaries is a valid prefix of the target. If it is not, then we can
+        // early exit.
+        size_t cmp_len =
+            std::min(target_user_key.size(), shared_user_prefix_len);
+        int cmp = memcmp(target_user_key.data(), left_key.data(), cmp_len);
+        if (cmp < 0 || (cmp == 0 && cmp_len < shared_user_prefix_len)) {
+#ifndef NDEBUG
+          IterKey tmp_key;
+          tmp_key.SetIsUserKey(is_user_key);
+          UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key);
+          assert(CompareKey(tmp_key, target) >= 0);
+#endif
+          // if target size is less than shared_prefix length, and cmp == 0,
+          // then it is guaranteed <= left
+          *skip_linear_scan = true;
+          *index = usable_left;
+          return true;
+        } else if (cmp > 0) {
+#ifndef NDEBUG
+          IterKey tmp_key;
+          tmp_key.SetIsUserKey(is_user_key);
+          UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key);
+          assert(CompareKey(tmp_key, target) < 0);
+#endif
+          *index = static_cast<uint32_t>(right);
+          return true;
+        }
+      }
+
+      assert(shared_user_prefix_len <= target_user_key.size());
+      assert(memcmp(left_key.data(), target_user_key.data(),
+                    shared_user_prefix_len) == 0);
+      assert(memcmp(right_key.data(), target_user_key.data(),
+                    shared_user_prefix_len) == 0);
+
+      if (first_iter) {
+        left_key_suffix = Slice(left_key.data() + shared_user_prefix_len,
+                                left_key.size() - shared_user_prefix_len);
+        right_key_suffix = Slice(right_key.data() + shared_user_prefix_len,
+                                 right_key.size() - shared_user_prefix_len);
+        target_suffix = Slice(target.data() + shared_user_prefix_len,
+                              target.size() - shared_user_prefix_len);
+      }
+
+      if (left_val > right_val) {
+        CorruptionError("left key is greater than right key");
+        return false;
+      }
+
+      bool lte_left = false;
+      bool gt_right = false;
+
+      if (target_val < left_val) {
+        assert(first_iter);
+        assert(CompareKey(left_key_suffix, target_suffix) > 0);
+        lte_left = true;
+      } else if (target_val == left_val) {
+        // target_val == left_val doesn't imply target == left_key
+        // because ReadBe64FromKey only reads 8 bytes and skips sequence
+        // numbers. We need to check actual key order.
+        if (CompareKey(left_key_suffix, target_suffix) >= 0) {
+          assert(first_iter);
+          lte_left = true;
+        }
+      }
+
+      if (!lte_left && !seek_failed) {
+        if (target_val > right_val) {
+          // note that we only ever guarantee arr[target] < arr[right + 1], so
+          // it is possible to end up here even on non-first iteration
+          assert(CompareKey(right_key_suffix, target_suffix) < 0);
+          gt_right = true;
+        } else if (right_val == left_val) {
+          // cannot divide by 0
+          seek_failed = true;
+        }
+      }
+
+      // early exit if key is not within bounds
+      if (lte_left) {
+#ifndef NDEBUG
+        assert(!seek_failed);
+        IterKey tmp_key;
+        tmp_key.SetIsUserKey(is_user_key);
+        UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key);
+        assert(CompareKey(tmp_key, target) >= 0);
+#endif
+        *skip_linear_scan = true;
+        *index = usable_left;
+        return true;
+      }
+      if (gt_right) {
+#ifndef NDEBUG
+        assert(!seek_failed);
+        IterKey tmp_key;
+        tmp_key.SetIsUserKey(is_user_key);
+        UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key);
+        assert(CompareKey(tmp_key, target) < 0);
+#endif
+        *index = static_cast<uint32_t>(right);
+        return true;
+      }
+
+      if (!seek_failed) {
+#ifdef HAVE_UINT128_EXTENSION
+        __uint128_t range = right - usable_left;
+        __uint128_t target_delta = target_val - left_val;
+        uint64_t range_delta = right_val - left_val;
+        int64_t offset =
+            static_cast<int64_t>(range * target_delta / range_delta);
+#else
+        double ratio = static_cast<double>(target_val - left_val) /
+                       static_cast<double>(right_val - left_val);
+        assert(0 <= ratio && ratio <= 1);
+        int64_t range = right - usable_left;
+        int64_t offset = static_cast<int64_t>(range * ratio);
+#endif
+        left = usable_left;  // can reduce search space by 1
+        mid = usable_left + offset;
+        assert(mid <= right);
+        if (mid == usable_left) {
+          // this is to guarantee progress and avoid infinite loop
+          ++mid;
+        }
+      }
+    }
+
+    if (seek_failed) {
+      // Fallback to binary seek
+      mid = left + (right - left + 1) / 2;
+    }
+
+    assert(left < mid && mid <= right);
+
+    Slice mid_key;
+    if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(mid), &mid_key)) {
+      return false;
+    }
+
+    Slice mid_key_suffix(mid_key.data() + shared_user_prefix_len,
+                         mid_key.size() - shared_user_prefix_len);
+
+    UpdateRawKeyAndMaybePadMinTimestamp(mid_key_suffix);
+    int cmp = CompareCurrentKey(target_suffix);
+
+    int64_t previous_search_space = right - left;
+    if (cmp < 0) {
+      left = mid;
+      left_key = mid_key;
+      left_key_suffix = mid_key_suffix;
+      left_val = ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len);
+    } else if (cmp > 0) {
+      right = mid - 1;
+      if (!seek_failed && left != right) {
+        if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(right),
+                                          &right_key)) {
+          return false;
+        }
+        right_key_suffix = Slice(right_key.data() + shared_user_prefix_len,
+                                 right_key.size() - shared_user_prefix_len);
+        right_val =
+            ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len);
+      }
+    } else {
+      *skip_linear_scan = true;
+      left = right = mid;
+    }
+
+    // If seach space is not reduced by at least half, good chance this data is
+    // not uniform.
+    int64_t new_search_space = right - left;
+    if (new_search_space > previous_search_space / 2) {
+      ++continuous_poor_searches;
+    } else {
+      continuous_poor_searches = 0;
+    }
+
+    first_iter = false;
+  }
+
+  if (left == -1) {
+    // All keys in the block were strictly greater than `target`. So the very
+    // first key in the block is the final seek result.
+    *skip_linear_scan = true;
+    *index = 0;
+  } else {
+    *index = static_cast<uint32_t>(left);
+  }
+  return true;
+}
+
 // Compare target key and the block key of the block of `block_index`.
 // Return -1 if error.
 int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
-  uint32_t region_offset = GetRestartPoint(block_index);
-  uint32_t shared, non_shared;
-  const char* key_ptr =
-      value_delta_encoded_
-          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
-                          &non_shared)
-          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
-                        &non_shared);
-  if (key_ptr == nullptr || (shared != 0)) {
-    CorruptionError();
+  Slice block_key;
+  bool ok = value_delta_encoded_
+                ? GetRestartKey<DecodeKeyV4>(block_index, &block_key)
+                : GetRestartKey<DecodeKey>(block_index, &block_key);
+  if (!ok) {
     return 1;  // Return target is smaller
   }
-  Slice block_key(key_ptr, non_shared);
   UpdateRawKeyAndMaybePadMinTimestamp(block_key);
   return CompareCurrentKey(target);
 }
@@ -1015,39 +1384,12 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
   }
 }
 
-uint32_t Block::NumRestarts() const {
-  assert(size_ >= 2 * sizeof(uint32_t));
-  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
-  uint32_t num_restarts = block_footer;
-  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
-    // In BlockBuilder, we have ensured a block with HashIndex is less than
-    // kMaxBlockSizeSupportedByHashIndex (64KiB).
-    //
-    // Therefore, if we encounter a block with a size > 64KiB, the block
-    // cannot have HashIndex. So the footer will directly interpreted as
-    // num_restarts.
-    //
-    // Such check is for backward compatibility. We can ensure legacy block
-    // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted
-    // correctly as no HashIndex even if the MSB of num_restarts is set.
-    return num_restarts;
-  }
-  BlockBasedTableOptions::DataBlockIndexType index_type;
-  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
-  return num_restarts;
-}
-
 BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
-  assert(size_ >= 2 * sizeof(uint32_t));
-  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
-    // The check is for the same reason as that in NumRestarts()
-    return BlockBasedTableOptions::kDataBlockBinarySearch;
-  }
-  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
-  uint32_t num_restarts = block_footer;
-  BlockBasedTableOptions::DataBlockIndexType index_type;
-  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
-  return index_type;
+  assert(size() >= DataBlockFooter::kMinEncodedLength);
+  Slice input(data(), size());
+  DataBlockFooter footer;
+  footer.DecodeFrom(&input).PermitUncheckedError();
+  return footer.index_type;
 }
 
 Block::~Block() {
@@ -1057,56 +1399,73 @@ Block::~Block() {
   delete[] kv_checksum_;
 }
 
+Status Block::GetCorruptionStatus() const {
+  // Re-process the footer to get a detailed error status.
+  // This should only be called when size() == 0 (error marker).
+  assert(size() == 0);
+  // When size() == 0 and restart_offset_ != 0, restart_offset_ stores the
+  // original data size for re-decoding the footer to get detailed error.
+  if (restart_offset_ == 0) {
+    return Status::Corruption("bad block contents");
+  }
+  Slice input(contents_.data.data(), restart_offset_);
+  DataBlockFooter footer;
+  Status s = footer.DecodeFrom(&input);
+  if (!s.ok()) {
+    return s;  // Return the detailed error from DecodeFrom
+  }
+  // Footer decoded OK, so error was in later processing (shouldn't happen)
+  DEBUG_FAIL("ok status on presumed bad block contents");
+  return Status::Corruption("presumed bad block contents");
+}
+
 Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
              Statistics* statistics)
-    : contents_(std::move(contents)),
-      data_(contents_.data.data()),
-      size_(contents_.data.size()),
-      restart_offset_(0),
-      num_restarts_(0) {
+    : contents_(std::move(contents)), restart_offset_(0), num_restarts_(0) {
   TEST_SYNC_POINT("Block::Block:0");
-  if (size_ < sizeof(uint32_t)) {
-    size_ = 0;  // Error marker
+  auto& size = contents_.data.size_;
+  // `contents` is assumed to be uncompressed in the proper format
+  Slice input(contents_.data.data(), size);
+  DataBlockFooter footer;
+  Status s = footer.DecodeFrom(&input);
+  if (!s.ok()) {
+    // Save original size for GetCorruptionStatus() to re-decode footer
+    restart_offset_ = static_cast<uint32_t>(size);
+    size = 0;  // Error marker
   } else {
-    // Should only decode restart points for uncompressed blocks
-    num_restarts_ = NumRestarts();
-    switch (IndexType()) {
+    // After DecodeFrom, input has the footer removed. Each case below
+    // may strip additional suffix (e.g., hash index) so that input ends
+    // with just the restart array.
+    num_restarts_ = footer.num_restarts;
+    switch (footer.index_type) {
       case BlockBasedTableOptions::kDataBlockBinarySearch:
-        restart_offset_ = static_cast<uint32_t>(size_) -
-                          (1 + num_restarts_) * sizeof(uint32_t);
-        if (restart_offset_ > size_ - sizeof(uint32_t)) {
-          // The size is too small for NumRestarts() and therefore
-          // restart_offset_ wrapped around.
-          size_ = 0;
-        }
         break;
       case BlockBasedTableOptions::kDataBlockBinaryAndHash:
-        if (size_ < sizeof(uint32_t) /* block footer */ +
-                        sizeof(uint16_t) /* NUM_BUCK */) {
-          size_ = 0;
+        if (input.size() < sizeof(uint16_t) /* NUM_BUCK */) {
+          size = 0;
           break;
         }
-
         uint16_t map_offset;
-        data_block_hash_index_.Initialize(
-            data_, static_cast<uint16_t>(size_ - sizeof(uint32_t)), /*chop off
-                                                                NUM_RESTARTS*/
-            &map_offset);
-
-        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
-
-        if (restart_offset_ > map_offset) {
-          // map_offset is too small for NumRestarts() and
-          // therefore restart_offset_ wrapped around.
-          size_ = 0;
-          break;
-        }
+        data_block_hash_index_.Initialize(contents_.data.data(),
+                                          static_cast<uint16_t>(input.size()),
+                                          &map_offset);
+        // Strip the hash index, leaving just data + restarts
+        input.remove_suffix(input.size() - map_offset);
         break;
       default:
-        size_ = 0;  // Error marker
+        size = 0;  // Error marker
+    }
+    // After the switch, input should end with restarts[num_restarts_]
+    if (size != 0) {
+      if (input.size() < num_restarts_ * sizeof(uint32_t)) {
+        size = 0;  // Block too small for the declared number of restarts
+      } else {
+        restart_offset_ = static_cast<uint32_t>(input.size()) -
+                          num_restarts_ * sizeof(uint32_t);
+      }
     }
   }
-  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
+  if (read_amp_bytes_per_bit != 0 && statistics && size != 0) {
     read_amp_bitmap_.reset(new BlockReadAmpBitmap(
         restart_offset_, read_amp_bytes_per_bit, statistics));
   }
@@ -1148,7 +1507,7 @@ void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
       assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
     }
     if (!iter->status().ok()) {
-      size_ = 0;  // Error marker
+      contents_.data.size_ = 0;  // Error marker
       return;
     }
     protection_bytes_per_key_ = protection_bytes_per_key;
@@ -1197,7 +1556,7 @@ void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
       assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
     }
     if (!iter->status().ok()) {
-      size_ = 0;  // Error marker
+      contents_.data.size_ = 0;  // Error marker
       return;
     }
     protection_bytes_per_key_ = protection_bytes_per_key;
@@ -1231,7 +1590,7 @@ void Block::InitializeMetaIndexBlockProtectionInfo(
       assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
     }
     if (!iter->status().ok()) {
-      size_ = 0;  // Error marker
+      contents_.data.size_ = 0;  // Error marker
       return;
     }
     protection_bytes_per_key_ = protection_bytes_per_key;
@@ -1240,14 +1599,14 @@ void Block::InitializeMetaIndexBlockProtectionInfo(
 
 MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
   MetaBlockIter* iter = new MetaBlockIter();
-  if (size_ < 2 * sizeof(uint32_t)) {
-    iter->Invalidate(Status::Corruption("bad block contents"));
+  if (size() < 2 * sizeof(uint32_t)) {
+    iter->Invalidate(GetCorruptionStatus());
     return iter;
   } else if (num_restarts_ == 0) {
     // Empty block.
     iter->Invalidate(Status::OK());
   } else {
-    iter->Initialize(data_, restart_offset_, num_restarts_,
+    iter->Initialize(data(), restart_offset_, num_restarts_,
                      block_contents_pinned, protection_bytes_per_key_,
                      kv_checksum_, block_restart_interval_);
   }
@@ -1265,8 +1624,8 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
   } else {
     ret_iter = new DataBlockIter;
   }
-  if (size_ < 2 * sizeof(uint32_t)) {
-    ret_iter->Invalidate(Status::Corruption("bad block contents"));
+  if (size() < 2 * sizeof(uint32_t)) {
+    ret_iter->Invalidate(GetCorruptionStatus());
     return ret_iter;
   }
   if (num_restarts_ == 0) {
@@ -1275,7 +1634,7 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
     return ret_iter;
   } else {
     ret_iter->Initialize(
-        raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
+        raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
         read_amp_bitmap_.get(), block_contents_pinned,
         user_defined_timestamps_persisted,
         data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr,
@@ -1296,15 +1655,16 @@ IndexBlockIter* Block::NewIndexIterator(
     IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
     bool have_first_key, bool key_includes_seq, bool value_is_full,
     bool block_contents_pinned, bool user_defined_timestamps_persisted,
-    BlockPrefixIndex* prefix_index) {
+    BlockPrefixIndex* prefix_index,
+    BlockBasedTableOptions::BlockSearchType index_block_search_type) {
   IndexBlockIter* ret_iter;
   if (iter != nullptr) {
     ret_iter = iter;
   } else {
     ret_iter = new IndexBlockIter;
   }
-  if (size_ < 2 * sizeof(uint32_t)) {
-    ret_iter->Invalidate(Status::Corruption("bad block contents"));
+  if (size() < 2 * sizeof(uint32_t)) {
+    ret_iter->Invalidate(GetCorruptionStatus());
     return ret_iter;
   }
   if (num_restarts_ == 0) {
@@ -1314,11 +1674,12 @@ IndexBlockIter* Block::NewIndexIterator(
   } else {
     BlockPrefixIndex* prefix_index_ptr =
         total_order_seek ? nullptr : prefix_index;
-    ret_iter->Initialize(
-        raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
-        prefix_index_ptr, have_first_key, key_includes_seq, value_is_full,
-        block_contents_pinned, user_defined_timestamps_persisted,
-        protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
+    ret_iter->Initialize(raw_ucmp, data(), restart_offset_, num_restarts_,
+                         global_seqno, prefix_index_ptr, have_first_key,
+                         key_includes_seq, value_is_full, block_contents_pinned,
+                         user_defined_timestamps_persisted,
+                         protection_bytes_per_key_, kv_checksum_,
+                         block_restart_interval_, index_block_search_type);
   }
 
   return ret_iter;
diff --git a/table/block_based/block.h b/table/block_based/block.h
index 2cd2918a82d7..2187ff8c1e3b 100644
--- a/table/block_based/block.h
+++ b/table/block_based/block.h
@@ -163,11 +163,11 @@ class Block {
 
   ~Block();
 
-  size_t size() const { return size_; }
-  const char* data() const { return data_; }
+  size_t size() const { return contents_.data.size(); }
+  const char* data() const { return contents_.data.data(); }
   // The additional memory space taken by the block data.
   size_t usable_size() const { return contents_.usable_size(); }
-  uint32_t NumRestarts() const;
+  uint32_t NumRestarts() const { return num_restarts_; }
   bool own_bytes() const { return contents_.own_bytes(); }
 
   BlockBasedTableOptions::DataBlockIndexType IndexType() const;
@@ -233,13 +233,19 @@ class Block {
   // It is determined by IndexType property of the table.
   // `user_defined_timestamps_persisted` controls whether a min timestamp is
   // padded while key is being parsed from the block.
+  // `index_block_search_type` controls which search algorithm to use when
+  // reading the index block. kBinary uses binary search, while
+  // kInterpolation uses interpolation search which can be faster
+  // for uniformly distributed keys.
   IndexBlockIter* NewIndexIterator(
       const Comparator* raw_ucmp, SequenceNumber global_seqno,
       IndexBlockIter* iter, Statistics* stats, bool total_order_seek,
       bool have_first_key, bool key_includes_seq, bool value_is_full,
       bool block_contents_pinned = false,
       bool user_defined_timestamps_persisted = true,
-      BlockPrefixIndex* prefix_index = nullptr);
+      BlockPrefixIndex* prefix_index = nullptr,
+      BlockBasedTableOptions::BlockSearchType index_block_search_type =
+          BlockBasedTableOptions::kBinary);
 
   // Report an approximation of how much memory has been used.
   size_t ApproximateMemoryUsage() const;
@@ -276,10 +282,15 @@ class Block {
   const char* TEST_GetKVChecksum() const { return kv_checksum_; }
 
  private:
+  // Returns a detailed error status by re-processing the footer.
+  // Should only be called when size() == 0 (error marker).
+  Status GetCorruptionStatus() const;
+
   BlockContents contents_;
-  const char* data_;         // contents_.data.data()
-  size_t size_;              // contents_.data.size()
-  uint32_t restart_offset_;  // Offset in data_ of restart array
+  // Normal state: offset in data_ of restart array.
+  // Error state (size()==0): original data size if footer decode failed,
+  //   otherwise 0. Used by GetCorruptionStatus() to re-decode footer.
+  uint32_t restart_offset_;
   uint32_t num_restarts_;
   std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
   char* kv_checksum_{nullptr};
@@ -428,7 +439,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
   Cache::Handle* cache_handle() { return cache_handle_; }
 
  protected:
-  std::unique_ptr<InternalKeyComparator> icmp_;
+  InternalKeyComparator icmp_;
   const char* data_;       // underlying block contents
   uint32_t num_restarts_;  // Number of uint32_t entries in restart array
 
@@ -530,17 +541,15 @@ class BlockIter : public InternalIteratorBase<TValue> {
                       uint32_t block_restart_interval) {
     assert(data_ == nullptr);  // Ensure it is called only once
     assert(num_restarts > 0);  // Ensure the param is valid
-
-    icmp_ = std::make_unique<InternalKeyComparator>(raw_ucmp);
+    assert(raw_ucmp != nullptr);
+    icmp_ = InternalKeyComparator(raw_ucmp);
     data_ = data;
     restarts_ = restarts;
     num_restarts_ = num_restarts;
     current_ = restarts_;
     restart_index_ = num_restarts_;
     global_seqno_ = global_seqno;
-    if (raw_ucmp != nullptr) {
-      ts_sz_ = raw_ucmp->timestamp_size();
-    }
+    ts_sz_ = raw_ucmp->timestamp_size();
     pad_min_timestamp_ = ts_sz_ > 0 && !user_defined_timestamp_persisted;
     block_contents_pinned_ = block_contents_pinned;
     cache_handle_ = nullptr;
@@ -573,14 +582,18 @@ class BlockIter : public InternalIteratorBase<TValue> {
     CorruptionError(error_msg);
   }
 
-  void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
+  void UpdateRawKeyAndMaybePadMinTimestamp(IterKey& raw_key, const Slice& key) {
     if (pad_min_timestamp_) {
-      raw_key_.SetKeyWithPaddedMinTimestamp(key, ts_sz_);
+      raw_key.SetKeyWithPaddedMinTimestamp(key, ts_sz_);
     } else {
-      raw_key_.SetKey(key, false /* copy */);
+      raw_key.SetKey(key, false /* copy */);
     }
   }
 
+  void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
+    UpdateRawKeyAndMaybePadMinTimestamp(raw_key_, key);
+  }
+
   // Must be called every time a key is found that needs to be returned to user,
   // and may be called when no key is found (as a no-op). Updates `key_`,
   // `key_buf_`, and `key_pinned_` with info about the found key.
@@ -620,18 +633,31 @@ class BlockIter : public InternalIteratorBase<TValue> {
     }
   }
 
-  // Returns the result of `Comparator::Compare()`, where the appropriate
-  // comparator is used for the block contents, the LHS argument is the current
-  // key with global seqno applied, and the RHS argument is `other`.
-  int CompareCurrentKey(const Slice& other) {
+  // Compares two keys using the appropriate comparator for the block contents.
+  // Uses user comparator when the block stores user keys, otherwise uses the
+  // internal key comparator. When global_seqno is not disabled, applies it to
+  // the LHS key for comparison.
+  int CompareKey(const Slice& a, const Slice& b) const {
+    assert(icmp_.user_comparator() != nullptr);
     if (raw_key_.IsUserKey()) {
       assert(global_seqno_ == kDisableGlobalSequenceNumber);
-      return icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), other);
+      return icmp_.user_comparator()->Compare(a, b);
     } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
-      return icmp_->Compare(raw_key_.GetInternalKey(), other);
+      return icmp_.Compare(a, b);
+    }
+    return icmp_.Compare(a, global_seqno_, b, kDisableGlobalSequenceNumber);
+  }
+
+  int CompareKey(const IterKey& a, const Slice& b) const {
+    if (a.IsUserKey()) {
+      return CompareKey(a.GetUserKey(), b);
     }
-    return icmp_->Compare(raw_key_.GetInternalKey(), global_seqno_, other,
-                          kDisableGlobalSequenceNumber);
+    return CompareKey(a.GetInternalKey(), b);
+  }
+
+  // Compares the current key (with global seqno applied) against `other`.
+  int CompareCurrentKey(const Slice& other) const {
+    return CompareKey(raw_key_, other);
   }
 
  private:
@@ -666,8 +692,16 @@ class BlockIter : public InternalIteratorBase<TValue> {
 
  protected:
   template <typename DecodeKeyFunc>
-  inline bool BinarySeek(const Slice& target, uint32_t* index,
-                         bool* is_index_key_result);
+  inline bool GetRestartKey(uint32_t index, Slice* key);
+
+  template <typename DecodeKeyFunc>
+  inline bool BinarySeekRestartPointIndex(const Slice& target, uint32_t* index,
+                                          bool* is_index_key_result);
+
+  template <typename DecodeKeyFunc>
+  inline bool InterpolationSeekRestartPointIndex(const Slice& target,
+                                                 uint32_t* index,
+                                                 bool* is_index_key_result);
 
   // Find the first key in restart interval `index` that is >= `target`.
   // If there is no such key, iterator is positioned at the first key in
@@ -831,14 +865,14 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
   // format.
   // value_is_full, default true, means that no delta encoding is
   // applied to values.
-  void Initialize(const Comparator* raw_ucmp, const char* data,
-                  uint32_t restarts, uint32_t num_restarts,
-                  SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
-                  bool have_first_key, bool key_includes_seq,
-                  bool value_is_full, bool block_contents_pinned,
-                  bool user_defined_timestamps_persisted,
-                  uint8_t protection_bytes_per_key, const char* kv_checksum,
-                  uint32_t block_restart_interval) {
+  void Initialize(
+      const Comparator* raw_ucmp, const char* data, uint32_t restarts,
+      uint32_t num_restarts, SequenceNumber global_seqno,
+      BlockPrefixIndex* prefix_index, bool have_first_key,
+      bool key_includes_seq, bool value_is_full, bool block_contents_pinned,
+      bool user_defined_timestamps_persisted, uint8_t protection_bytes_per_key,
+      const char* kv_checksum, uint32_t block_restart_interval,
+      BlockBasedTableOptions::BlockSearchType index_block_search_type) {
     InitializeBase(raw_ucmp, data, restarts, num_restarts,
                    kDisableGlobalSequenceNumber, block_contents_pinned,
                    user_defined_timestamps_persisted, protection_bytes_per_key,
@@ -847,6 +881,7 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
     prefix_index_ = prefix_index;
     value_delta_encoded_ = !value_is_full;
     have_first_key_ = have_first_key;
+    index_search_type_ = index_block_search_type;
     if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
       global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
     } else {
@@ -941,6 +976,10 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
   // `pad_min_timestamp_` is true.
   std::string first_internal_key_with_ts_;
 
+  // The search algorithm to use when reading the index block.
+  BlockBasedTableOptions::BlockSearchType index_search_type_ =
+      BlockBasedTableOptions::kBinary;
+
   // Set *prefix_may_exist to false if no key possibly share the same prefix
   // as `target`. If not set, the result position should be the same as total
   // order Seek.
@@ -953,6 +992,10 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
                             bool* prefix_may_exist);
   inline int CompareBlockKey(uint32_t block_index, const Slice& target);
 
+  template <typename DecodeKeyFunc>
+  bool FindRestartPointForSeek(const Slice& seek_key, uint32_t* index,
+                               bool* skip_linear_scan);
+
   inline bool ParseNextIndexKey();
 
   // When value_delta_encoded_ is enabled it decodes the value which is assumed
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 3fb7b2dbdaf4..c080dcb5cca1 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -46,14 +46,17 @@
 #include "table/block_based/filter_policy_internal.h"
 #include "table/block_based/full_filter_block.h"
 #include "table/block_based/partitioned_filter_block.h"
+#include "table/block_based/user_defined_index_wrapper.h"
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "table/table_builder.h"
+#include "util/bit_fields.h"
 #include "util/coding.h"
 #include "util/compression.h"
+#include "util/defer.h"
+#include "util/semaphore.h"
 #include "util/stop_watch.h"
 #include "util/string_util.h"
-#include "util/work_queue.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -107,90 +110,20 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
   }
 }
 
-bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size,
-                          int max_compressed_bytes_per_kb) {
-  // For efficiency, avoid floating point and division
-  return compressed_size <=
-         (static_cast<uint64_t>(max_compressed_bytes_per_kb) * uncomp_size) >>
-         10;
-}
-
-}  // namespace
-
-// format_version is the block format as defined in include/rocksdb/table.h
-Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
-                    CompressionType* type, uint32_t format_version,
-                    bool allow_sample, std::string* compressed_output,
-                    std::string* sampled_output_fast,
-                    std::string* sampled_output_slow) {
-  assert(type);
-  assert(compressed_output);
-  assert(compressed_output->empty());
-
-  // If requested, we sample one in every N block with a
-  // fast and slow compression algorithm and report the stats.
-  // The users can use these stats to decide if it is worthwhile
-  // enabling compression and they also get a hint about which
-  // compression algorithm wil be beneficial.
-  if (allow_sample && info.SampleForCompression() &&
-      Random::GetTLSInstance()->OneIn(
-          static_cast<int>(info.SampleForCompression()))) {
-    // Sampling with a fast compression algorithm
-    if (sampled_output_fast && (LZ4_Supported() || Snappy_Supported())) {
-      CompressionType c =
-          LZ4_Supported() ? kLZ4Compression : kSnappyCompression;
-      CompressionOptions options;
-      CompressionContext context(c, options);
-      CompressionInfo info_tmp(options, context,
-                               CompressionDict::GetEmptyDict(), c,
-                               info.SampleForCompression());
-
-      CompressData(uncompressed_data, info_tmp,
-                   GetCompressFormatForVersion(format_version),
-                   sampled_output_fast);
-    }
-
-    // Sampling with a slow but high-compression algorithm
-    if (sampled_output_slow && (ZSTD_Supported() || Zlib_Supported())) {
-      CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression;
-      CompressionOptions options;
-      CompressionContext context(c, options);
-      CompressionInfo info_tmp(options, context,
-                               CompressionDict::GetEmptyDict(), c,
-                               info.SampleForCompression());
-
-      CompressData(uncompressed_data, info_tmp,
-                   GetCompressFormatForVersion(format_version),
-                   sampled_output_slow);
-    }
-  }
-
-  int max_compressed_bytes_per_kb = info.options().max_compressed_bytes_per_kb;
-  if (info.type() == kNoCompression || max_compressed_bytes_per_kb <= 0) {
-    *type = kNoCompression;
-    return uncompressed_data;
-  }
-
-  // Actually compress the data; if the compression method is not supported,
-  // or the compression fails etc., just fall back to uncompressed
-  if (!CompressData(uncompressed_data, info,
-                    GetCompressFormatForVersion(format_version),
-                    compressed_output)) {
-    *type = kNoCompression;
-    return uncompressed_data;
-  }
-
-  // Check the compression ratio; if it's not good enough, just fall back to
-  // uncompressed
-  if (!GoodCompressionRatio(compressed_output->size(), uncompressed_data.size(),
-                            max_compressed_bytes_per_kb)) {
-    *type = kNoCompression;
-    return uncompressed_data;
+// A convenience function for populating the Compressor* fields; see ~Rep()
+Compressor* MaybeCloneSpecialized(
+    Compressor* compressor, CacheEntryRole block_type,
+    Compressor::DictConfigArgs&& dict_config = Compressor::DictDisabled{}) {
+  auto specialized =
+      compressor->MaybeCloneSpecialized(block_type, std::move(dict_config));
+  if (specialized) {
+    // Caller is responsible for freeing when distinct
+    return specialized.release();
+  } else {
+    return compressor;
   }
-
-  *type = info.type();
-  return *compressed_output;
 }
+}  // namespace
 
 // kBlockBasedTableMagicNumber was picked by running
 //    echo rocksdb.table.block_based | sha1sum
@@ -201,9 +134,6 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
 // allocated
 // it must be not extern in one place.
 const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
-// We also support reading and writing legacy block based table format (for
-// backwards compatibility)
-const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
 
 // A collector that collects properties of interest to block-based table.
 // For now this class looks heavy-weight since we only write one additional
@@ -268,6 +198,587 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
   bool decoupled_partitioned_filters_;
 };
 
+struct BlockBasedTableBuilder::WorkingAreaPair {
+  Compressor::ManagedWorkingArea compress;
+  Decompressor::ManagedWorkingArea verify;
+};
+
+// ParallelCompressionRep essentially defines a framework for parallelizing
+// block generation ("emit"), block compression, and block writing to storage.
+// The synchronization is lock-free/wait-free, so thread waiting only happens
+// when work-order dependencies are unsatisfied, though sleeping/idle threads
+// might be kept idle when it seems unlikely they would improve throughput by
+// waking them up (essentially auto-tuned parallelism). But because all threads
+// are capable of 2 out of 3 kinds of work, in a quasi-work-stealing system,
+// running threads can usually expect that compatible work is available.
+//
+// This is currently activated with CompressionOptions::parallel_threads > 1
+// but that is a somewhat crude API that would ideally be adapted along with
+// the implementation in the future to allow threads to serve multiple
+// flush/compaction jobs, though the available improvement might be small.
+// Even within the scope of a single file it might be nice to use a general
+// framework for distributing work across threads, but (a) different threads
+// are limited to which work they can do because of technical challenges, (b)
+// being largely CPU bound on small work units means such a framework would
+// likely have big overheads compared to this hand-optimized solution.
+struct BlockBasedTableBuilder::ParallelCompressionRep {
+  // The framework has two kinds of threads: the calling thread from
+  // flush/compaction/SstFileWriter is called the "emit thread" (kEmitter).
+  // Other threads cannot generally take over "emit" work because that is
+  // largely happening up the call stack from BlockBasedTableBuilder.
+  // The emit thread can also take on compression work in a quasi-work-stealing
+  // manner when the buffer for emitting new blocks is full.
+  //
+  // When parallelism is enabled, there are also "worker" threads that
+  // can handle compressing blocks and (one worker thread at a time) write them
+  // to the SST file (and handle other single-threaded wrap-up of each block).
+  //
+  // NOTE: when parallelism is enabled, the emit thread is not permitted to
+  // write to the SST file because that is the potential "output" bottleneck,
+  // and it's generally bad for parallelism to allow the only thread that can
+  // serve the "input" bottleneck (emit work) to also spend exclusive time on
+  // the output bottleneck.
+  enum class ThreadKind {
+    kEmitter,
+    kWorker,
+  };
+
+  // ThreadState allows each thread to track its work assignment. In addition to
+  // the cases already mentioned, kEmitting, kCompressing, and kWriting to the
+  // SST file writer,
+  // * Threads can enter the kIdle state so that they can sleep when no work is
+  // available for them, to be woken up when appropriate.
+  // * The kEnd state means the thread is not doing any more work items, which
+  // for worker threads means they will end soon.
+  // * The kCompressingAndWriting state means a worker can compress and write a
+  // block without additional state updates because the same block to be
+  // compressed is the next to be written.
+  enum class ThreadState {
+    /* BEGIN Emitter only states */
+    kEmitting,
+    /* END Emitter only states */
+    /* BEGIN states for emitter and worker */
+    kIdle,
+    kCompressing,
+    kEnd,
+    /* END states for emitter and worker */
+    /* BEGIN Worker only states */
+    kCompressingAndWriting,
+    kWriting,
+    /* END Worker only states */
+  };
+
+  // BlockRep instances are used and reused in a ring buffer (below), so that
+  // many blocks can be in an intermediate state between serialized into
+  // uncompressed bytes and written to the SST file. Notably, each block is
+  // "emitted" in uncompressed form into a BlockRep, compressed (at least
+  // attempted, when configured) for updated BlockRep, and then written from the
+  // BlockRep to the writer for the SST file bytes.
+  struct ALIGN_AS(CACHE_LINE_SIZE) BlockRep {
+    // Uncompressed block contents
+    std::string uncompressed;
+    GrowableBuffer compressed;
+    CompressionType compression_type = kNoCompression;
+    std::unique_ptr<IndexBuilder::PreparedIndexEntry> prepared_index_entry;
+  };
+
+  // Ring buffer of emitted blocks that may or may not yet be compressed.
+  std::unique_ptr<BlockRep[]> ring_buffer;
+  // log_2(ring buffer size), where ring buffer size must be a power of two
+  const int ring_buffer_nbits;
+  // ring buffer size - 1, to function as a bit mask for ring buffer positions
+  // (e.g. given the ordinal number of a block)
+  const uint32_t ring_buffer_mask;
+  // Number of threads in worker_threads. (Emit thread doesn't count)
+  const uint32_t num_worker_threads;
+
+  // Rough upper bound on the sst file size contribution from blocks emitted
+  // into the parallel compression ring buffer but not yet written. Tracks
+  // uncompressed size, with trailer, until a block is compressed, then
+  // compressed size until the block is written. (TODO: does not currently
+  // account for block_align)
+  RelaxedAtomic<uint64_t> estimated_inflight_size{0};
+  // Thread objects for worker threads
+  std::vector<port::Thread> worker_threads;
+  // Working areas for data_block_compressor for each worker thread
+  std::vector<WorkingAreaPair> working_areas;
+
+  // Semaphores for threads to sleep when there's no available work for them
+  // and to wake back up when someone determines there is available work (most
+  // likely). Split between worker threads and emit thread because they can do
+  // different kinds of work.
+  CountingSemaphore idle_worker_sem{0};
+  BinarySemaphore idle_emit_sem{0};
+
+  // Primary atomic state of parallel compression, which includes a number of
+  // state fields that are best updated atomically to avoid locking and/or to
+  // simplify the interesting interleavings that have to be considered and
+  // accommodated.
+  struct State : public BitFields<uint64_t, State> {};
+  ALIGN_AS(CACHE_LINE_SIZE) BitFieldsAtomic<State> atomic_state;
+
+  // The first field is a bit for each ring buffer slot (max 32) for whether
+  // that slot is ready to be claimed for writing by a worker thread. Because
+  // compressions might finish out-of-order, we need to track individually
+  // whether they are finished, though this field doesn't differentiate
+  // "compression completed" from "compression not started" because that can be
+  // inferred from NextToCompress. A block might not enter this state, because
+  // the same thread that compresses it can also immediately write the block if
+  // it notices that the block is next to write.
+  using NeedsWriter = UnsignedBitField<State, 32, NoPrevBitField>;
+  // Track how many worker threads are in an idle state because there was no
+  // available work and haven't been selected to wake back up.
+  using IdleWorkerCount = UnsignedBitField<State, 5, NeedsWriter>;
+  // Track whether the emit thread is an idle state because there was no
+  // available work and hasn't been triggered to wake back up. The nature of
+  // available work and atomic CAS assignment of work ensures at least one
+  // thread is kept out of the idle state.
+  using IdleEmitFlag = BoolBitField<State, IdleWorkerCount>;
+  // Track whether threads should end when they finish available work because no
+  // more blocks will be emitted.
+  using NoMoreToEmitFlag = BoolBitField<State, IdleEmitFlag>;
+  // Track whether threads should abort ASAP because of an error.
+  using AbortFlag = BoolBitField<State, NoMoreToEmitFlag>;
+  // Track three "NextTo" counters for the positions of the next block to write,
+  // to start compression, and to emit into the ring buffer. If these counters
+  // never overflowed / wrapped around, we would have next_to_write <=
+  // next_to_compress <= next_to_emit because a block must be emitted before
+  // compressed, and compressed (at least attempted) before writing. We need to
+  // track more than ring_buffer_nbits of these counters to be able to
+  // distinguish an empty ring buffer (next_to_write == next_to_emit) from a
+  // full ring buffer (next_to_write != next_to_emit but equal under
+  // ring_buffer_mask).
+  using NextToWrite = UnsignedBitField<State, 8, AbortFlag>;
+  using NextToCompress = UnsignedBitField<State, 8, NextToWrite>;
+  using NextToEmit = UnsignedBitField<State, 8, NextToCompress>;
+  static_assert(NextToEmit::kEndBit == 64);
+
+  // BEGIN fields for use by the emit thread only. These can't live on the stack
+  // because the emit thread frequently returns out of BlockBasedTableBuilder.
+  ALIGN_AS(CACHE_LINE_SIZE)
+  ThreadState emit_thread_state = ThreadState::kEmitting;
+  // Ring buffer index that emit thread is operating on (for emitting and
+  // compressing states)
+  uint32_t emit_slot = 0;
+  // Including some data to inform when to wake up idle worker threads (see
+  // implementation for details)
+  int32_t emit_counter_toward_wake_up = 0;
+  int32_t emit_counter_for_wake_up = 0;
+  static constexpr int32_t kMaxWakeupInterval = 8;
+  // END fields for use by the emit thread only
+
+  // TSAN on GCC has bugs that report false positives on this watchdog code.
+  // Other efforts to work around the bug have failed, so to avoid those false
+  // positive reports, we simply disable the watchdog when running under GCC
+  // TSAN.
+#if !defined(NDEBUG) && !(defined(__GNUC__) && defined(__SANITIZE_THREAD__))
+#define BBTB_PC_WATCHDOG 1
+#endif
+#ifdef BBTB_PC_WATCHDOG
+  // These are for an extra "watchdog" thread in DEBUG builds that heuristically
+  // checks for the most likely deadlock conditions. False positives and false
+  // negatives are technically possible.
+  std::thread watchdog_thread;
+  std::mutex watchdog_mutex;
+  std::condition_variable watchdog_cv;
+  bool shutdown_watchdog = false;
+  RelaxedAtomic<uint32_t> live_workers{0};
+  RelaxedAtomic<uint32_t> idling_workers{0};
+  RelaxedAtomic<bool> live_emit{0};
+  RelaxedAtomic<bool> idling_emit{0};
+#endif  // BBTB_PC_WATCHDOG
+
+  int ComputeRingBufferNbits(uint32_t parallel_threads) {
+    // Ring buffer size is a power of two not to exceed 32 but otherwise
+    // at least twice the number of threads.
+    if (parallel_threads >= 9) {
+      return 5;
+    } else if (parallel_threads >= 5) {
+      return 4;
+    } else if (parallel_threads >= 3) {
+      return 3;
+    } else {
+      assert(parallel_threads > 1);
+      return 2;
+    }
+  }
+
+  explicit ParallelCompressionRep(uint32_t parallel_threads)
+      : ring_buffer_nbits(ComputeRingBufferNbits(parallel_threads)),
+        ring_buffer_mask((uint32_t{1} << ring_buffer_nbits) - 1),
+        num_worker_threads(std::min(parallel_threads, ring_buffer_mask)) {
+    assert(num_worker_threads <= IdleWorkerCount::kMask);
+
+    ring_buffer = std::make_unique<BlockRep[]>(ring_buffer_mask + 1);
+
+    // Start by aggressively waking up idle workers
+    emit_counter_for_wake_up = -static_cast<int32_t>(num_worker_threads);
+  }
+
+  ~ParallelCompressionRep() {
+#ifndef NDEBUG
+    auto state = atomic_state.Load();
+    if (state.Get<AbortFlag>() == false) {
+      // Should be clear / cancelled out with normal shutdown
+      assert(state.Get<NeedsWriter>() == 0);
+
+      // Ring buffer reached empty state
+      assert(state.Get<NextToWrite>() == state.Get<NextToCompress>());
+      assert(state.Get<NextToCompress>() == state.Get<NextToEmit>());
+
+      // Everything cancels out in inflight size
+      assert(estimated_inflight_size.LoadRelaxed() == 0);
+    }
+    // All idling metadata cleaned up, properly tracked
+    assert(state.Get<IdleWorkerCount>() == 0);
+    assert(state.Get<IdleEmitFlag>() == false);
+
+    // No excess in semaphores
+    assert(!idle_emit_sem.TryAcquire());
+    assert(!idle_worker_sem.TryAcquire());
+#endif  // !NDEBUG
+  }
+
+  // The primary function for a thread transitioning from one state or work
+  // assignment to the next. `slot` refers to a position in the ring buffer
+  // for assigned emit, compression, or write work.
+  //
+  // Because both the emit thread and worker threads can work on compression,
+  // this is a quasi-work-stealing parallel algorithm. (Enabling other threads
+  // to do emit work would be quite challenging, and allowing the emit thread
+  // to handle writes could create a bottle-neck.)
+  //
+  // This function is basically a CAS loop trying to pick the next piece of work
+  // for this thread and retrying if CAS fails. This function also handles
+  // thread idling when that's the appropriate assignment, continuing the loop
+  // looking for productive work when woken from an idle state.
+  //
+  // Precondition: thread_state is appropriate for thread_kind and not kEnd. It
+  // must match the previously returned state for that thread, and is only kIdle
+  // for the thread on startup (though the kIdle state is used internal to the
+  // function).
+  //
+  // Postcondition: thread_state is appropriate for thread_kind and not kIdle.
+  // Except for kEnd state, the calling thread has exclusive access to
+  // ring_buffer[slot] until next StateTransition().
+  template <ThreadKind thread_kind>
+  void StateTransition(
+      /*in/out*/ ThreadState& thread_state,
+      /*in/out*/ uint32_t& slot) {
+    assert(slot <= ring_buffer_mask);
+    // Last known value for atomic_state
+    State seen_state = atomic_state.Load();
+
+    for (;;) {
+      if (seen_state.Get<AbortFlag>()) {
+        thread_state = ThreadState::kEnd;
+        return;
+      }
+
+      assert(static_cast<uint8_t>(seen_state.Get<NextToEmit>() -
+                                  seen_state.Get<NextToCompress>()) <=
+             ring_buffer_mask + 1);
+      assert(static_cast<uint8_t>(seen_state.Get<NextToCompress>() -
+                                  seen_state.Get<NextToWrite>()) <=
+             ring_buffer_mask + 1);
+      assert(static_cast<uint8_t>(seen_state.Get<NextToEmit>() -
+                                  seen_state.Get<NextToWrite>()) <=
+             ring_buffer_mask + 1);
+
+      // Draft of the next proposed atomic_state. Start by marking completion of
+      // the current thread's last work.
+      State next_state = seen_state;
+      bool wake_idle = false;
+      switch (thread_state) {
+        case ThreadState::kEmitting: {
+          assert(thread_kind == ThreadKind::kEmitter);
+          assert(slot == (next_state.Get<NextToEmit>() & ring_buffer_mask));
+          next_state.Ref<NextToEmit>() += 1;
+          // Check whether to wake up idle worker thread
+          if (next_state.Get<IdleWorkerCount>() > 0 &&
+              // The number of blocks for which compression hasn't started
+              // is well over the number of active threads.
+              static_cast<uint8_t>(next_state.Get<NextToEmit>() -
+                                   next_state.Get<NextToCompress>()) >=
+                  (ring_buffer_mask + 1) / 4 +
+                      (num_worker_threads -
+                       next_state.Get<IdleWorkerCount>())) {
+            // At first, emit_counter_for_wake_up is negative to aggressively
+            // wake up idle worker threads. Then it backs off the interval at
+            // which we wake up, up to some maximum that attempts to balance
+            // maximum throughput and minimum CPU overhead.
+            if (emit_counter_toward_wake_up >= emit_counter_for_wake_up) {
+              // We reached a threshold to justify a wake-up.
+              wake_idle = true;
+              // Adjust idle count assuming we are going to own waking it up,
+              // so no one else can duplicate that. (The idle count is really
+              // the number idling for which no one yet owns waking them up.)
+              next_state.Ref<IdleWorkerCount>() -= 1;
+              // Reset the counter toward the threshold for wake-up
+              emit_counter_toward_wake_up = 0;
+              // Raise the threshold (up to some limit) to stabilize the number
+              // of active threads after some ramp-up period.
+              emit_counter_for_wake_up =
+                  std::min(emit_counter_for_wake_up + 1,
+                           static_cast<int32_t>(num_worker_threads +
+                                                kMaxWakeupInterval));
+            } else {
+              // Advance closer to the threshold for justifying a wake-up
+              emit_counter_toward_wake_up++;
+            }
+          }
+          break;
+        }
+        case ThreadState::kIdle:
+          // NOTE: thread that signalled to wake up already updated idle count
+          // or marker. This is required to avoid overflow on the semaphore,
+          // especially the binary semaphore for idle_emit_sem, and likely
+          // desirable to avoid spurious/extra Release().
+          break;
+        case ThreadState::kCompressing:
+          next_state.Ref<NeedsWriter>() |= uint32_t{1} << slot;
+          if constexpr (thread_kind == ThreadKind::kEmitter) {
+            if (next_state.Get<IdleWorkerCount>() == num_worker_threads) {
+              // Work is available for a worker thread and none are running
+              wake_idle = true;
+              // Adjust idle count assuming we are going to own waking it up
+              next_state.Ref<IdleWorkerCount>() -= 1;
+            }
+          }
+          break;
+        case ThreadState::kEnd:
+          // Should have already recognized the end state
+          assert(thread_state != ThreadState::kEnd);
+          return;
+        case ThreadState::kCompressingAndWriting:
+        case ThreadState::kWriting:
+          assert(thread_kind == ThreadKind::kWorker);
+          assert((next_state.Get<NextToWrite>() & ring_buffer_mask) == slot);
+          assert(next_state.Get<NextToCompress>() !=
+                 next_state.Get<NextToWrite>());
+          assert(next_state.Get<NextToEmit>() != next_state.Get<NextToWrite>());
+          assert((next_state.Get<NeedsWriter>() & (uint32_t{1} << slot)) == 0);
+          next_state.Ref<NextToWrite>() += 1;
+          if (next_state.Get<IdleEmitFlag>()) {
+            wake_idle = true;
+            // Clear idle emit flag assuming we are going to own waking it up
+            next_state.Set<IdleEmitFlag>(false);
+          }
+          break;
+      }
+
+      // Find the next state, depending on the kind of thread
+      ThreadState next_thread_state = ThreadState::kEnd;
+      uint32_t next_slot = 0;
+      if constexpr (thread_kind == ThreadKind::kEmitter) {
+        // First priority is emitting more uncompressed blocks, if there's
+        // room in the ring buffer.
+        if (static_cast<uint8_t>(next_state.Get<NextToEmit>() -
+                                 next_state.Get<NextToWrite>()) <=
+            ring_buffer_mask) {
+          // There is room
+          next_thread_state = ThreadState::kEmitting;
+          next_slot = next_state.Get<NextToEmit>() & ring_buffer_mask;
+        }
+      }
+      if constexpr (thread_kind == ThreadKind::kWorker) {
+        // First priority is writing next block to write, if it needs a writer
+        // assigned to it
+        uint32_t next_to_write_slot =
+            next_state.Get<NextToWrite>() & ring_buffer_mask;
+        uint32_t needs_writer_bit = uint32_t{1} << next_to_write_slot;
+        if (next_state.Get<NeedsWriter>() & needs_writer_bit) {
+          // Clear the "needs writer" marker on the slot
+          next_state.Ref<NeedsWriter>() &= ~needs_writer_bit;
+          // Take ownership of writing it
+          next_thread_state = ThreadState::kWriting;
+          next_slot = next_to_write_slot;
+        }
+      }
+
+      // If didn't find higher priority work
+      if (next_thread_state == ThreadState::kEnd) {
+        if (next_state.Get<NextToCompress>() != next_state.Get<NextToEmit>()) {
+          // Compression work is available, select that
+          if (thread_kind == ThreadKind::kWorker &&
+              next_state.Get<NextToCompress>() ==
+                  next_state.Get<NextToWrite>()) {
+            next_thread_state = ThreadState::kCompressingAndWriting;
+          } else {
+            next_thread_state = ThreadState::kCompressing;
+          }
+          next_slot = next_state.Get<NextToCompress>() & ring_buffer_mask;
+          next_state.Ref<NextToCompress>() += 1;
+        } else if constexpr (thread_kind == ThreadKind::kEmitter) {
+          // Emitter thread goes idle
+          next_thread_state = ThreadState::kIdle;
+          assert(next_state.Get<IdleEmitFlag>() == false);
+          assert(next_state.Get<NoMoreToEmitFlag>() == false);
+          next_state.Set<IdleEmitFlag>(true);
+        } else if (next_state.Get<NoMoreToEmitFlag>()) {
+          // Worker thread shall not idle if we are done emitting. At least
+          // one worker will remain unblocked to finish writing
+          next_thread_state = ThreadState::kEnd;
+        } else {
+          // Worker thread goes idle
+          next_thread_state = ThreadState::kIdle;
+          assert(next_state.Get<IdleWorkerCount>() < IdleWorkerCount::kMask);
+          next_state.Ref<IdleWorkerCount>() += 1;
+        }
+      }
+      assert(thread_state != ThreadState::kEnd);
+
+      // Attempt to atomically apply the desired/computed state transition
+      if (atomic_state.CasWeak(seen_state, next_state)) {
+        // Success
+        thread_state = next_thread_state;
+        slot = next_slot;
+        seen_state = next_state;
+        if (wake_idle) {
+          if constexpr (thread_kind == ThreadKind::kEmitter) {
+            idle_worker_sem.Release();
+          } else {
+            idle_emit_sem.Release();
+          }
+        }
+        if (thread_state != ThreadState::kIdle) {
+          // Successfully transitioned to another useful state
+          return;
+        }
+        // Handle idle state
+        if constexpr (thread_kind == ThreadKind::kEmitter) {
+#ifdef BBTB_PC_WATCHDOG
+          idling_emit.StoreRelaxed(true);
+          Defer decr{[this]() { idling_emit.StoreRelaxed(false); }};
+#endif  // BBTB_PC_WATCHDOG
+
+          // Likely go to sleep
+          idle_emit_sem.Acquire();
+        } else {
+#ifdef BBTB_PC_WATCHDOG
+          // Tracking for watchdog
+          idling_workers.FetchAddRelaxed(1);
+          Defer decr{[this]() { idling_workers.FetchSubRelaxed(1); }};
+#endif  // BBTB_PC_WATCHDOG
+
+          // Likely go to sleep
+          idle_worker_sem.Acquire();
+        }
+        // Update state after sleep
+        seen_state = atomic_state.Load();
+      }
+      // else loop and try again
+    }
+  }
+
+  void EmitterStateTransition(
+      /*in/out*/ ThreadState& thread_state,
+      /*in/out*/ uint32_t& slot) {
+    StateTransition<ThreadKind::kEmitter>(thread_state, slot);
+  }
+
+  void WorkerStateTransition(
+      /*in/out*/ ThreadState& thread_state,
+      /*in/out*/ uint32_t& slot) {
+    StateTransition<ThreadKind::kWorker>(thread_state, slot);
+  }
+
+  // Exactly wake all idling threads (for an end state)
+  void WakeAllIdle() {
+    State old_state, new_state;
+    auto transform =
+        IdleEmitFlag::ClearTransform() + IdleWorkerCount::ClearTransform();
+    atomic_state.Apply(transform, &old_state, &new_state);
+    assert(new_state.Get<IdleEmitFlag>() == false);
+    assert(new_state.Get<IdleWorkerCount>() == 0);
+    if (old_state.Get<IdleEmitFlag>()) {
+      idle_emit_sem.Release();
+    }
+    idle_worker_sem.Release(old_state.Get<IdleWorkerCount>());
+  }
+
+  // Called by emit thread if it is decided no more blocks will be emitted into
+  // this SST file.
+  void SetNoMoreToEmit(/*in/out*/ ThreadState& thread_state,
+                       /*in/out*/ uint32_t& slot) {
+    (void)slot;
+    State old_state;
+    atomic_state.Apply(NoMoreToEmitFlag::SetTransform(), &old_state);
+    assert(old_state.Get<NoMoreToEmitFlag>() == false);
+    assert(slot == BitwiseAnd(old_state.Get<NextToEmit>(), ring_buffer_mask));
+    assert(thread_state == ThreadState::kEmitting);
+    thread_state = ThreadState::kEnd;
+    WakeAllIdle();
+  }
+
+  // Called by any thread to abort parallel compression, etc. because of an
+  // error.
+  void SetAbort(/*in/out*/ ThreadState& thread_state) {
+    State old_state;
+    atomic_state.Apply(AbortFlag::SetTransform(), &old_state);
+    if (old_state.Get<AbortFlag>() == false) {
+      // First to set abort. Wake all workers and emitter
+      WakeAllIdle();
+    }
+    thread_state = ThreadState::kEnd;
+  }
+
+#ifdef BBTB_PC_WATCHDOG
+  // Logic for the extra "watchdog" thread in DEBUG builds that heuristically
+  // checks for the most likely deadlock conditions.
+  //
+  // Some ways to manually validate the watchdog:
+  // * Insert
+  //      if (Random::GetTLSInstance()->OneIn(100)) {
+  //        sleep(100);
+  //      }
+  //   after either of the calls to semaphore Acquire above.
+  // * Miss some Release()s in WakeAllIdle()
+  //
+  // and run table_test unit tests.
+  void BGWatchdog() {
+    int count_toward_deadlock_judgment = 0;
+    for (;;) {
+      // Check for termination condition: All workers and emit thread have
+      // completed.
+      if (live_workers.LoadRelaxed() == 0 && live_emit.LoadRelaxed() == false) {
+        return;
+      }
+
+      // Check for potential deadlock condition
+      if (idling_workers.LoadRelaxed() < live_workers.LoadRelaxed() ||
+          (live_emit.LoadRelaxed() && !idling_emit.LoadRelaxed())) {
+        // Someone is working, all good
+        count_toward_deadlock_judgment = 0;
+      } else {
+        // Could be a deadlock state, but could also be a transient
+        // state where someone has woken up but not cleared their idling flag.
+        // Give it plenty of time and watchdog thread wake-ups before
+        // declaring deadlock.
+        count_toward_deadlock_judgment++;
+        if (count_toward_deadlock_judgment >= 70) {
+          fprintf(stderr,
+                  "Error: apparent deadlock in parallel compression. "
+                  "Aborting. %u / %u, %d / %d, %llx\n",
+                  (unsigned)idling_workers.LoadRelaxed(),
+                  (unsigned)live_workers.LoadRelaxed(),
+                  (int)idling_emit.LoadRelaxed(), (int)live_emit.LoadRelaxed(),
+                  (long long)atomic_state.Load().underlying);
+          std::terminate();
+        }
+      }
+
+      // Sleep for 1s at a time unless we are woken up because other threads
+      // ended.
+      std::unique_lock<std::mutex> lock(watchdog_mutex);
+      if (!shutdown_watchdog) {
+        watchdog_cv.wait_for(lock, std::chrono::seconds{1});
+      }
+    }
+  }
+#endif  // BBTB_PC_WATCHDOG
+};
+
 struct BlockBasedTableBuilder::Rep {
   const ImmutableOptions ioptions;
   // BEGIN from MutableCFOptions
@@ -291,7 +802,9 @@ struct BlockBasedTableBuilder::Rep {
   // user key should contain the minimum timestamp.
   bool persist_user_defined_timestamps;
   WritableFileWriter* file;
-  std::atomic<uint64_t> offset;
+  // The current offset is only written by the current designated writer thread
+  // but can be read by other threads to estimate current file size
+  RelaxedAtomic<uint64_t> offset{0};
   size_t alignment;
   BlockBuilder data_block;
   // Buffers uncompressed data blocks to replay later. Needed when
@@ -306,19 +819,59 @@ struct BlockBasedTableBuilder::Rep {
   PartitionedIndexBuilder* p_index_builder_ = nullptr;
 
   std::string last_ikey;  // Internal key or empty (unset)
-  const Slice* first_key_in_next_block = nullptr;
-  CompressionType compression_type;
+  bool warm_cache = false;
+  bool uses_explicit_compression_manager = false;
+
   uint64_t sample_for_compression;
-  std::atomic<uint64_t> compressible_input_data_bytes;
-  std::atomic<uint64_t> uncompressible_input_data_bytes;
-  std::atomic<uint64_t> sampled_input_data_bytes;
-  std::atomic<uint64_t> sampled_output_slow_data_bytes;
-  std::atomic<uint64_t> sampled_output_fast_data_bytes;
-  CompressionOptions compression_opts;
-  std::unique_ptr<CompressionDict> compression_dict;
-  std::vector<std::unique_ptr<CompressionContext>> compression_ctxs;
-  std::vector<std::unique_ptr<UncompressionContext>> verify_ctxs;
-  std::unique_ptr<UncompressionDict> verify_dict;
+  RelaxedAtomic<uint64_t> compressible_input_data_bytes{0};
+  RelaxedAtomic<uint64_t> uncompressible_input_data_bytes{0};
+  RelaxedAtomic<uint64_t> sampled_input_data_bytes{0};
+  RelaxedAtomic<uint64_t> sampled_output_slow_data_bytes{0};
+  RelaxedAtomic<uint64_t> sampled_output_fast_data_bytes{0};
+  uint32_t compression_parallel_threads;
+  int max_compressed_bytes_per_kb;
+  // Dictionary guidance for data blocks (from GetDictGuidance())
+  Compressor::DictConfig data_block_dict_guidance;
+
+  // *** Compressors & decompressors - Yes, it seems like a lot here but ***
+  // *** these are distinct fields to minimize extra conditionals and    ***
+  // *** field reads on hot code paths. And to avoid interlocked         ***
+  // *** instructions associated with shared_ptr.                        ***
+
+  // A compressor for blocks in general, without dictionary compression
+  std::unique_ptr<Compressor> basic_compressor;
+  // Built-in compressors for compression size sampling
+  std::unique_ptr<Compressor> fast_sample_compressor;
+  std::unique_ptr<Compressor> slow_sample_compressor;
+  // A compressor for data blocks, which might be tuned differently and might
+  // use dictionary compression (when applicable). See ~Rep() for some details.
+  UnownedPtr<Compressor> data_block_compressor = nullptr;
+  // A compressor for index blocks, which might be tuned differently from
+  // basic_compressor. See ~Rep() for some details.
+  UnownedPtr<Compressor> index_block_compressor = nullptr;
+  // A decompressor corresponding to basic_compressor (when non-nullptr).
+  // Used for verification and cache warming.
+  std::shared_ptr<Decompressor> basic_decompressor;
+  // When needed, a decompressor for verifying compression using a
+  // dictionary sampled/trained from this file.
+  std::unique_ptr<Decompressor> verify_decompressor_with_dict;
+  // When non-nullptr, compression should be verified with this corresponding
+  // decompressor, except for data blocks. (Points to same as basic_decompressor
+  // when verify_compression is set.)
+  UnownedPtr<Decompressor> verify_decompressor;
+  // Once configured/determined, points to one of the above Decompressors to use
+  // in verifying data blocks.
+  UnownedPtr<Decompressor> data_block_verify_decompressor;
+
+  // Set of compression types used for blocks in this file (mixing compression
+  // algorithms in a single file is allowed, using a CompressionManager)
+  SmallEnumSet<CompressionType, kDisableCompressionOption>
+      compression_types_used;
+
+  // Working area for basic_compressor when compression_parallel_threads==1
+  WorkingAreaPair index_block_working_area;
+  // Working area for data_block_compressor, for emit/compaction thread
+  WorkingAreaPair data_block_working_area;
 
   size_t data_begin_offset = 0;
 
@@ -347,103 +900,119 @@ struct BlockBasedTableBuilder::Rep {
     kUnbuffered,
     kClosed,
   };
-  State state;
+  State state = State::kUnbuffered;
   // `kBuffered` state is allowed only as long as the buffering of uncompressed
   // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`.
-  uint64_t buffer_limit;
+  uint64_t buffer_limit = 0;
   std::shared_ptr<CacheReservationManager>
       compression_dict_buffer_cache_res_mgr;
   const bool use_delta_encoding_for_index_values;
   std::unique_ptr<FilterBlockBuilder> filter_builder;
   OffsetableCacheKey base_cache_key;
   const TableFileCreationReason reason;
+  const bool target_file_size_is_upper_bound;
 
   BlockHandle pending_handle;  // Handle to add to index block
 
-  std::string compressed_output;
+  GrowableBuffer single_threaded_compressed_output;
   std::unique_ptr<FlushBlockPolicy> flush_block_policy;
 
   std::vector<std::unique_ptr<InternalTblPropColl>> table_properties_collectors;
 
   std::unique_ptr<ParallelCompressionRep> pc_rep;
+  RelaxedAtomic<uint64_t> worker_cpu_micros{0};
   BlockCreateContext create_context;
 
   // The size of the "tail" part of a SST file. "Tail" refers to
   // all blocks after data blocks till the end of the SST file.
   uint64_t tail_size;
 
+  // The total size of all blocks in this file before they are compressed.
+  // This is used for logging compaction stats.
+  uint64_t pre_compression_size = 0;
+
   // See class Footer
   uint32_t base_context_checksum;
 
-  uint64_t get_offset() { return offset.load(std::memory_order_relaxed); }
-  void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); }
+  uint64_t get_offset() { return offset.LoadRelaxed(); }
+  void set_offset(uint64_t o) { offset.StoreRelaxed(o); }
 
-  bool IsParallelCompressionEnabled() const {
-    return compression_opts.parallel_threads > 1;
-  }
+  bool IsParallelCompressionActive() const { return pc_rep != nullptr; }
 
-  Status GetStatus() {
-    // We need to make modifications of status visible when status_ok is set
-    // to false, and this is ensured by status_mutex, so no special memory
-    // order for status_ok is required.
-    if (status_ok.load(std::memory_order_relaxed)) {
-      return Status::OK();
-    } else {
-      return CopyStatus();
-    }
-  }
+  Status GetStatus() { return GetIOStatus(); }
 
-  Status CopyStatus() {
-    std::lock_guard<std::mutex> lock(status_mutex);
-    return status;
+  bool StatusOk() {
+    // The OK case is optimized with an atomic. Relaxed is sufficient because
+    // if a thread other than the emit/compaction thread sets to non-OK it
+    // will synchronize that in aborting parallel compression.
+    bool ok = io_status_ok.LoadRelaxed();
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    if (ok) {
+      std::lock_guard<std::mutex> lock(io_status_mutex);
+      // Double-check
+      if (io_status_ok.LoadRelaxed()) {
+        io_status.PermitUncheckedError();
+        assert(io_status.ok());
+      } else {
+        ok = false;
+      }
+    }
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+    return ok;
   }
 
   IOStatus GetIOStatus() {
-    // We need to make modifications of io_status visible when status_ok is set
-    // to false, and this is ensured by io_status_mutex, so no special memory
-    // order for io_status_ok is required.
-    if (io_status_ok.load(std::memory_order_relaxed)) {
-#ifdef ROCKSDB_ASSERT_STATUS_CHECKED  // Avoid unnecessary lock acquisition
-      auto ios = CopyIOStatus();
-      ios.PermitUncheckedError();
-      // Assume no races in unit tests
-      assert(ios.ok());
+    // See StatusOk, which is optimized to avoid Status object copies
+    if (LIKELY(io_status_ok.LoadRelaxed())) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+      std::lock_guard<std::mutex> lock(io_status_mutex);
+      // Double-check
+      if (io_status_ok.LoadRelaxed()) {
+        io_status.PermitUncheckedError();
+        assert(io_status.ok());
+      } else {
+        return io_status;
+      }
 #endif  // ROCKSDB_ASSERT_STATUS_CHECKED
       return IOStatus::OK();
     } else {
-      return CopyIOStatus();
+      std::lock_guard<std::mutex> lock(io_status_mutex);
+      return io_status;
     }
   }
 
-  IOStatus CopyIOStatus() {
-    std::lock_guard<std::mutex> lock(io_status_mutex);
-    return io_status;
+  // Avoid copying Status and IOStatus objects as much as possible.
+  // Never erase an existing I/O status that is not OK.
+  void SetStatus(Status&& s) {
+    if (UNLIKELY(!s.ok()) && io_status_ok.LoadRelaxed()) {
+      SetFailedIOStatus(status_to_io_status(std::move(s)));
+    }
   }
-
-  // Never erase an existing status that is not OK.
-  void SetStatus(Status s) {
-    if (!s.ok() && status_ok.load(std::memory_order_relaxed)) {
-      // Locking is an overkill for non compression_opts.parallel_threads
-      // case but since it's unlikely that s is not OK, we take this cost
-      // to be simplicity.
-      std::lock_guard<std::mutex> lock(status_mutex);
-      status = s;
-      status_ok.store(false, std::memory_order_relaxed);
+  void SetStatus(const Status& s) {
+    if (UNLIKELY(!s.ok()) && io_status_ok.LoadRelaxed()) {
+      SetFailedIOStatus(status_to_io_status(Status(s)));
+    }
+  }
+  void SetIOStatus(IOStatus&& ios) {
+    if (UNLIKELY(!ios.ok()) && io_status_ok.LoadRelaxed()) {
+      SetFailedIOStatus(std::move(ios));
+    }
+  }
+  void SetIOStatus(const IOStatus& ios) {
+    if (UNLIKELY(!ios.ok()) && io_status_ok.LoadRelaxed()) {
+      SetFailedIOStatus(IOStatus(ios));
     }
   }
 
-  // Never erase an existing I/O status that is not OK.
-  // Calling this will also SetStatus(ios)
-  void SetIOStatus(IOStatus ios) {
-    if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) {
-      // Locking is an overkill for non compression_opts.parallel_threads
-      // case but since it's unlikely that s is not OK, we take this cost
-      // to be simplicity.
-      std::lock_guard<std::mutex> lock(io_status_mutex);
-      io_status = ios;
-      io_status_ok.store(false, std::memory_order_relaxed);
+  void SetFailedIOStatus(IOStatus&& ios) {
+    assert(!ios.ok());
+    // Because !s.ok() is rare, locking is acceptable even in non-parallel case.
+    std::lock_guard<std::mutex> lock(io_status_mutex);
+    // Double-check
+    if (io_status.ok()) {
+      io_status = std::move(ios);
+      io_status_ok.StoreRelaxed(false);
     }
-    SetStatus(ios);
   }
 
   Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo,
@@ -457,7 +1026,6 @@ struct BlockBasedTableBuilder::Rep {
         persist_user_defined_timestamps(
             tbo.ioptions.persist_user_defined_timestamps),
         file(f),
-        offset(0),
         alignment(table_options.block_align
                       ? std::min(static_cast<size_t>(table_options.block_size),
                                  kDefaultPageSize)
@@ -478,45 +1046,166 @@ struct BlockBasedTableBuilder::Rep {
             0.75 /* data_block_hash_table_util_ratio */, ts_sz,
             persist_user_defined_timestamps),
         internal_prefix_transform(prefix_extractor.get()),
-        compression_type(tbo.compression_type),
         sample_for_compression(tbo.moptions.sample_for_compression),
-        compressible_input_data_bytes(0),
-        uncompressible_input_data_bytes(0),
-        sampled_input_data_bytes(0),
-        sampled_output_slow_data_bytes(0),
-        sampled_output_fast_data_bytes(0),
-        compression_opts(tbo.compression_opts),
-        compression_dict(),
-        compression_ctxs(tbo.compression_opts.parallel_threads),
-        verify_ctxs(tbo.compression_opts.parallel_threads),
-        verify_dict(),
-        state((tbo.compression_opts.max_dict_bytes > 0 &&
-               tbo.compression_type != kNoCompression)
-                  ? State::kBuffered
-                  : State::kUnbuffered),
+        compression_parallel_threads(
+            ((table_opt.partition_filters &&
+              !table_opt.decouple_partitioned_filters) ||
+             table_options.user_defined_index_factory)
+                ? uint32_t{1}
+                : tbo.compression_opts.parallel_threads),
+        max_compressed_bytes_per_kb(
+            tbo.compression_opts.max_compressed_bytes_per_kb),
         use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
                                             !table_opt.block_align),
         reason(tbo.reason),
+        target_file_size_is_upper_bound(
+            tbo.moptions.target_file_size_is_upper_bound),
         flush_block_policy(
             table_options.flush_block_policy_factory->NewFlushBlockPolicy(
                 table_options, data_block)),
         create_context(&table_options, &ioptions, ioptions.stats,
-                       compression_type == kZSTD,
+                       /*decompressor=*/nullptr,
                        tbo.moptions.block_protection_bytes_per_key,
                        tbo.internal_comparator.user_comparator(),
                        !use_delta_encoding_for_index_values,
                        table_opt.index_type ==
                            BlockBasedTableOptions::kBinarySearchWithFirstKey),
-        tail_size(0),
-        status_ok(true),
-        io_status_ok(true) {
-    if (tbo.target_file_size == 0) {
-      buffer_limit = compression_opts.max_dict_buffer_bytes;
-    } else if (compression_opts.max_dict_buffer_bytes == 0) {
-      buffer_limit = tbo.target_file_size;
+        tail_size(0) {
+    FilterBuildingContext filter_context(table_options);
+
+    filter_context.info_log = ioptions.logger;
+    filter_context.column_family_name = tbo.column_family_name;
+    filter_context.reason = reason;
+
+    // Only populate other fields if known to be in LSM rather than
+    // generating external SST file
+    if (reason != TableFileCreationReason::kMisc) {
+      filter_context.compaction_style = ioptions.compaction_style;
+      filter_context.num_levels = ioptions.num_levels;
+      filter_context.level_at_creation = tbo.level_at_creation;
+      filter_context.is_bottommost = tbo.is_bottommost;
+      assert(filter_context.level_at_creation < filter_context.num_levels);
+    }
+
+    props.compression_options =
+        CompressionOptionsToString(tbo.compression_opts);
+
+    auto* mgr = tbo.moptions.compression_manager.get();
+    if (mgr == nullptr) {
+      uses_explicit_compression_manager = false;
+      mgr = GetBuiltinV2CompressionManager().get();
     } else {
-      buffer_limit = std::min(tbo.target_file_size,
-                              compression_opts.max_dict_buffer_bytes);
+      uses_explicit_compression_manager = true;
+
+      // Stuff some extra debugging info as extra pseudo-options. Using
+      // underscore prefix to indicate they are special.
+      props.compression_options.append("_compression_manager=");
+      props.compression_options.append(mgr->GetId());
+      props.compression_options.append("; ");
+    }
+
+    // Sanitize to only allowing compression when it saves space.
+    max_compressed_bytes_per_kb =
+        std::min(int{1023}, tbo.compression_opts.max_compressed_bytes_per_kb);
+
+    basic_compressor = mgr->GetCompressorForSST(
+        filter_context, tbo.compression_opts, tbo.compression_type);
+    if (basic_compressor) {
+      if (table_options.enable_index_compression) {
+        index_block_compressor = MaybeCloneSpecialized(
+            basic_compressor.get(), CacheEntryRole::kIndexBlock);
+        index_block_working_area.compress =
+            index_block_compressor->ObtainWorkingArea();
+      }
+      data_block_dict_guidance =
+          basic_compressor->GetDictGuidance(CacheEntryRole::kDataBlock);
+      if (auto* sampling =
+              std::get_if<Compressor::DictSampling>(&data_block_dict_guidance);
+          sampling != nullptr && sampling->max_sample_bytes > 0) {
+        // Sampling mode: collect samples up to max_sample_bytes
+        state = State::kBuffered;
+        if (tbo.target_file_size == 0) {
+          buffer_limit = tbo.compression_opts.max_dict_buffer_bytes;
+        } else if (tbo.compression_opts.max_dict_buffer_bytes == 0) {
+          buffer_limit = tbo.target_file_size;
+        } else {
+          buffer_limit = std::min(tbo.target_file_size,
+                                  tbo.compression_opts.max_dict_buffer_bytes);
+        }
+      } else if (auto* predef = std::get_if<Compressor::DictPreDefined>(
+                     &data_block_dict_guidance);
+                 predef != nullptr && !predef->dict_data.empty()) {
+        // Pre-defined dictionary mode: use it immediately, no buffering
+        data_block_compressor = MaybeCloneSpecialized(
+            basic_compressor.get(), CacheEntryRole::kDataBlock,
+            Compressor::DictPreDefined{std::string{predef->dict_data}});
+        data_block_working_area.compress =
+            data_block_compressor->ObtainWorkingArea();
+      } else {
+        assert(std::holds_alternative<Compressor::DictSampling>(
+                   data_block_dict_guidance) ||
+               std::holds_alternative<Compressor::DictPreDefined>(
+                   data_block_dict_guidance) ||
+               std::holds_alternative<Compressor::DictDisabled>(
+                   data_block_dict_guidance));
+        // No distinct data block compressor using dictionary, but
+        // implementation might still want to specialize for data blocks
+        data_block_compressor = MaybeCloneSpecialized(
+            basic_compressor.get(), CacheEntryRole::kDataBlock);
+        data_block_working_area.compress =
+            data_block_compressor->ObtainWorkingArea();
+      }
+      basic_decompressor = basic_compressor->GetOptimizedDecompressor();
+      if (basic_decompressor == nullptr) {
+        // Optimized version not available
+        basic_decompressor = mgr->GetDecompressor();
+      }
+      create_context.decompressor = basic_decompressor.get();
+
+      if (table_options.verify_compression) {
+        verify_decompressor = basic_decompressor.get();
+        if (table_options.enable_index_compression) {
+          index_block_working_area.verify =
+              verify_decompressor->ObtainWorkingArea(
+                  index_block_compressor->GetPreferredCompressionType());
+        }
+        if (state == State::kUnbuffered) {
+          assert(data_block_compressor);
+          data_block_verify_decompressor = verify_decompressor.get();
+          data_block_working_area.verify =
+              data_block_verify_decompressor->ObtainWorkingArea(
+                  data_block_compressor->GetPreferredCompressionType());
+        }
+      }
+    }
+
+    if (sample_for_compression > 0) {
+      auto builtin = GetBuiltinV2CompressionManager();
+      if (builtin->SupportsCompressionType(kLZ4Compression)) {
+        fast_sample_compressor = builtin->GetCompressor({}, kLZ4Compression);
+      } else if (builtin->SupportsCompressionType(kSnappyCompression)) {
+        fast_sample_compressor = builtin->GetCompressor({}, kSnappyCompression);
+      }
+      if (builtin->SupportsCompressionType(kZSTD)) {
+        slow_sample_compressor = builtin->GetCompressor({}, kZSTD);
+      } else if (builtin->SupportsCompressionType(kZlibCompression)) {
+        slow_sample_compressor = builtin->GetCompressor({}, kZlibCompression);
+      }
+      // NOTE: even if both sampling compressors are nullptr, we still populate
+      // the table properties with placeholder info
+    }
+
+    switch (table_options.prepopulate_block_cache) {
+      case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly:
+        warm_cache = (reason == TableFileCreationReason::kFlush);
+        break;
+      case BlockBasedTableOptions::PrepopulateBlockCache::kDisable:
+        warm_cache = false;
+        break;
+      default:
+        // missing case
+        assert(false);
+        warm_cache = false;
     }
 
     const auto compress_dict_build_buffer_charged =
@@ -536,11 +1225,6 @@ struct BlockBasedTableBuilder::Rep {
       compression_dict_buffer_cache_res_mgr = nullptr;
     }
 
-    assert(compression_ctxs.size() >= compression_opts.parallel_threads);
-    for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) {
-      compression_ctxs[i].reset(
-          new CompressionContext(compression_type, compression_opts));
-    }
     if (table_options.index_type ==
         BlockBasedTableOptions::kTwoLevelIndexSearch) {
       p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder(
@@ -553,33 +1237,42 @@ struct BlockBasedTableBuilder::Rep {
           &this->internal_prefix_transform, use_delta_encoding_for_index_values,
           table_options, ts_sz, persist_user_defined_timestamps));
     }
+
+    // If user_defined_index_factory is provided, wrap the index builder with
+    // UserDefinedIndexWrapper
+    if (table_options.user_defined_index_factory != nullptr) {
+      if (tbo.moptions.compression_opts.parallel_threads > 1 ||
+          tbo.moptions.bottommost_compression_opts.parallel_threads > 1) {
+        SetStatus(
+            Status::InvalidArgument("user_defined_index_factory not supported "
+                                    "with parallel compression"));
+      } else {
+        std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder;
+        UserDefinedIndexOption udi_options;
+        udi_options.comparator = internal_comparator.user_comparator();
+        auto s = table_options.user_defined_index_factory->NewBuilder(
+            udi_options, user_defined_index_builder);
+        if (!s.ok()) {
+          SetStatus(s);
+        } else {
+          if (user_defined_index_builder != nullptr) {
+            index_builder = std::make_unique<UserDefinedIndexBuilderWrapper>(
+                std::string(table_options.user_defined_index_factory->Name()),
+                std::move(index_builder), std::move(user_defined_index_builder),
+                &internal_comparator, ts_sz, persist_user_defined_timestamps);
+          }
+        }
+      }
+    }
+
     if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) {
       // Apply optimize_filters_for_hits setting here when applicable by
       // skipping filter generation
       filter_builder.reset();
-    } else if (tbo.skip_filters) {
-      // For SstFileWriter skip_filters
-      filter_builder.reset();
     } else if (!table_options.filter_policy) {
       // Null filter_policy -> no filter
       filter_builder.reset();
     } else {
-      FilterBuildingContext filter_context(table_options);
-
-      filter_context.info_log = ioptions.logger;
-      filter_context.column_family_name = tbo.column_family_name;
-      filter_context.reason = reason;
-
-      // Only populate other fields if known to be in LSM rather than
-      // generating external SST file
-      if (reason != TableFileCreationReason::kMisc) {
-        filter_context.compaction_style = ioptions.compaction_style;
-        filter_context.num_levels = ioptions.num_levels;
-        filter_context.level_at_creation = tbo.level_at_creation;
-        filter_context.is_bottommost = tbo.is_bottommost;
-        assert(filter_context.level_at_creation < filter_context.num_levels);
-      }
-
       filter_builder.reset(CreateFilterBlockBuilder(
           ioptions, tbo.moptions, filter_context,
           use_delta_encoding_for_index_values, p_index_builder_, ts_sz,
@@ -600,20 +1293,15 @@ struct BlockBasedTableBuilder::Rep {
       }
     }
     table_properties_collectors.emplace_back(
-        new BlockBasedTablePropertiesCollector(
+        std::make_unique<BlockBasedTablePropertiesCollector>(
             table_options.index_type, table_options.whole_key_filtering,
             prefix_extractor != nullptr,
             table_options.decouple_partitioned_filters));
     if (ts_sz > 0 && persist_user_defined_timestamps) {
       table_properties_collectors.emplace_back(
-          new TimestampTablePropertiesCollector(
+          std::make_unique<TimestampTablePropertiesCollector>(
               tbo.internal_comparator.user_comparator()));
     }
-    if (table_options.verify_compression) {
-      for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) {
-        verify_ctxs[i].reset(new UncompressionContext(compression_type));
-      }
-    }
 
     // These are only needed for populating table properties
     props.column_family_id = tbo.column_family_id;
@@ -632,6 +1320,9 @@ struct BlockBasedTableBuilder::Rep {
     // Default is UINT64_MAX for unknown. Setting it to 0 here
     // to allow updating it by taking max in BlockBasedTableBuilder::Add().
     props.key_largest_seqno = 0;
+    // Default is UINT64_MAX for unknown.
+    props.key_smallest_seqno = UINT64_MAX;
+    PrePopulateCompressionProperties(mgr);
 
     if (FormatVersionUsesContextChecksum(table_options.format_version)) {
       // Must be non-zero and semi- or quasi-random
@@ -644,7 +1335,7 @@ struct BlockBasedTableBuilder::Rep {
       base_context_checksum = 0;
     }
 
-    if (alignment > 0 && compression_type != kNoCompression) {
+    if (alignment > 0 && basic_compressor) {
       // With better sanitization in `CompactionPicker::CompactFiles()`, we
       // would not need to handle this case here and could change it to an
       // assertion instead.
@@ -653,347 +1344,129 @@ struct BlockBasedTableBuilder::Rep {
     }
   }
 
-  Rep(const Rep&) = delete;
-  Rep& operator=(const Rep&) = delete;
-
- private:
-  // Synchronize status & io_status accesses across threads from main thread,
-  // compression thread and write thread in parallel compression.
-  std::mutex status_mutex;
-  std::atomic<bool> status_ok;
-  Status status;
-  std::mutex io_status_mutex;
-  std::atomic<bool> io_status_ok;
-  IOStatus io_status;
-};
-
-struct BlockBasedTableBuilder::ParallelCompressionRep {
-  // TODO: consider replacing with autovector or similar
-  // Keys is a wrapper of vector of strings avoiding
-  // releasing string memories during vector clear()
-  // in order to save memory allocation overhead
-  class Keys {
-   public:
-    Keys() : keys_(kKeysInitSize), size_(0) {}
-    void PushBack(const Slice& key) {
-      if (size_ == keys_.size()) {
-        keys_.emplace_back(key.data(), key.size());
-      } else {
-        keys_[size_].assign(key.data(), key.size());
-      }
-      size_++;
-    }
-    void SwapAssign(std::vector<std::string>& keys) {
-      size_ = keys.size();
-      std::swap(keys_, keys);
-    }
-    void Clear() { size_ = 0; }
-    size_t Size() { return size_; }
-    std::string& Back() { return keys_[size_ - 1]; }
-    std::string& operator[](size_t idx) {
-      assert(idx < size_);
-      return keys_[idx];
-    }
-
-   private:
-    const size_t kKeysInitSize = 32;
-    std::vector<std::string> keys_;
-    size_t size_;
-  };
-  std::unique_ptr<Keys> curr_block_keys;
-
-  class BlockRepSlot;
-
-  // BlockRep instances are fetched from and recycled to
-  // block_rep_pool during parallel compression.
-  struct BlockRep {
-    Slice contents;
-    Slice compressed_contents;
-    std::unique_ptr<std::string> data;
-    std::unique_ptr<std::string> compressed_data;
-    CompressionType compression_type;
-    std::unique_ptr<std::string> first_key_in_next_block;
-    std::unique_ptr<Keys> keys;
-    std::unique_ptr<BlockRepSlot> slot;
-    Status status;
-  };
-  // Use a vector of BlockRep as a buffer for a determined number
-  // of BlockRep structures. All data referenced by pointers in
-  // BlockRep will be freed when this vector is destructed.
-  using BlockRepBuffer = std::vector<BlockRep>;
-  BlockRepBuffer block_rep_buf;
-  // Use a thread-safe queue for concurrent access from block
-  // building thread and writer thread.
-  using BlockRepPool = WorkQueue<BlockRep*>;
-  BlockRepPool block_rep_pool;
-
-  // Use BlockRepSlot to keep block order in write thread.
-  // slot_ will pass references to BlockRep
-  class BlockRepSlot {
-   public:
-    BlockRepSlot() : slot_(1) {}
-    template <typename T>
-    void Fill(T&& rep) {
-      slot_.push(std::forward<T>(rep));
-    }
-    void Take(BlockRep*& rep) { slot_.pop(rep); }
-
-   private:
-    // slot_ will pass references to BlockRep in block_rep_buf,
-    // and those references are always valid before the destruction of
-    // block_rep_buf.
-    WorkQueue<BlockRep*> slot_;
-  };
-
-  // Compression queue will pass references to BlockRep in block_rep_buf,
-  // and those references are always valid before the destruction of
-  // block_rep_buf.
-  using CompressQueue = WorkQueue<BlockRep*>;
-  CompressQueue compress_queue;
-  std::vector<port::Thread> compress_thread_pool;
-
-  // Write queue will pass references to BlockRep::slot in block_rep_buf,
-  // and those references are always valid before the corresponding
-  // BlockRep::slot is destructed, which is before the destruction of
-  // block_rep_buf.
-  using WriteQueue = WorkQueue<BlockRepSlot*>;
-  WriteQueue write_queue;
-  std::unique_ptr<port::Thread> write_thread;
-
-  // Estimate output file size when parallel compression is enabled. This is
-  // necessary because compression & flush are no longer synchronized,
-  // and BlockBasedTableBuilder::FileSize() is no longer accurate.
-  // memory_order_relaxed suffices because accurate statistics is not required.
-  class FileSizeEstimator {
-   public:
-    explicit FileSizeEstimator()
-        : uncomp_bytes_compressed(0),
-          uncomp_bytes_curr_block(0),
-          uncomp_bytes_curr_block_set(false),
-          uncomp_bytes_inflight(0),
-          blocks_inflight(0),
-          curr_compression_ratio(0),
-          estimated_file_size(0) {}
-
-    // Estimate file size when a block is about to be emitted to
-    // compression thread
-    void EmitBlock(uint64_t uncomp_block_size, uint64_t curr_file_size) {
-      uint64_t new_uncomp_bytes_inflight =
-          uncomp_bytes_inflight.fetch_add(uncomp_block_size,
-                                          std::memory_order_relaxed) +
-          uncomp_block_size;
-
-      uint64_t new_blocks_inflight =
-          blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1;
-
-      estimated_file_size.store(
-          curr_file_size +
-              static_cast<uint64_t>(
-                  static_cast<double>(new_uncomp_bytes_inflight) *
-                  curr_compression_ratio.load(std::memory_order_relaxed)) +
-              new_blocks_inflight * kBlockTrailerSize,
-          std::memory_order_relaxed);
-    }
-
-    // Estimate file size when a block is already reaped from
-    // compression thread
-    void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) {
-      assert(uncomp_bytes_curr_block_set);
-
-      uint64_t new_uncomp_bytes_compressed =
-          uncomp_bytes_compressed + uncomp_bytes_curr_block;
-      assert(new_uncomp_bytes_compressed > 0);
-
-      curr_compression_ratio.store(
-          (curr_compression_ratio.load(std::memory_order_relaxed) *
-               uncomp_bytes_compressed +
-           compressed_block_size) /
-              static_cast<double>(new_uncomp_bytes_compressed),
-          std::memory_order_relaxed);
-      uncomp_bytes_compressed = new_uncomp_bytes_compressed;
-
-      uint64_t new_uncomp_bytes_inflight =
-          uncomp_bytes_inflight.fetch_sub(uncomp_bytes_curr_block,
-                                          std::memory_order_relaxed) -
-          uncomp_bytes_curr_block;
-
-      uint64_t new_blocks_inflight =
-          blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1;
-
-      estimated_file_size.store(
-          curr_file_size +
-              static_cast<uint64_t>(
-                  static_cast<double>(new_uncomp_bytes_inflight) *
-                  curr_compression_ratio.load(std::memory_order_relaxed)) +
-              new_blocks_inflight * kBlockTrailerSize,
-          std::memory_order_relaxed);
-
-      uncomp_bytes_curr_block_set = false;
-    }
-
-    void SetEstimatedFileSize(uint64_t size) {
-      estimated_file_size.store(size, std::memory_order_relaxed);
-    }
-
-    uint64_t GetEstimatedFileSize() {
-      return estimated_file_size.load(std::memory_order_relaxed);
-    }
-
-    void SetCurrBlockUncompSize(uint64_t size) {
-      uncomp_bytes_curr_block = size;
-      uncomp_bytes_curr_block_set = true;
-    }
-
-   private:
-    // Input bytes compressed so far.
-    uint64_t uncomp_bytes_compressed;
-    // Size of current block being appended.
-    uint64_t uncomp_bytes_curr_block;
-    // Whether uncomp_bytes_curr_block has been set for next
-    // ReapBlock call.
-    bool uncomp_bytes_curr_block_set;
-    // Input bytes under compression and not appended yet.
-    std::atomic<uint64_t> uncomp_bytes_inflight;
-    // Number of blocks under compression and not appended yet.
-    std::atomic<uint64_t> blocks_inflight;
-    // Current compression ratio, maintained by BGWorkWriteMaybeCompressedBlock.
-    std::atomic<double> curr_compression_ratio;
-    // Estimated SST file size.
-    std::atomic<uint64_t> estimated_file_size;
-  };
-  FileSizeEstimator file_size_estimator;
-
-  // Facilities used for waiting first block completion. Need to Wait for
-  // the completion of first block compression and flush to get a non-zero
-  // compression ratio.
-  std::atomic<bool> first_block_processed;
-  std::condition_variable first_block_cond;
-  std::mutex first_block_mutex;
-
-  explicit ParallelCompressionRep(uint32_t parallel_threads)
-      : curr_block_keys(new Keys()),
-        block_rep_buf(parallel_threads),
-        block_rep_pool(parallel_threads),
-        compress_queue(parallel_threads),
-        write_queue(parallel_threads),
-        first_block_processed(false) {
-    for (uint32_t i = 0; i < parallel_threads; i++) {
-      block_rep_buf[i].contents = Slice();
-      block_rep_buf[i].compressed_contents = Slice();
-      block_rep_buf[i].data.reset(new std::string());
-      block_rep_buf[i].compressed_data.reset(new std::string());
-      block_rep_buf[i].compression_type = CompressionType();
-      block_rep_buf[i].first_key_in_next_block.reset(new std::string());
-      block_rep_buf[i].keys.reset(new Keys());
-      block_rep_buf[i].slot.reset(new BlockRepSlot());
-      block_rep_buf[i].status = Status::OK();
-      block_rep_pool.push(&block_rep_buf[i]);
-    }
-  }
-
-  ~ParallelCompressionRep() { block_rep_pool.finish(); }
-
-  // Make a block prepared to be emitted to compression thread
-  // Used in non-buffered mode
-  BlockRep* PrepareBlock(CompressionType compression_type,
-                         const Slice* first_key_in_next_block,
-                         BlockBuilder* data_block) {
-    BlockRep* block_rep =
-        PrepareBlockInternal(compression_type, first_key_in_next_block);
-    assert(block_rep != nullptr);
-    data_block->SwapAndReset(*(block_rep->data));
-    block_rep->contents = *(block_rep->data);
-    std::swap(block_rep->keys, curr_block_keys);
-    curr_block_keys->Clear();
-    return block_rep;
-  }
-
-  // Used in EnterUnbuffered
-  BlockRep* PrepareBlock(CompressionType compression_type,
-                         const Slice* first_key_in_next_block,
-                         std::string* data_block,
-                         std::vector<std::string>* keys) {
-    BlockRep* block_rep =
-        PrepareBlockInternal(compression_type, first_key_in_next_block);
-    assert(block_rep != nullptr);
-    std::swap(*(block_rep->data), *data_block);
-    block_rep->contents = *(block_rep->data);
-    block_rep->keys->SwapAssign(*keys);
-    return block_rep;
-  }
-
-  // Emit a block to compression thread
-  void EmitBlock(BlockRep* block_rep) {
-    assert(block_rep != nullptr);
-    assert(block_rep->status.ok());
-    if (!write_queue.push(block_rep->slot.get())) {
-      return;
-    }
-    if (!compress_queue.push(block_rep)) {
-      return;
+  ~Rep() {
+    // Delete working areas before their compressors.
+    index_block_working_area = {};
+    data_block_working_area = {};
+    // Must have been cleaned up by StopParallelCompression
+    assert(pc_rep == nullptr);
+    // Delete specialized compressors if they were distinct (avoiding extra
+    // fields and interlocked instructions with shared_ptr)
+    if (data_block_compressor.get() != basic_compressor.get()) {
+      delete data_block_compressor.get();
     }
-
-    if (!first_block_processed.load(std::memory_order_relaxed)) {
-      std::unique_lock<std::mutex> lock(first_block_mutex);
-      first_block_cond.wait(lock, [this] {
-        return first_block_processed.load(std::memory_order_relaxed);
-      });
+    if (index_block_compressor.get() != basic_compressor.get()) {
+      delete index_block_compressor.get();
     }
   }
 
-  // Reap a block from compression thread
-  void ReapBlock(BlockRep* block_rep) {
-    assert(block_rep != nullptr);
-    block_rep->compressed_data->clear();
-    block_rep_pool.push(block_rep);
+  Rep(const Rep&) = delete;
+  Rep& operator=(const Rep&) = delete;
 
-    if (!first_block_processed.load(std::memory_order_relaxed)) {
-      std::lock_guard<std::mutex> lock(first_block_mutex);
-      first_block_processed.store(true, std::memory_order_relaxed);
-      first_block_cond.notify_one();
+  void PrePopulateCompressionProperties(UnownedPtr<CompressionManager> mgr) {
+    if (FormatVersionUsesCompressionManagerName(table_options.format_version)) {
+      assert(mgr);
+      // Use newer compression_name property
+      props.compression_name.reserve(32);
+      // If compression is disabled, use empty manager name
+      if (basic_compressor) {
+        props.compression_name.append(mgr->CompatibilityName());
+      }
+      props.compression_name.push_back(';');
+      // Rest of property to be filled out at the end of building the file
+    } else {
+      // Use legacy compression_name property, populated at the end of
+      // building the file. Not compatible with compression managers using
+      // custom algorithms / compression types.
+      assert(
+          Slice(mgr->CompatibilityName())
+              .compare(GetBuiltinV2CompressionManager()->CompatibilityName()) ==
+          0);
+    }
+  }
+  void PostPopulateCompressionProperties() {
+    // Do not include "no compression" in the set. It's not really useful
+    // information whether there are any uncompressed blocks. Some kinds of
+    // blocks are never compressed anyway.
+    compression_types_used.Remove(kNoCompression);
+    size_t ctype_count = compression_types_used.count();
+
+    if (uses_explicit_compression_manager) {
+      // Stuff some extra debugging info as extra pseudo-options. Using
+      // underscore prefix to indicate they are special.
+      std::string& compression_options = props.compression_options;
+      compression_options.append("_compressor=");
+      compression_options.append(data_block_compressor
+                                     ? data_block_compressor->GetId()
+                                     : std::string{});
+      compression_options.append("; ");
+    } else {
+      // No explicit compression manager
+      assert(compression_types_used.count() <= 1);
+    }
+
+    std::string& compression_name = props.compression_name;
+    if (FormatVersionUsesCompressionManagerName(table_options.format_version)) {
+      // Fill in extended field of "compression name" property, which is the
+      // set of compression types used, sorted by unsigned byte and then hex
+      // encoded with two digits each (so that table properties are human
+      // readable).
+      assert(*compression_name.rbegin() == ';');
+      size_t pos = compression_name.size();
+      // Make space for the field contents
+      compression_name.append(ctype_count * 2, '\0');
+      char* ptr = compression_name.data() + pos;
+      // Populate the field contents
+      for (CompressionType t : compression_types_used) {
+        PutBaseChars<16>(&ptr, /*n=*/2, static_cast<unsigned char>(t),
+                         /*uppercase=*/true);
+      }
+      assert(ptr == compression_name.data() + pos + ctype_count * 2);
+      // Allow additional fields in the future
+      compression_name.push_back(';');
+    } else {
+      // Use legacy compression naming. To adhere to requirements described in
+      // TableProperties::compression_name, we might have to replace the name
+      // based on the legacy configured compression type.
+      assert(compression_name.empty());
+      if (ctype_count == 0) {
+        // We could get a slight performance boost in the reader by marking
+        // the file as "no compression" if compression is configured but
+        // consistently rejected, but that would give misleading info for
+        // debugging purposes. So instead we record the configured compression
+        // type, matching the historical behavior.
+        if (data_block_compressor) {
+          compression_name = CompressionTypeToString(
+              data_block_compressor->GetPreferredCompressionType());
+        } else {
+          assert(basic_compressor == nullptr);
+          compression_name = CompressionTypeToString(kNoCompression);
+        }
+      } else if (compression_types_used.Contains(kZSTD)) {
+        compression_name = CompressionTypeToString(kZSTD);
+      } else {
+        compression_name =
+            CompressionTypeToString(*compression_types_used.begin());
+      }
     }
   }
 
  private:
-  BlockRep* PrepareBlockInternal(CompressionType compression_type,
-                                 const Slice* first_key_in_next_block) {
-    BlockRep* block_rep = nullptr;
-    block_rep_pool.pop(block_rep);
-    assert(block_rep != nullptr);
-
-    assert(block_rep->data);
-
-    block_rep->compression_type = compression_type;
-
-    if (first_key_in_next_block == nullptr) {
-      block_rep->first_key_in_next_block.reset(nullptr);
-    } else {
-      block_rep->first_key_in_next_block->assign(
-          first_key_in_next_block->data(), first_key_in_next_block->size());
-    }
-
-    return block_rep;
-  }
+  // Synchronize io_status to be readable/writable across threads, but
+  // optimize for the OK case
+  std::mutex io_status_mutex;
+  RelaxedAtomic<bool> io_status_ok{true};
+  IOStatus io_status;
 };
 
 BlockBasedTableBuilder::BlockBasedTableBuilder(
     const BlockBasedTableOptions& table_options, const TableBuilderOptions& tbo,
     WritableFileWriter* file) {
   BlockBasedTableOptions sanitized_table_options(table_options);
-  if (sanitized_table_options.format_version == 0 &&
-      sanitized_table_options.checksum != kCRC32c) {
-    ROCKS_LOG_WARN(
-        tbo.ioptions.logger,
-        "Silently converting format_version to 1 because checksum is "
-        "non-default");
-    // silently convert format_version to 1 to keep consistent with current
-    // behavior
-    sanitized_table_options.format_version = 1;
-  }
   auto ucmp = tbo.internal_comparator.user_comparator();
   assert(ucmp);
   (void)ucmp;  // avoids unused variable error.
-  rep_ = new Rep(sanitized_table_options, tbo, file);
+  rep_ = std::make_unique<Rep>(sanitized_table_options, tbo, file);
 
   TEST_SYNC_POINT_CALLBACK(
       "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey",
@@ -1002,92 +1475,58 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
   BlockBasedTable::SetupBaseCacheKey(&rep_->props, tbo.db_session_id,
                                      tbo.cur_file_num, &rep_->base_cache_key);
 
-  if (rep_->IsParallelCompressionEnabled()) {
-    StartParallelCompression();
+  MaybeStartParallelCompression();
+  if (!rep_->IsParallelCompressionActive() && rep_->basic_compressor) {
+    rep_->single_threaded_compressed_output.ResetForSize(
+        table_options.block_size);
   }
 }
 
 BlockBasedTableBuilder::~BlockBasedTableBuilder() {
   // Catch errors where caller forgot to call Finish()
   assert(rep_->state == Rep::State::kClosed);
-  delete rep_;
 }
 
 void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
-  Rep* r = rep_;
+  Rep* r = rep_.get();
   assert(rep_->state != Rep::State::kClosed);
-  if (!ok()) {
+  if (UNLIKELY(!ok())) {
     return;
   }
   ValueType value_type;
   SequenceNumber seq;
   UnPackSequenceAndType(ExtractInternalKeyFooter(ikey), &seq, &value_type);
   r->props.key_largest_seqno = std::max(r->props.key_largest_seqno, seq);
+  r->props.key_smallest_seqno = std::min(r->props.key_smallest_seqno, seq);
   if (IsValueType(value_type)) {
 #ifndef NDEBUG
     if (r->props.num_entries > r->props.num_range_deletions) {
       assert(r->internal_comparator.Compare(ikey, Slice(r->last_ikey)) > 0);
     }
+    bool skip = false;
+    TEST_SYNC_POINT_CALLBACK("BlockBasedTableBuilder::Add::skip", (void*)&skip);
+    if (skip) {
+      return;
+    }
 #endif  // !NDEBUG
 
     auto should_flush = r->flush_block_policy->Update(ikey, value);
     if (should_flush) {
       assert(!r->data_block.empty());
-      r->first_key_in_next_block = &ikey;
-      Flush();
-      if (r->state == Rep::State::kBuffered) {
-        bool exceeds_buffer_limit =
-            (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit);
-        bool exceeds_global_block_cache_limit = false;
-
-        // Increase cache charging for the last buffered data block
-        // only if the block is not going to be unbuffered immediately
-        // and there exists a cache reservation manager
-        if (!exceeds_buffer_limit &&
-            r->compression_dict_buffer_cache_res_mgr != nullptr) {
-          Status s =
-              r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation(
-                  r->data_begin_offset);
-          exceeds_global_block_cache_limit = s.IsMemoryLimit();
-        }
-
-        if (exceeds_buffer_limit || exceeds_global_block_cache_limit) {
-          EnterUnbuffered();
-        }
-      }
-
-      // Add item to index block.
-      // We do not emit the index entry for a block until we have seen the
-      // first key for the next data block.  This allows us to use shorter
-      // keys in the index block.  For example, consider a block boundary
-      // between the keys "the quick brown fox" and "the who".  We can use
-      // "the r" as the key for the index block entry since it is >= all
-      // entries in the first block and < all entries in subsequent
-      // blocks.
-      if (ok() && r->state == Rep::State::kUnbuffered) {
-        if (r->IsParallelCompressionEnabled()) {
-          r->pc_rep->curr_block_keys->Clear();
-        } else {
-          r->index_builder->AddIndexEntry(r->last_ikey, &ikey,
-                                          r->pending_handle,
-                                          &r->index_separator_scratch);
-        }
-      }
+      Flush(/*first_key_in_next_block=*/&ikey);
     }
 
-    // Note: PartitionedFilterBlockBuilder requires key being added to filter
-    // builder after being added to index builder.
+    // Note: PartitionedFilterBlockBuilder with
+    // decouple_partitioned_filters=false requires key being added to filter
+    // builder after being added to and "finished" in the index builder, so
+    // forces no parallel compression (logic in Rep constructor).
     if (r->state == Rep::State::kUnbuffered) {
-      if (r->IsParallelCompressionEnabled()) {
-        r->pc_rep->curr_block_keys->PushBack(ikey);
-      } else {
-        if (r->filter_builder != nullptr) {
-          r->filter_builder->AddWithPrevKey(
-              ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz),
-              r->last_ikey.empty()
-                  ? Slice{}
-                  : ExtractUserKeyAndStripTimestamp(r->last_ikey, r->ts_sz));
-        }
+      if (r->filter_builder != nullptr) {
+        r->filter_builder->AddWithPrevKey(
+            ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz),
+            r->last_ikey.empty()
+                ? Slice{}
+                : ExtractUserKeyAndStripTimestamp(r->last_ikey, r->ts_sz));
       }
     }
 
@@ -1098,9 +1537,7 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
       // Buffered keys will be replayed from data_block_buffers during
       // `Finish()` once compression dictionary has been finalized.
     } else {
-      if (!r->IsParallelCompressionEnabled()) {
-        r->index_builder->OnKeyAdded(ikey);
-      }
+      r->index_builder->OnKeyAdded(ikey, value);
     }
     // TODO offset passed in is not accurate for parallel compression case
     NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
@@ -1147,214 +1584,405 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
   }
 }
 
-void BlockBasedTableBuilder::Flush() {
-  Rep* r = rep_;
+void BlockBasedTableBuilder::Flush(const Slice* first_key_in_next_block) {
+  Rep* r = rep_.get();
   assert(rep_->state != Rep::State::kClosed);
-  if (!ok()) {
+  if (UNLIKELY(!ok())) {
     return;
   }
   if (r->data_block.empty()) {
     return;
   }
-  if (r->IsParallelCompressionEnabled() &&
-      r->state == Rep::State::kUnbuffered) {
-    r->data_block.Finish();
-    ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock(
-        r->compression_type, r->first_key_in_next_block, &(r->data_block));
-    assert(block_rep != nullptr);
-    r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(),
-                                             r->get_offset());
-    r->pc_rep->EmitBlock(block_rep);
+  Slice uncompressed_block_data = r->data_block.Finish();
+
+  // NOTE: compression sampling is done here in the same thread as building
+  // the uncompressed block because of the requirements to call table
+  // property collectors:
+  // * BlockAdd function expects block_compressed_bytes_{fast,slow} for
+  //   historical reasons. Probably a hassle to remove.
+  // * Collector is not thread safe so calls need to be
+  // serialized/synchronized.
+  // * Ideally, AddUserKey and BlockAdd calls need to line up such that a
+  //   reported block corresponds to all the keys reported since the previous
+  //   block.
+
+  // If requested, we sample one in every N block with a
+  // fast and slow compression algorithm and report the stats.
+  // The users can use these stats to decide if it is worthwhile
+  // enabling compression and they also get a hint about which
+  // compression algorithm wil be beneficial.
+  if (r->sample_for_compression > 0 &&
+      Random::GetTLSInstance()->OneIn(
+          static_cast<int>(r->sample_for_compression))) {
+    GrowableBuffer sampled_output;
+    sampled_output.ResetForSize(uncompressed_block_data.size());
+    size_t fast_size = uncompressed_block_data.size();
+    size_t slow_size = uncompressed_block_data.size();
+
+    // Sampling with a fast compression algorithm
+    if (r->fast_sample_compressor) {
+      CompressionType result_type = kNoCompression;
+      Status s = r->fast_sample_compressor->CompressBlock(
+          uncompressed_block_data, sampled_output.data(), &fast_size,
+          &result_type, /*working_area=*/nullptr);
+      if (!s.ok() || result_type == kNoCompression) {
+        // For accounting, fall back on no compression
+        fast_size = uncompressed_block_data.size();
+      }
+    }
+
+    // Sampling with a slow but high-compression algorithm
+    if (r->slow_sample_compressor) {
+      CompressionType result_type = kNoCompression;
+      Status s = r->slow_sample_compressor->CompressBlock(
+          uncompressed_block_data, sampled_output.data(), &slow_size,
+          &result_type, /*working_area=*/nullptr);
+      if (!s.ok() || result_type == kNoCompression) {
+        // For accounting, fall back on no compression
+        slow_size = uncompressed_block_data.size();
+      }
+    }
+
+    // NOTE: Currently compression sampling is only enabled for data block.
+    r->sampled_input_data_bytes.FetchAddRelaxed(uncompressed_block_data.size());
+    r->sampled_output_slow_data_bytes.FetchAddRelaxed(slow_size);
+    r->sampled_output_fast_data_bytes.FetchAddRelaxed(fast_size);
+
+    NotifyCollectTableCollectorsOnBlockAdd(r->table_properties_collectors,
+                                           uncompressed_block_data.size(),
+                                           slow_size, fast_size);
   } else {
-    WriteBlock(&r->data_block, &r->pending_handle, BlockType::kData);
+    NotifyCollectTableCollectorsOnBlockAdd(
+        r->table_properties_collectors, uncompressed_block_data.size(),
+        0 /*block_compressed_bytes_slow*/, 0 /*block_compressed_bytes_fast*/);
   }
-}
 
-void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
-                                        BlockHandle* handle,
-                                        BlockType block_type) {
-  block->Finish();
-  std::string uncompressed_block_data;
-  uncompressed_block_data.reserve(rep_->table_options.block_size);
-  block->SwapAndReset(uncompressed_block_data);
   if (rep_->state == Rep::State::kBuffered) {
-    assert(block_type == BlockType::kData);
-    rep_->data_block_buffers.emplace_back(std::move(uncompressed_block_data));
-    rep_->data_begin_offset += rep_->data_block_buffers.back().size();
-    return;
+    std::string uncompressed_block_holder;
+    uncompressed_block_holder.reserve(rep_->table_options.block_size);
+    r->data_block.SwapAndReset(uncompressed_block_holder);
+    assert(uncompressed_block_data.size() == uncompressed_block_holder.size());
+    rep_->data_block_buffers.emplace_back(std::move(uncompressed_block_holder));
+    rep_->data_begin_offset += uncompressed_block_data.size();
+    MaybeEnterUnbuffered(first_key_in_next_block);
+  } else {
+    // Increment num_data_blocks when a data block is finalized in the
+    // emit thread to avoid data races with write worker threads
+    ++r->props.num_data_blocks;
+
+    // Notify filter builder that a data block has been finalized
+    // This must happen on the emit thread before the block is added to the
+    // ring buffer to avoid race conditions with worker threads
+    if (r->filter_builder) {
+      r->filter_builder->OnDataBlockFinalized(r->props.num_data_blocks);
+    }
+
+    if (r->IsParallelCompressionActive()) {
+      EmitBlockForParallel(r->data_block.MutableBuffer(), r->last_ikey,
+                           first_key_in_next_block);
+    } else {
+      EmitBlock(r->data_block.MutableBuffer(), r->last_ikey,
+                first_key_in_next_block);
+    }
+    r->data_block.Reset();
+  }
+}
+
+void BlockBasedTableBuilder::EmitBlockForParallel(
+    std::string& uncompressed, const Slice& last_key_in_current_block,
+    const Slice* first_key_in_next_block) {
+  Rep* r = rep_.get();
+  assert(r->state == Rep::State::kUnbuffered);
+  assert(uncompressed.size() > 0);
+  auto& pc_rep = *r->pc_rep;
+  // Can emit the uncompressed block into the ring buffer
+  assert(pc_rep.emit_thread_state ==
+         ParallelCompressionRep::ThreadState::kEmitting);
+  auto* block_rep = &pc_rep.ring_buffer[pc_rep.emit_slot];
+  pc_rep.estimated_inflight_size.FetchAddRelaxed(uncompressed.size() +
+                                                 kBlockTrailerSize);
+  std::swap(uncompressed, block_rep->uncompressed);
+  r->index_builder->PrepareIndexEntry(last_key_in_current_block,
+                                      first_key_in_next_block,
+                                      block_rep->prepared_index_entry.get());
+  block_rep->compressed.Reset();
+  block_rep->compression_type = kNoCompression;
+
+  // Might need to take up some compression work before we are able to
+  // resume emitting the next uncompressed block.
+  for (;;) {
+    pc_rep.EmitterStateTransition(pc_rep.emit_thread_state, pc_rep.emit_slot);
+
+    if (pc_rep.emit_thread_state ==
+        ParallelCompressionRep::ThreadState::kCompressing) {
+      // Took up some compression work to help unblock ourself
+      block_rep = &pc_rep.ring_buffer[pc_rep.emit_slot];
+      Status s = CompressAndVerifyBlock(
+          block_rep->uncompressed, /*is_data_block=*/true,
+          r->data_block_working_area, &block_rep->compressed,
+          &block_rep->compression_type);
+      if (UNLIKELY(!s.ok())) {
+        r->SetStatus(s);
+        pc_rep.SetAbort(pc_rep.emit_thread_state);
+        break;
+      }
+    } else {
+      assert(pc_rep.emit_thread_state !=
+             ParallelCompressionRep::ThreadState::kCompressingAndWriting);
+      assert(pc_rep.emit_thread_state !=
+             ParallelCompressionRep::ThreadState::kWriting);
+      assert(pc_rep.emit_thread_state !=
+             ParallelCompressionRep::ThreadState::kIdle);
+      // Either emitting or end state.
+      // Detect nothing more to emit and set if so.
+      if (first_key_in_next_block == nullptr &&
+          pc_rep.emit_thread_state ==
+              ParallelCompressionRep::ThreadState::kEmitting) {
+        pc_rep.SetNoMoreToEmit(pc_rep.emit_thread_state, pc_rep.emit_slot);
+      }
+      break;
+    }
+  }
+}
+void BlockBasedTableBuilder::EmitBlock(std::string& uncompressed,
+                                       const Slice& last_key_in_current_block,
+                                       const Slice* first_key_in_next_block) {
+  Rep* r = rep_.get();
+  assert(r->state == Rep::State::kUnbuffered);
+  // Single-threaded context only
+  assert(!r->IsParallelCompressionActive());
+  assert(uncompressed.size() > 0);
+  // When data blocks are aligned with super block alignment, delta encoding
+  // needs to be skipped for the first block after padding.
+  bool skip_delta_encoding = false;
+  WriteBlock(uncompressed, &r->pending_handle, BlockType::kData,
+             &skip_delta_encoding);
+  if (LIKELY(ok())) {
+    // We do not emit the index entry for a block until we have seen the
+    // first key for the next data block.  This allows us to use shorter
+    // keys in the index block.  For example, consider a block boundary
+    // between the keys "the quick brown fox" and "the who".  We can use
+    // "the r" as the key for the index block entry since it is >= all
+    // entries in the first block and < all entries in subsequent
+    // blocks.
+    r->index_builder->AddIndexEntry(
+        last_key_in_current_block, first_key_in_next_block, r->pending_handle,
+        &r->index_separator_scratch, skip_delta_encoding);
   }
-  WriteBlock(uncompressed_block_data, handle, block_type);
 }
 
 void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
                                         BlockHandle* handle,
-                                        BlockType block_type) {
-  Rep* r = rep_;
+                                        BlockType block_type,
+                                        bool* skip_delta_encoding) {
+  Rep* r = rep_.get();
   assert(r->state == Rep::State::kUnbuffered);
-  Slice block_contents;
+  // Single-threaded context only
+  assert(!r->IsParallelCompressionActive());
   CompressionType type;
-  Status compress_status;
   bool is_data_block = block_type == BlockType::kData;
-  CompressAndVerifyBlock(uncompressed_block_data, is_data_block,
-                         *(r->compression_ctxs[0]), r->verify_ctxs[0].get(),
-                         &(r->compressed_output), &(block_contents), &type,
-                         &compress_status);
+  // NOTE: only index and data blocks are currently compressed
+  assert(is_data_block || block_type == BlockType::kIndex);
+  Status compress_status = CompressAndVerifyBlock(
+      uncompressed_block_data, is_data_block,
+      is_data_block ? r->data_block_working_area : r->index_block_working_area,
+      &r->single_threaded_compressed_output, &type);
   r->SetStatus(compress_status);
-  if (!ok()) {
+  if (UNLIKELY(!ok())) {
     return;
   }
 
   TEST_SYNC_POINT_CALLBACK(
       "BlockBasedTableBuilder::WriteBlock:TamperWithCompressedData",
-      &r->compressed_output);
-  WriteMaybeCompressedBlock(block_contents, type, handle, block_type,
-                            &uncompressed_block_data);
-  r->compressed_output.clear();
+      &r->single_threaded_compressed_output);
+  WriteMaybeCompressedBlock(
+      type == kNoCompression ? uncompressed_block_data
+                             : Slice(r->single_threaded_compressed_output),
+      type, handle, block_type, &uncompressed_block_data, skip_delta_encoding);
+  r->single_threaded_compressed_output.Reset();
   if (is_data_block) {
     r->props.data_size = r->get_offset();
-    ++r->props.num_data_blocks;
+    r->props.uncompressed_data_size += uncompressed_block_data.size();
   }
 }
 
-void BlockBasedTableBuilder::BGWorkCompression(
-    const CompressionContext& compression_ctx,
-    UncompressionContext* verify_ctx) {
-  ParallelCompressionRep::BlockRep* block_rep = nullptr;
-  while (rep_->pc_rep->compress_queue.pop(block_rep)) {
-    assert(block_rep != nullptr);
-    CompressAndVerifyBlock(block_rep->contents, true, /* is_data_block*/
-                           compression_ctx, verify_ctx,
-                           block_rep->compressed_data.get(),
-                           &block_rep->compressed_contents,
-                           &(block_rep->compression_type), &block_rep->status);
-    block_rep->slot->Fill(block_rep);
-  }
+uint64_t BlockBasedTableBuilder::GetWorkerCPUMicros() const {
+  return rep_->worker_cpu_micros.LoadRelaxed();
 }
 
-void BlockBasedTableBuilder::CompressAndVerifyBlock(
-    const Slice& uncompressed_block_data, bool is_data_block,
-    const CompressionContext& compression_ctx, UncompressionContext* verify_ctx,
-    std::string* compressed_output, Slice* block_contents,
-    CompressionType* type, Status* out_status) {
-  Rep* r = rep_;
-  bool is_status_ok = ok();
-  if (!r->IsParallelCompressionEnabled()) {
-    assert(is_status_ok);
-  }
+void BlockBasedTableBuilder::BGWorker(WorkingAreaPair& working_area) {
+  // Record CPU usage of this thread
+  const uint64_t start_cpu_micros =
+      rep_->ioptions.env->GetSystemClock()->CPUMicros();
+  Defer log_cpu{[this, start_cpu_micros]() {
+    rep_->worker_cpu_micros.FetchAddRelaxed(
+        rep_->ioptions.env->GetSystemClock()->CPUMicros() - start_cpu_micros);
+  }};
+
+  auto& pc_rep = *rep_->pc_rep;
+#ifdef BBTB_PC_WATCHDOG
+  pc_rep.live_workers.FetchAddRelaxed(1);
+  Defer decr{[&pc_rep]() { pc_rep.live_workers.FetchSubRelaxed(1); }};
+#endif  // BBTB_PC_WATCHDOG
+  ParallelCompressionRep::ThreadState thread_state =
+      ParallelCompressionRep::ThreadState::kIdle;
+  uint32_t slot = 0;
+  // Workers should avoid checking the shared status (e.g. ok()) to minimize
+  // potential data dependencies across threads. If another thread hits an
+  // error, we will pick up the kEnd state from the abort.
+  IOStatus ios;
+  do {
+    pc_rep.WorkerStateTransition(thread_state, slot);
+    ParallelCompressionRep::BlockRep* block_rep = &pc_rep.ring_buffer[slot];
+    auto compress_fn = [this, block_rep, &ios, &working_area]() {
+      ios = status_to_io_status(CompressAndVerifyBlock(
+          block_rep->uncompressed, /*is_data_block=*/true, working_area,
+          &block_rep->compressed, &block_rep->compression_type));
+    };
+    auto write_fn = [this, block_rep, &ios]() {
+      Slice compressed = block_rep->compressed;
+      Slice uncompressed = block_rep->uncompressed;
+      bool skip_delta_encoding = false;
+      ios = WriteMaybeCompressedBlockImpl(
+          block_rep->compression_type == kNoCompression ? uncompressed
+                                                        : compressed,
+          block_rep->compression_type, &rep_->pending_handle, BlockType::kData,
+          &uncompressed, &skip_delta_encoding);
+      if (LIKELY(ios.ok())) {
+        rep_->props.data_size = rep_->get_offset();
+        rep_->props.uncompressed_data_size += block_rep->uncompressed.size();
+
+        rep_->index_builder->FinishIndexEntry(
+            rep_->pending_handle, block_rep->prepared_index_entry.get(),
+            skip_delta_encoding);
+      }
+    };
+    switch (thread_state) {
+      case ParallelCompressionRep::ThreadState::kEnd:
+        // All done
+        assert(ios.ok());
+        return;
+      case ParallelCompressionRep::ThreadState::kCompressing:
+        compress_fn();
+        break;
+      case ParallelCompressionRep::ThreadState::kCompressingAndWriting:
+        compress_fn();
+        if (LIKELY(ios.ok())) {
+          write_fn();
+        }
+        break;
+      case ParallelCompressionRep::ThreadState::kWriting:
+        write_fn();
+        break;
+      case ParallelCompressionRep::ThreadState::kEmitting:
+        // Shouldn't happen
+        assert(thread_state != ParallelCompressionRep::ThreadState::kEmitting);
+        break;
+      case ParallelCompressionRep::ThreadState::kIdle:
+        // Shouldn't happen
+        assert(thread_state != ParallelCompressionRep::ThreadState::kIdle);
+        break;
+      default:
+        assert(false);
+        break;
+    }
+  } while (LIKELY(ios.ok()));
+  // Hit an error, so abort
+  rep_->SetIOStatus(ios);
+  pc_rep.SetAbort(thread_state);
+}
 
-  if (is_status_ok && uncompressed_block_data.size() < kCompressionSizeLimit) {
-    StopWatchNano timer(
-        r->ioptions.clock,
-        ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats));
+Status BlockBasedTableBuilder::CompressAndVerifyBlock(
+    const Slice& uncompressed_block_data, bool is_data_block,
+    WorkingAreaPair& working_area, GrowableBuffer* compressed_output,
+    CompressionType* result_compression_type) {
+  Rep* r = rep_.get();
+  Status status;
 
-    *type = r->compression_type;
-#ifndef NDEBUG
-    if (r->compression_type != kNoCompression &&
-        g_hack_mixed_compression_in_block_based_table.LoadRelaxed() > 0U) {
-      // If zstd is in the mix, the compression_name table property needs to be
-      // set to it, for proper handling of context and dictionaries.
-      assert(!ZSTD_Supported() || r->compression_type == kZSTD);
-      const auto& compressions = GetSupportedCompressions();
-      auto counter =
-          g_hack_mixed_compression_in_block_based_table.FetchAddRelaxed(1);
-      *type = compressions[counter % compressions.size()];
-    }
-#endif  // !NDEBUG
+  UnownedPtr<Compressor> compressor = nullptr;
+  Decompressor* verify_decomp = nullptr;
+  if (is_data_block) {
+    compressor = r->data_block_compressor;
+    verify_decomp = r->data_block_verify_decompressor.get();
+  } else {
+    compressor = r->index_block_compressor;
+    verify_decomp = r->verify_decompressor.get();
+  }
+
+  compressed_output->Reset();
+  CompressionType type = kNoCompression;
+  if (LIKELY(uncompressed_block_data.size() < kCompressionSizeLimit)) {
+    if (compressor) {
+      StopWatchNano timer(
+          r->ioptions.clock,
+          ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats));
+
+      size_t max_compressed_size = static_cast<size_t>(
+          (static_cast<uint64_t>(r->max_compressed_bytes_per_kb) *
+           uncompressed_block_data.size()) >>
+          10);
+      compressed_output->ResetForSize(max_compressed_size);
+      status = compressor->CompressBlock(
+          uncompressed_block_data, compressed_output->data(),
+          &compressed_output->MutableSize(), &type, &working_area.compress);
+
+      // Post-condition of Compressor::CompressBlock
+      assert(type == kNoCompression || status.ok());
+      assert(type == kNoCompression ||
+             r->table_options.verify_compression == (verify_decomp != nullptr));
 
-    if (is_data_block) {
-      r->compressible_input_data_bytes.fetch_add(uncompressed_block_data.size(),
-                                                 std::memory_order_relaxed);
-    }
-    const CompressionDict* compression_dict;
-    if (!is_data_block || r->compression_dict == nullptr) {
-      compression_dict = &CompressionDict::GetEmptyDict();
-    } else {
-      compression_dict = r->compression_dict.get();
-    }
-    assert(compression_dict != nullptr);
-    CompressionInfo compression_info(r->compression_opts, compression_ctx,
-                                     *compression_dict, *type,
-                                     r->sample_for_compression);
-
-    std::string sampled_output_fast;
-    std::string sampled_output_slow;
-    *block_contents = CompressBlock(
-        uncompressed_block_data, compression_info, type,
-        r->table_options.format_version, is_data_block /* allow_sample */,
-        compressed_output, &sampled_output_fast, &sampled_output_slow);
-
-    if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) {
-      // Currently compression sampling is only enabled for data block.
-      assert(is_data_block);
-      r->sampled_input_data_bytes.fetch_add(uncompressed_block_data.size(),
-                                            std::memory_order_relaxed);
-      r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(),
-                                                  std::memory_order_relaxed);
-      r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(),
-                                                  std::memory_order_relaxed);
-    }
-    // notify collectors on block add
-    NotifyCollectTableCollectorsOnBlockAdd(
-        r->table_properties_collectors, uncompressed_block_data.size(),
-        sampled_output_fast.size(), sampled_output_slow.size());
-
-    // Some of the compression algorithms are known to be unreliable. If
-    // the verify_compression flag is set then try to de-compress the
-    // compressed data and compare to the input.
-    if (*type != kNoCompression && r->table_options.verify_compression) {
-      // Retrieve the uncompressed contents into a new buffer
-      const UncompressionDict* verify_dict;
-      if (!is_data_block || r->verify_dict == nullptr) {
-        verify_dict = &UncompressionDict::GetEmptyDict();
-      } else {
-        verify_dict = r->verify_dict.get();
-      }
-      assert(verify_dict != nullptr);
-      BlockContents contents;
-      UncompressionInfo uncompression_info(*verify_ctx, *verify_dict,
-                                           r->compression_type);
-      Status uncompress_status = UncompressBlockData(
-          uncompression_info, block_contents->data(), block_contents->size(),
-          &contents, r->table_options.format_version, r->ioptions);
-
-      if (uncompress_status.ok()) {
-        bool data_match = contents.data.compare(uncompressed_block_data) == 0;
-        if (!data_match) {
-          // The result of the compression was invalid. abort.
-          const char* const msg =
-              "Decompressed block did not match pre-compression block";
-          ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg);
-          *out_status = Status::Corruption(msg);
-          *type = kNoCompression;
+      TEST_SYNC_POINT_CALLBACK(
+          "BlockBasedTableBuilder::CompressAndVerifyBlock:TamperWithResultType",
+          &type);
+
+      // Some of the compression algorithms are known to be unreliable. If
+      // the verify_compression flag is set then try to de-compress the
+      // compressed data and compare to the input.
+      if (verify_decomp && type != kNoCompression) {
+        BlockContents contents;
+        Status uncompress_status = DecompressBlockData(
+            compressed_output->data(), compressed_output->size(), type,
+            *verify_decomp, &contents, r->ioptions,
+            /*allocator=*/nullptr, &working_area.verify);
+
+        if (LIKELY(uncompress_status.ok())) {
+          bool data_match = contents.data.compare(uncompressed_block_data) == 0;
+          if (!data_match) {
+            // The result of the compression was invalid. abort.
+            const char* const msg =
+                "Decompressed block did not match pre-compression block";
+            ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg);
+            status = Status::Corruption(msg);
+            type = kNoCompression;
+          }
+        } else {
+          // Decompression reported an error. abort.
+          status = Status::Corruption(std::string("Could not decompress: ") +
+                                      uncompress_status.getState());
+          type = kNoCompression;
         }
-      } else {
-        // Decompression reported an error. abort.
-        *out_status = Status::Corruption(std::string("Could not decompress: ") +
-                                         uncompress_status.getState());
-        *type = kNoCompression;
+      }
+      if (timer.IsStarted()) {
+        RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS,
+                              timer.ElapsedNanos());
       }
     }
-    if (timer.IsStarted()) {
-      RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS,
-                            timer.ElapsedNanos());
+    if (is_data_block) {
+      r->compressible_input_data_bytes.FetchAddRelaxed(
+          uncompressed_block_data.size());
+      r->uncompressible_input_data_bytes.FetchAddRelaxed(kBlockTrailerSize);
     }
   } else {
     // Status is not OK, or block is too big to be compressed.
     if (is_data_block) {
-      r->uncompressible_input_data_bytes.fetch_add(
-          uncompressed_block_data.size(), std::memory_order_relaxed);
+      r->uncompressible_input_data_bytes.FetchAddRelaxed(
+          uncompressed_block_data.size() + kBlockTrailerSize);
     }
-    *type = kNoCompression;
-  }
-  if (is_data_block) {
-    r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize,
-                                                 std::memory_order_relaxed);
   }
 
   // Abort compression if the block is too big, or did not pass
   // verification.
-  if (*type == kNoCompression) {
-    *block_contents = uncompressed_block_data;
+  if (type == kNoCompression) {
     bool compression_attempted = !compressed_output->empty();
     RecordTick(r->ioptions.stats, compression_attempted
                                       ? NUMBER_BLOCK_COMPRESSION_REJECTED
@@ -1369,45 +1997,112 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
                uncompressed_block_data.size());
     RecordTick(r->ioptions.stats, BYTES_COMPRESSED_TO,
                compressed_output->size());
+    if (r->IsParallelCompressionActive() && is_data_block) {
+      r->pc_rep->estimated_inflight_size.FetchSubRelaxed(
+          uncompressed_block_data.size() - compressed_output->size());
+    }
   }
+  *result_compression_type = type;
+  return status;
 }
 
 void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
     const Slice& block_contents, CompressionType comp_type, BlockHandle* handle,
-    BlockType block_type, const Slice* uncompressed_block_data) {
+    BlockType block_type, const Slice* uncompressed_block_data,
+    bool* skip_delta_encoding) {
+  // Must have pre-checked status in single-threaded context
+  assert(status().ok());
+  assert(io_status().ok());
+  rep_->SetIOStatus(WriteMaybeCompressedBlockImpl(
+      block_contents, comp_type, handle, block_type, uncompressed_block_data,
+      skip_delta_encoding));
+}
+
+IOStatus BlockBasedTableBuilder::WriteMaybeCompressedBlockImpl(
+    const Slice& block_contents, CompressionType comp_type, BlockHandle* handle,
+    BlockType block_type, const Slice* uncompressed_block_data,
+    bool* skip_delta_encoding) {
   // File format contains a sequence of blocks where each block has:
   //    block_data: uint8[n]
   //    compression_type: uint8
   //    checksum: uint32
-  Rep* r = rep_;
+  Rep* r = rep_.get();
   bool is_data_block = block_type == BlockType::kData;
+  // For data block, skip_delta_encoding must be non null
+  if (is_data_block) {
+    assert(skip_delta_encoding != nullptr);
+  }
+  if (skip_delta_encoding != nullptr) {
+    *skip_delta_encoding = false;
+  }
   IOOptions io_options;
+  // Always return io_s for NRVO
   IOStatus io_s =
       WritableFileWriter::PrepareIOOptions(r->write_options, io_options);
-  if (!io_s.ok()) {
-    r->SetIOStatus(io_s);
-    return;
+  if (UNLIKELY(!io_s.ok())) {
+    return io_s;
   }
   // Old, misleading name of this function: WriteRawBlock
   StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS);
-  const uint64_t offset = r->get_offset();
+
+  auto offset = r->get_offset();
+  // try to align the data block page to the super alignment size, if enabled
+  if ((r->table_options.super_block_alignment_size != 0) && is_data_block) {
+    auto super_block_alignment_mask =
+        r->table_options.super_block_alignment_size - 1;
+    if ((r->table_options.super_block_alignment_space_overhead_ratio != 0) &&
+        (offset & (~super_block_alignment_mask)) !=
+            ((offset + block_contents.size()) &
+             (~super_block_alignment_mask))) {
+      auto allowed_max_padding_size =
+          r->table_options.super_block_alignment_size /
+          r->table_options.super_block_alignment_space_overhead_ratio;
+      // new block would cross the super block boundary
+      auto pad_bytes = r->table_options.super_block_alignment_size -
+                       (offset & super_block_alignment_mask);
+      if (pad_bytes < allowed_max_padding_size) {
+        io_s = r->file->Pad(io_options, pad_bytes, allowed_max_padding_size);
+        if (UNLIKELY(!io_s.ok())) {
+          r->SetIOStatus(io_s);
+          return io_s;
+        }
+        r->pre_compression_size += pad_bytes;
+        offset += pad_bytes;
+        r->set_offset(offset);
+        if (skip_delta_encoding != nullptr) {
+          // Skip delta encoding in index block builder when a super block
+          // alignment padding is added for data block.
+          *skip_delta_encoding = true;
+        }
+        TEST_SYNC_POINT(
+            "BlockBasedTableBuilder::WriteMaybeCompressedBlock:"
+            "SuperBlockAlignment");
+      } else {
+        TEST_SYNC_POINT(
+            "BlockBasedTableBuilder::WriteMaybeCompressedBlock:"
+            "SuperBlockAlignmentPaddingBytesExceedLimit");
+      }
+    }
+  }
+
   handle->set_offset(offset);
   handle->set_size(block_contents.size());
-  assert(status().ok());
-  assert(io_status().ok());
   if (uncompressed_block_data == nullptr) {
     uncompressed_block_data = &block_contents;
     assert(comp_type == kNoCompression);
   }
 
+  // TODO: consider a variant of this function that puts the trailer after
+  // block_contents (if it comes from a std::string) so we only need one
+  // r->file->Append call
   {
     io_s = r->file->Append(io_options, block_contents);
-    if (!io_s.ok()) {
-      r->SetIOStatus(io_s);
-      return;
+    if (UNLIKELY(!io_s.ok())) {
+      return io_s;
     }
   }
 
+  r->compression_types_used.Add(comp_type);
   std::array<char, kBlockTrailerSize> trailer;
   trailer[0] = comp_type;
   uint32_t checksum = ComputeBuiltinChecksumWithLastByte(
@@ -1416,10 +2111,10 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
   checksum += ChecksumModifierForContext(r->base_context_checksum, offset);
 
   if (block_type == BlockType::kFilter) {
-    Status s = r->filter_builder->MaybePostVerifyFilter(block_contents);
-    if (!s.ok()) {
-      r->SetStatus(s);
-      return;
+    io_s = status_to_io_status(
+        r->filter_builder->MaybePostVerifyFilter(block_contents));
+    if (UNLIKELY(!io_s.ok())) {
+      return io_s;
     }
   }
 
@@ -1429,36 +2124,21 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
       trailer.data());
   {
     io_s = r->file->Append(io_options, Slice(trailer.data(), trailer.size()));
-    if (!io_s.ok()) {
-      r->SetIOStatus(io_s);
-      return;
+    if UNLIKELY (!io_s.ok()) {
+      return io_s;
     }
   }
 
-  {
-    bool warm_cache;
-    switch (r->table_options.prepopulate_block_cache) {
-      case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly:
-        warm_cache = (r->reason == TableFileCreationReason::kFlush);
-        break;
-      case BlockBasedTableOptions::PrepopulateBlockCache::kDisable:
-        warm_cache = false;
-        break;
-      default:
-        // missing case
-        assert(false);
-        warm_cache = false;
-    }
-    if (warm_cache) {
-      Status s = InsertBlockInCacheHelper(*uncompressed_block_data, handle,
-                                          block_type);
-      if (!s.ok()) {
-        r->SetStatus(s);
-        return;
-      }
+  if (r->warm_cache) {
+    io_s = status_to_io_status(
+        InsertBlockInCacheHelper(*uncompressed_block_data, handle, block_type));
+    if (UNLIKELY(!io_s.ok())) {
+      return io_s;
     }
   }
 
+  r->pre_compression_size +=
+      uncompressed_block_data->size() + kBlockTrailerSize;
   r->set_offset(r->get_offset() + block_contents.size() + kBlockTrailerSize);
   if (r->table_options.block_align && is_data_block) {
     size_t pad_bytes =
@@ -1466,109 +2146,93 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
          ((block_contents.size() + kBlockTrailerSize) & (r->alignment - 1))) &
         (r->alignment - 1);
 
-    io_s = r->file->Pad(io_options, pad_bytes);
-    if (io_s.ok()) {
+    io_s = r->file->Pad(io_options, pad_bytes, kDefaultPageSize);
+    if (LIKELY(io_s.ok())) {
+      r->pre_compression_size += pad_bytes;
       r->set_offset(r->get_offset() + pad_bytes);
     } else {
-      r->SetIOStatus(io_s);
-      return;
+      return io_s;
     }
   }
 
-  if (r->IsParallelCompressionEnabled()) {
-    if (is_data_block) {
-      r->pc_rep->file_size_estimator.ReapBlock(block_contents.size(),
-                                               r->get_offset());
-    } else {
-      r->pc_rep->file_size_estimator.SetEstimatedFileSize(r->get_offset());
-    }
+  if (r->IsParallelCompressionActive() && is_data_block) {
+    r->pc_rep->estimated_inflight_size.FetchSubRelaxed(block_contents.size() +
+                                                       kBlockTrailerSize);
   }
+  return io_s;
 }
 
-void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
-  Rep* r = rep_;
-  ParallelCompressionRep::BlockRepSlot* slot = nullptr;
-  ParallelCompressionRep::BlockRep* block_rep = nullptr;
-  // Starts empty; see FilterBlockBuilder::AddWithPrevKey
-  std::string prev_block_last_key_no_ts;
-  while (r->pc_rep->write_queue.pop(slot)) {
-    assert(slot != nullptr);
-    slot->Take(block_rep);
-    assert(block_rep != nullptr);
-    if (!block_rep->status.ok()) {
-      r->SetStatus(block_rep->status);
-      // Reap block so that blocked Flush() can finish
-      // if there is one, and Flush() will notice !ok() next time.
-      block_rep->status = Status::OK();
-      r->pc_rep->ReapBlock(block_rep);
-      continue;
-    }
-
-    Slice prev_key_no_ts = prev_block_last_key_no_ts;
-    for (size_t i = 0; i < block_rep->keys->Size(); i++) {
-      auto& key = (*block_rep->keys)[i];
-      if (r->filter_builder != nullptr) {
-        Slice key_no_ts = ExtractUserKeyAndStripTimestamp(key, r->ts_sz);
-        r->filter_builder->AddWithPrevKey(key_no_ts, prev_key_no_ts);
-        prev_key_no_ts = key_no_ts;
-      }
-      r->index_builder->OnKeyAdded(key);
-    }
-    if (r->filter_builder != nullptr) {
-      prev_block_last_key_no_ts.assign(prev_key_no_ts.data(),
-                                       prev_key_no_ts.size());
-    }
-
-    r->pc_rep->file_size_estimator.SetCurrBlockUncompSize(
-        block_rep->data->size());
-    WriteMaybeCompressedBlock(block_rep->compressed_contents,
-                              block_rep->compression_type, &r->pending_handle,
-                              BlockType::kData, &block_rep->contents);
-    if (!ok()) {
-      break;
-    }
-
-    r->props.data_size = r->get_offset();
-    ++r->props.num_data_blocks;
-
-    if (block_rep->first_key_in_next_block == nullptr) {
-      r->index_builder->AddIndexEntry(block_rep->keys->Back(), nullptr,
-                                      r->pending_handle,
-                                      &r->index_separator_scratch);
-    } else {
-      Slice first_key_in_next_block =
-          Slice(*block_rep->first_key_in_next_block);
-      r->index_builder->AddIndexEntry(
-          block_rep->keys->Back(), &first_key_in_next_block, r->pending_handle,
-          &r->index_separator_scratch);
-    }
-
-    r->pc_rep->ReapBlock(block_rep);
+void BlockBasedTableBuilder::MaybeStartParallelCompression() {
+  if (rep_->compression_parallel_threads <= 1) {
+    return;
   }
+  // Although in theory having a separate thread for writing to the SST file
+  // could help to hide the latency associated with writing, it is more often
+  // the case that the latency comes in large units for rare calls to write that
+  // flush downstream buffers, including in WritableFileWriter. The buffering
+  // provided by the compression ring buffer is almost negligible for hiding
+  // that latency. So even with some optimizations, turning on the parallel
+  // framework when compression is disabled just eats more CPU with little-to-no
+  // improvement in throughput.
+  if (!rep_->data_block_compressor) {
+    // Force the generally best configuration for no compression: no parallelism
+    return;
+  }
+  rep_->pc_rep = std::make_unique<ParallelCompressionRep>(
+      rep_->compression_parallel_threads);
+  auto& pc_rep = *rep_->pc_rep;
+  for (uint32_t i = 0; i <= pc_rep.ring_buffer_mask; i++) {
+    pc_rep.ring_buffer[i].prepared_index_entry =
+        rep_->index_builder->CreatePreparedIndexEntry();
+  }
+  pc_rep.worker_threads.reserve(pc_rep.num_worker_threads);
+  pc_rep.working_areas.resize(pc_rep.num_worker_threads);
+  for (uint32_t i = 0; i < pc_rep.num_worker_threads; i++) {
+    auto& wa = pc_rep.working_areas[i];
+    if (rep_->data_block_compressor) {
+      wa.compress = rep_->data_block_compressor->ObtainWorkingArea();
+    }
+    if (rep_->data_block_verify_decompressor) {
+      wa.verify = rep_->data_block_verify_decompressor->ObtainWorkingArea(
+          rep_->data_block_compressor->GetPreferredCompressionType());
+    }
+    pc_rep.worker_threads.emplace_back([this, &wa] { BGWorker(wa); });
+  }
+#ifdef BBTB_PC_WATCHDOG
+  // Start watchdog thread
+  pc_rep.watchdog_thread = std::thread([&pc_rep] { pc_rep.BGWatchdog(); });
+  pc_rep.live_emit.StoreRelaxed(true);
+#endif  // BBTB_PC_WATCHDOG
 }
 
-void BlockBasedTableBuilder::StartParallelCompression() {
-  rep_->pc_rep.reset(
-      new ParallelCompressionRep(rep_->compression_opts.parallel_threads));
-  rep_->pc_rep->compress_thread_pool.reserve(
-      rep_->compression_opts.parallel_threads);
-  for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) {
-    rep_->pc_rep->compress_thread_pool.emplace_back([this, i] {
-      BGWorkCompression(*(rep_->compression_ctxs[i]),
-                        rep_->verify_ctxs[i].get());
-    });
-  }
-  rep_->pc_rep->write_thread.reset(
-      new port::Thread([this] { BGWorkWriteMaybeCompressedBlock(); }));
-}
-
-void BlockBasedTableBuilder::StopParallelCompression() {
-  rep_->pc_rep->compress_queue.finish();
-  for (auto& thread : rep_->pc_rep->compress_thread_pool) {
+void BlockBasedTableBuilder::StopParallelCompression(bool abort) {
+  auto& pc_rep = *rep_->pc_rep;
+  if (abort) {
+    pc_rep.SetAbort(pc_rep.emit_thread_state);
+  } else if (pc_rep.emit_thread_state !=
+             ParallelCompressionRep::ThreadState::kEnd) {
+    // In case we didn't do a final flush with no next key, which might have
+    // been skipped if !ok() was set after the start of Finish()
+    assert(rep_->props.num_data_blocks == 0 || !ok());
+    pc_rep.SetNoMoreToEmit(pc_rep.emit_thread_state, pc_rep.emit_slot);
+  }
+#ifdef BBTB_PC_WATCHDOG
+  pc_rep.live_emit.StoreRelaxed(false);
+#endif  // BBTB_PC_WATCHDOG
+  assert(pc_rep.emit_thread_state == ParallelCompressionRep::ThreadState::kEnd);
+  for (auto& thread : pc_rep.worker_threads) {
     thread.join();
   }
-  rep_->pc_rep->write_queue.finish();
-  rep_->pc_rep->write_thread->join();
+#ifdef BBTB_PC_WATCHDOG
+  // Wake & shutdown watchdog thread
+  {
+    std::unique_lock<std::mutex> lock(pc_rep.watchdog_mutex);
+    pc_rep.shutdown_watchdog = true;
+    pc_rep.watchdog_cv.notify_all();
+  }
+  pc_rep.watchdog_thread.join();
+#endif  // BBTB_PC_WATCHDOG
+  rep_->pc_rep.reset();
 }
 
 Status BlockBasedTableBuilder::status() const { return rep_->GetStatus(); }
@@ -1577,6 +2241,8 @@ IOStatus BlockBasedTableBuilder::io_status() const {
   return rep_->GetIOStatus();
 }
 
+bool BlockBasedTableBuilder::ok() const { return rep_->StatusOk(); }
+
 Status BlockBasedTableBuilder::InsertBlockInCacheHelper(
     const Slice& block_contents, const BlockHandle* handle,
     BlockType block_type) {
@@ -1587,11 +2253,15 @@ Status BlockBasedTableBuilder::InsertBlockInCacheHelper(
   if (block_cache && helper && helper->create_cb) {
     CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle);
     size_t charge;
+    // NOTE: data blocks (and everything else) will be warmed in decompressed
+    // state, so does not need a dictionary-aware decompressor. The only thing
+    // needing a decompressor here (in create_context) is warming the
+    // (de)compression dictionary, which will clone and save a dict-based
+    // decompressor from the corresponding non-dict decompressor.
     s = WarmInCache(block_cache, key.AsSlice(), block_contents,
                     &rep_->create_context, helper, Cache::Priority::LOW,
                     &charge);
-
-    if (s.ok()) {
+    if (LIKELY(s.ok())) {
       BlockBasedTable::UpdateCacheInsertionMetrics(
           block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(),
           rep_->ioptions.stats);
@@ -1617,11 +2287,11 @@ void BlockBasedTableBuilder::WriteFilterBlock(
   }
   BlockHandle filter_block_handle;
   bool is_partitioned_filter = rep_->table_options.partition_filters;
-  if (ok()) {
+  if (LIKELY(ok())) {
     rep_->props.num_filter_entries +=
         rep_->filter_builder->EstimateEntriesAdded();
     Status s = Status::Incomplete();
-    while (ok() && s.IsIncomplete()) {
+    while (LIKELY(ok()) && s.IsIncomplete()) {
       // filter_data is used to store the transferred filter data payload from
       // FilterBlockBuilder and deallocate the payload by going out of scope.
       // Otherwise, the payload will unnecessarily remain until
@@ -1651,7 +2321,7 @@ void BlockBasedTableBuilder::WriteFilterBlock(
     }
     rep_->filter_builder->ResetFilterBitsBuilder();
   }
-  if (ok()) {
+  if (LIKELY(ok())) {
     // Add mapping from "<filter_block_prefix>.Name" to location
     // of filter data.
     std::string key;
@@ -1664,30 +2334,37 @@ void BlockBasedTableBuilder::WriteFilterBlock(
 
 void BlockBasedTableBuilder::WriteIndexBlock(
     MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) {
-  if (!ok()) {
+  if (UNLIKELY(!ok())) {
     return;
   }
   IndexBuilder::IndexBlocks index_blocks;
   auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
-  if (index_builder_status.IsIncomplete()) {
-    // We we have more than one index partition then meta_blocks are not
-    // supported for the index. Currently meta_blocks are used only by
-    // HashIndexBuilder which is not multi-partition.
-    assert(index_blocks.meta_blocks.empty());
-  } else if (ok() && !index_builder_status.ok()) {
+  if (LIKELY(ok()) && !index_builder_status.ok() &&
+      !index_builder_status.IsIncomplete()) {
+    // If the index builder failed for non-Incomplete errors, we should
+    // mark the entire builder as having failed wit that status. However,
+    // If the index builder failed with an incomplete error, we should
+    // continue writing out any meta blocks that may have been generated.
     rep_->SetStatus(index_builder_status);
   }
-  if (ok()) {
+
+  if (LIKELY(ok())) {
     for (const auto& item : index_blocks.meta_blocks) {
       BlockHandle block_handle;
-      WriteBlock(item.second, &block_handle, BlockType::kIndex);
-      if (!ok()) {
+      if (item.second.first == BlockType::kIndex) {
+        WriteBlock(item.second.second, &block_handle, item.second.first);
+      } else {
+        assert(item.second.first == BlockType::kUserDefinedIndex);
+        WriteMaybeCompressedBlock(item.second.second, kNoCompression,
+                                  &block_handle, item.second.first);
+      }
+      if (UNLIKELY(!ok())) {
         break;
       }
       meta_index_builder->Add(item.first, block_handle);
     }
   }
-  if (ok()) {
+  if (LIKELY(ok())) {
     if (rep_->table_options.enable_index_compression) {
       WriteBlock(index_blocks.index_block_contents, index_block_handle,
                  BlockType::kIndex);
@@ -1700,7 +2377,7 @@ void BlockBasedTableBuilder::WriteIndexBlock(
   // If there are more index partitions, finish them and write them out
   if (index_builder_status.IsIncomplete()) {
     bool index_building_finished = false;
-    while (ok() && !index_building_finished) {
+    while (LIKELY(ok()) && !index_building_finished) {
       Status s =
           rep_->index_builder->Finish(&index_blocks, *index_block_handle);
       if (s.ok()) {
@@ -1726,8 +2403,8 @@ void BlockBasedTableBuilder::WriteIndexBlock(
     }
   }
   // If success and need to record in metaindex rather than footer...
-  if (!FormatVersionUsesIndexHandleInFooter(
-          rep_->table_options.format_version)) {
+  if (LIKELY(ok()) && !FormatVersionUsesIndexHandleInFooter(
+                          rep_->table_options.format_version)) {
     meta_index_builder->Add(kIndexBlockName, *index_block_handle);
   }
 }
@@ -1735,7 +2412,7 @@ void BlockBasedTableBuilder::WriteIndexBlock(
 void BlockBasedTableBuilder::WritePropertiesBlock(
     MetaIndexBuilder* meta_index_builder) {
   BlockHandle properties_block_handle;
-  if (ok()) {
+  if (LIKELY(ok())) {
     PropertyBlockBuilder property_block_builder;
     rep_->props.filter_policy_name =
         rep_->table_options.filter_policy != nullptr
@@ -1750,10 +2427,6 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
         rep_->ioptions.merge_operator != nullptr
             ? rep_->ioptions.merge_operator->Name()
             : "nullptr";
-    rep_->props.compression_name =
-        CompressionTypeToString(rep_->compression_type);
-    rep_->props.compression_options =
-        CompressionOptionsToString(rep_->compression_opts);
     rep_->props.prefix_extractor_name =
         rep_->prefix_extractor ? rep_->prefix_extractor->AsString() : "nullptr";
     std::string property_collectors_names = "[";
@@ -1767,37 +2440,42 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
     }
     property_collectors_names += "]";
     rep_->props.property_collectors_names = property_collectors_names;
+
+    rep_->PostPopulateCompressionProperties();
+
     if (rep_->table_options.index_type ==
         BlockBasedTableOptions::kTwoLevelIndexSearch) {
       assert(rep_->p_index_builder_ != nullptr);
       rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions();
       rep_->props.top_level_index_size =
-          rep_->p_index_builder_->TopLevelIndexSize(rep_->offset);
+          rep_->p_index_builder_->TopLevelIndexSize(rep_->offset.LoadRelaxed());
     }
     rep_->props.index_key_is_user_key =
-        !rep_->index_builder->seperator_is_key_plus_seq();
+        !rep_->index_builder->separator_is_key_plus_seq();
     rep_->props.index_value_is_delta_encoded =
         rep_->use_delta_encoding_for_index_values;
-    if (rep_->sampled_input_data_bytes > 0) {
+    if (rep_->sampled_input_data_bytes.LoadRelaxed() > 0) {
       rep_->props.slow_compression_estimated_data_size = static_cast<uint64_t>(
-          static_cast<double>(rep_->sampled_output_slow_data_bytes) /
-              rep_->sampled_input_data_bytes *
-              rep_->compressible_input_data_bytes +
-          rep_->uncompressible_input_data_bytes + 0.5);
+          static_cast<double>(
+              rep_->sampled_output_slow_data_bytes.LoadRelaxed()) /
+              rep_->sampled_input_data_bytes.LoadRelaxed() *
+              rep_->compressible_input_data_bytes.LoadRelaxed() +
+          rep_->uncompressible_input_data_bytes.LoadRelaxed() + 0.5);
       rep_->props.fast_compression_estimated_data_size = static_cast<uint64_t>(
-          static_cast<double>(rep_->sampled_output_fast_data_bytes) /
-              rep_->sampled_input_data_bytes *
-              rep_->compressible_input_data_bytes +
-          rep_->uncompressible_input_data_bytes + 0.5);
+          static_cast<double>(
+              rep_->sampled_output_fast_data_bytes.LoadRelaxed()) /
+              rep_->sampled_input_data_bytes.LoadRelaxed() *
+              rep_->compressible_input_data_bytes.LoadRelaxed() +
+          rep_->uncompressible_input_data_bytes.LoadRelaxed() + 0.5);
     } else if (rep_->sample_for_compression > 0) {
-      // We tried to sample but none were found. Assume worst-case (compression
-      // ratio 1.0) so data is complete and aggregatable.
+      // We tried to sample but none were found. Assume worst-case
+      // (compression ratio 1.0) so data is complete and aggregatable.
       rep_->props.slow_compression_estimated_data_size =
-          rep_->compressible_input_data_bytes +
-          rep_->uncompressible_input_data_bytes;
+          rep_->compressible_input_data_bytes.LoadRelaxed() +
+          rep_->uncompressible_input_data_bytes.LoadRelaxed();
       rep_->props.fast_compression_estimated_data_size =
-          rep_->compressible_input_data_bytes +
-          rep_->uncompressible_input_data_bytes;
+          rep_->compressible_input_data_bytes.LoadRelaxed() +
+          rep_->uncompressible_input_data_bytes.LoadRelaxed();
     }
     rep_->props.user_defined_timestamps_persisted =
         rep_->persist_user_defined_timestamps;
@@ -1818,7 +2496,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
     WriteMaybeCompressedBlock(block_data, kNoCompression,
                               &properties_block_handle, BlockType::kProperties);
   }
-  if (ok()) {
+  if (LIKELY(ok())) {
 #ifndef NDEBUG
     {
       uint64_t props_block_offset = properties_block_handle.offset();
@@ -1842,21 +2520,21 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
 
 void BlockBasedTableBuilder::WriteCompressionDictBlock(
     MetaIndexBuilder* meta_index_builder) {
-  if (rep_->compression_dict != nullptr &&
-      rep_->compression_dict->GetRawDict().size()) {
+  Slice compression_dict;
+  if (rep_->data_block_compressor) {
+    compression_dict = rep_->data_block_compressor->GetSerializedDict();
+  }
+  if (!compression_dict.empty()) {
     BlockHandle compression_dict_block_handle;
-    if (ok()) {
-      WriteMaybeCompressedBlock(rep_->compression_dict->GetRawDict(),
-                                kNoCompression, &compression_dict_block_handle,
+    if (LIKELY(ok())) {
+      WriteMaybeCompressedBlock(compression_dict, kNoCompression,
+                                &compression_dict_block_handle,
                                 BlockType::kCompressionDictionary);
-#ifndef NDEBUG
-      Slice compression_dict = rep_->compression_dict->GetRawDict();
       TEST_SYNC_POINT_CALLBACK(
           "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
           &compression_dict);
-#endif  // NDEBUG
     }
-    if (ok()) {
+    if (LIKELY(ok())) {
       meta_index_builder->Add(kCompressionDictBlockName,
                               compression_dict_block_handle);
     }
@@ -1865,7 +2543,7 @@ void BlockBasedTableBuilder::WriteCompressionDictBlock(
 
 void BlockBasedTableBuilder::WriteRangeDelBlock(
     MetaIndexBuilder* meta_index_builder) {
-  if (ok() && !rep_->range_del_block.empty()) {
+  if (LIKELY(ok()) && !rep_->range_del_block.empty()) {
     BlockHandle range_del_block_handle;
     WriteMaybeCompressedBlock(rep_->range_del_block.Finish(), kNoCompression,
                               &range_del_block_handle,
@@ -1876,11 +2554,8 @@ void BlockBasedTableBuilder::WriteRangeDelBlock(
 
 void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
                                          BlockHandle& index_block_handle) {
-  assert(ok());
-  Rep* r = rep_;
-  // this is guaranteed by BlockBasedTableBuilder's constructor
-  assert(r->table_options.checksum == kCRC32c ||
-         r->table_options.format_version != 0);
+  assert(LIKELY(ok()));
+  Rep* r = rep_.get();
   FooterBuilder footer;
   Status s = footer.Build(kBlockBasedTableMagicNumber,
                           r->table_options.format_version, r->get_offset(),
@@ -1899,30 +2574,56 @@ void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
   }
   ios = r->file->Append(io_options, footer.GetSlice());
   if (ios.ok()) {
+    r->pre_compression_size += footer.GetSlice().size();
     r->set_offset(r->get_offset() + footer.GetSlice().size());
   } else {
     r->SetIOStatus(ios);
   }
 }
 
-void BlockBasedTableBuilder::EnterUnbuffered() {
-  Rep* r = rep_;
+void BlockBasedTableBuilder::MaybeEnterUnbuffered(
+    const Slice* first_key_in_next_block) {
+  Rep* r = rep_.get();
   assert(r->state == Rep::State::kBuffered);
+  // Don't yet enter unbuffered (early return) if none of the conditions are
+  // met
+  if (first_key_in_next_block != nullptr) {
+    bool exceeds_buffer_limit =
+        (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit);
+    if (!exceeds_buffer_limit) {
+      bool exceeds_global_block_cache_limit = false;
+      // Increase cache charging for the last buffered data block
+      // only if the block is not going to be unbuffered immediately
+      // and there exists a cache reservation manager
+      if (r->compression_dict_buffer_cache_res_mgr != nullptr) {
+        Status s =
+            r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation(
+                r->data_begin_offset);
+        exceeds_global_block_cache_limit = s.IsMemoryLimit();
+      }
+      if (!exceeds_global_block_cache_limit) {
+        return;
+      }
+    }
+  }
+
+  // Enter Unbuffered state
   r->state = Rep::State::kUnbuffered;
-  const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
-                                  ? r->compression_opts.zstd_max_train_bytes
-                                  : r->compression_opts.max_dict_bytes;
   const size_t kNumBlocksBuffered = r->data_block_buffers.size();
   if (kNumBlocksBuffered == 0) {
     // The below code is neither safe nor necessary for handling zero data
     // blocks.
+    // For PostPopulateCompressionProperties()
+    assert(!r->data_block_compressor);
+    r->data_block_compressor = r->basic_compressor.get();
     return;
   }
 
   // Abstract algebra teaches us that a finite cyclic group (such as the
   // additive group of integers modulo N) can be generated by a number that is
   // coprime with N. Since N is variable (number of buffered data blocks), we
-  // must then pick a prime number in order to guarantee coprimeness with any N.
+  // must then pick a prime number in order to guarantee coprimeness with any
+  // N.
   //
   // One downside of this approach is the spread will be poor when
   // `kPrimeGeneratorRemainder` is close to zero or close to
@@ -1936,17 +2637,20 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
       kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered));
   const size_t kInitSampleIdx = kNumBlocksBuffered / 2;
 
-  std::string compression_dict_samples;
-  std::vector<size_t> compression_dict_sample_lens;
+  Compressor::DictSamples samples;
   size_t buffer_idx = kInitSampleIdx;
+  // Get max_sample_bytes from the DictSampling guidance
+  auto* sampling =
+      std::get_if<Compressor::DictSampling>(&r->data_block_dict_guidance);
+  assert(sampling != nullptr);
+  size_t max_sample_bytes = sampling->max_sample_bytes;
   for (size_t i = 0;
-       i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes;
+       i < kNumBlocksBuffered && samples.sample_data.size() < max_sample_bytes;
        ++i) {
-    size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(),
+    size_t copy_len = std::min(max_sample_bytes - samples.sample_data.size(),
                                r->data_block_buffers[buffer_idx].size());
-    compression_dict_samples.append(r->data_block_buffers[buffer_idx], 0,
-                                    copy_len);
-    compression_dict_sample_lens.emplace_back(copy_len);
+    samples.sample_data.append(r->data_block_buffers[buffer_idx], 0, copy_len);
+    samples.sample_lens.emplace_back(copy_len);
 
     buffer_idx += kPrimeGeneratorRemainder;
     if (buffer_idx >= kNumBlocksBuffered) {
@@ -1954,26 +2658,36 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
     }
   }
 
-  // final data block flushed, now we can generate dictionary from the samples.
-  // OK if compression_dict_samples is empty, we'll just get empty dictionary.
-  std::string dict;
-  if (r->compression_opts.zstd_max_train_bytes > 0) {
-    if (r->compression_opts.use_zstd_dict_trainer) {
-      dict = ZSTD_TrainDictionary(compression_dict_samples,
-                                  compression_dict_sample_lens,
-                                  r->compression_opts.max_dict_bytes);
+  assert(samples.sample_data.size() > 0);
+
+  // final sample data block flushed, now we can generate dictionary (or it
+  // might opt not to use a dictionary and that's ok)
+  r->data_block_compressor =
+      MaybeCloneSpecialized(r->basic_compressor.get(),
+                            CacheEntryRole::kDataBlock, std::move(samples));
+
+  Slice serialized_dict = r->data_block_compressor->GetSerializedDict();
+  if (r->verify_decompressor) {
+    if (serialized_dict.empty()) {
+      // No dictionary
+      r->data_block_verify_decompressor = r->verify_decompressor.get();
     } else {
-      dict = ZSTD_FinalizeDictionary(
-          compression_dict_samples, compression_dict_sample_lens,
-          r->compression_opts.max_dict_bytes, r->compression_opts.level);
+      // Get an updated dictionary-aware decompressor for verification.
+      Status s = r->verify_decompressor->MaybeCloneForDict(
+          serialized_dict, &r->verify_decompressor_with_dict);
+      // Dictionary support must be present on the decompressor side if it's
+      // on the compressor side.
+      assert(r->verify_decompressor_with_dict);
+      if (r->verify_decompressor_with_dict) {
+        r->data_block_verify_decompressor =
+            r->verify_decompressor_with_dict.get();
+        assert(s.ok());
+      } else {
+        assert(!s.ok());
+        r->SetStatus(s);
+      }
     }
-  } else {
-    dict = std::move(compression_dict_samples);
   }
-  r->compression_dict.reset(new CompressionDict(dict, r->compression_type,
-                                                r->compression_opts.level));
-  r->verify_dict.reset(
-      new UncompressionDict(dict, r->compression_type == kZSTD));
 
   auto get_iterator_for_block = [&r](size_t i) {
     auto& data_block = r->data_block_buffers[i];
@@ -1998,59 +2712,37 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
       assert(iter != nullptr);
     };
 
+    for (; iter->Valid(); iter->Next()) {
+      Slice key = iter->key();
+      if (r->filter_builder != nullptr) {
+        // NOTE: AddWithPrevKey here would only save key copying if prev is
+        // pinned (iter->IsKeyPinned()), which is probably rare with delta
+        // encoding. OK to go from Add() here to AddWithPrevKey() in
+        // unbuffered operation.
+        r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
+      }
+      r->index_builder->OnKeyAdded(key, iter->value());
+    }
+
+    Slice first_key_in_loop_next_block;
+    const Slice* first_key_in_loop_next_block_ptr;
     if (i + 1 < r->data_block_buffers.size()) {
       next_block_iter = get_iterator_for_block(i + 1);
+      first_key_in_loop_next_block = next_block_iter->key();
+      first_key_in_loop_next_block_ptr = &first_key_in_loop_next_block;
+    } else {
+      first_key_in_loop_next_block_ptr = first_key_in_next_block;
     }
 
     auto& data_block = r->data_block_buffers[i];
-    if (r->IsParallelCompressionEnabled()) {
-      Slice first_key_in_next_block;
-      const Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
-      if (i + 1 < r->data_block_buffers.size()) {
-        assert(next_block_iter != nullptr);
-        first_key_in_next_block = next_block_iter->key();
-      } else {
-        first_key_in_next_block_ptr = r->first_key_in_next_block;
-      }
-
-      std::vector<std::string> keys;
-      for (; iter->Valid(); iter->Next()) {
-        keys.emplace_back(iter->key().ToString());
-      }
-
-      ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock(
-          r->compression_type, first_key_in_next_block_ptr, &data_block, &keys);
+    iter->SeekToLast();
+    assert(iter->Valid());
+    if (r->IsParallelCompressionActive()) {
+      EmitBlockForParallel(data_block, iter->key(),
+                           first_key_in_loop_next_block_ptr);
 
-      assert(block_rep != nullptr);
-      r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(),
-                                               r->get_offset());
-      r->pc_rep->EmitBlock(block_rep);
     } else {
-      for (; iter->Valid(); iter->Next()) {
-        Slice key = iter->key();
-        if (r->filter_builder != nullptr) {
-          // NOTE: AddWithPrevKey here would only save key copying if prev is
-          // pinned (iter->IsKeyPinned()), which is probably rare with delta
-          // encoding. OK to go from Add() here to AddWithPrevKey() in
-          // unbuffered operation.
-          r->filter_builder->Add(
-              ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
-        }
-        r->index_builder->OnKeyAdded(key);
-      }
-      WriteBlock(Slice(data_block), &r->pending_handle, BlockType::kData);
-      if (ok() && i + 1 < r->data_block_buffers.size()) {
-        assert(next_block_iter != nullptr);
-        Slice first_key_in_next_block = next_block_iter->key();
-
-        Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
-
-        iter->SeekToLast();
-        assert(iter->Valid());
-        r->index_builder->AddIndexEntry(
-            iter->key(), first_key_in_next_block_ptr, r->pending_handle,
-            &r->index_separator_scratch);
-      }
+      EmitBlock(data_block, iter->key(), first_key_in_loop_next_block_ptr);
     }
     std::swap(iter, next_block_iter);
   }
@@ -2065,32 +2757,36 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
 }
 
 Status BlockBasedTableBuilder::Finish() {
-  Rep* r = rep_;
+  Rep* r = rep_.get();
   assert(r->state != Rep::State::kClosed);
-  bool empty_data_block = r->data_block.empty();
-  r->first_key_in_next_block = nullptr;
-  Flush();
-  if (r->state == Rep::State::kBuffered) {
-    EnterUnbuffered();
-  }
-  if (r->IsParallelCompressionEnabled()) {
-    StopParallelCompression();
+
 #ifndef NDEBUG
-    for (const auto& br : r->pc_rep->block_rep_buf) {
-      assert(br.status.ok());
+  {
+    // This sync point callback is a simple approximation of a failure detected
+    // in parallel compression after the start of calling Finish() but before
+    // Finish() calls Flush()
+    IOStatus s = rep_->GetIOStatus();
+    TEST_SYNC_POINT_CALLBACK("BlockBasedTableBuilder::Finish:ParallelIOStatus",
+                             &s);
+    if (!s.ok()) {
+      rep_->SetIOStatus(s);
     }
+  }
 #endif  // !NDEBUG
-  } else {
-    // To make sure properties block is able to keep the accurate size of index
-    // block, we will finish writing all index entries first.
-    if (ok() && !empty_data_block) {
-      r->index_builder->AddIndexEntry(
-          r->last_ikey, nullptr /* no next data block */, r->pending_handle,
-          &r->index_separator_scratch);
-    }
+  // To make sure properties block is able to keep the accurate size of index
+  // block, we will finish writing all index entries first, in Flush().
+  Flush(/*first_key_in_next_block=*/nullptr);
+  if (rep_->state == Rep::State::kBuffered) {
+    MaybeEnterUnbuffered(nullptr);
+  }
+  assert(r->state == Rep::State::kUnbuffered);
+  if (r->IsParallelCompressionActive()) {
+    StopParallelCompression(/*abort=*/false);
   }
 
-  r->props.tail_start_offset = r->offset;
+  r->props.tail_start_offset = r->offset.LoadRelaxed();
+
+  uint64_t last_estimated_tail_size = EstimatedTailSize();
 
   // Write meta blocks, metaindex block and footer in the following order.
   //    1. [meta block: filter]
@@ -2107,36 +2803,45 @@ Status BlockBasedTableBuilder::Finish() {
   WriteCompressionDictBlock(&meta_index_builder);
   WriteRangeDelBlock(&meta_index_builder);
   WritePropertiesBlock(&meta_index_builder);
-  if (ok()) {
+  if (LIKELY(ok())) {
     // flush the meta index block
     WriteMaybeCompressedBlock(meta_index_builder.Finish(), kNoCompression,
                               &metaindex_block_handle, BlockType::kMetaIndex);
   }
-  if (ok()) {
+  if (LIKELY(ok())) {
     WriteFooter(metaindex_block_handle, index_block_handle);
   }
   r->state = Rep::State::kClosed;
-  r->tail_size = r->offset - r->props.tail_start_offset;
-
-  Status ret_status = r->CopyStatus();
-  IOStatus ios = r->GetIOStatus();
-  if (!ios.ok() && ret_status.ok()) {
-    // Let io_status supersede ok status (otherwise status takes precedennce)
-    ret_status = ios;
-  }
-  return ret_status;
+  r->tail_size = r->offset.LoadRelaxed() - r->props.tail_start_offset;
+
+  // Assert tail size estimation is an overestimate only when tail size
+  // estimation option is enabled for compaction files with supported
+  // index/filter types:
+  // - Shortened indexes (kBinarySearch, kBinarySearchWithFirstKey)
+  // - Partitioned indexes (kTwoLevelIndexSearch)
+  // - Full filters
+  // - Partitioned filters
+  if (r->target_file_size_is_upper_bound &&
+      r->reason == TableFileCreationReason::kCompaction &&
+      r->table_options.index_type != BlockBasedTableOptions::kHashSearch) {
+    ROCKS_LOG_WARN(r->ioptions.info_log,
+                   "File number: %" PRIu64 ", Estimated tail size = %" PRIu64
+                   " bytes, Actual tail size = %" PRIu64 " bytes",
+                   r->props.orig_file_number, last_estimated_tail_size,
+                   r->tail_size);
+    assert(r->tail_size <= last_estimated_tail_size);
+  }
+
+  return r->GetStatus();
 }
 
 void BlockBasedTableBuilder::Abandon() {
   assert(rep_->state != Rep::State::kClosed);
-  if (rep_->IsParallelCompressionEnabled()) {
-    StopParallelCompression();
+  if (rep_->IsParallelCompressionActive()) {
+    StopParallelCompression(/*abort=*/true);
   }
   rep_->state = Rep::State::kClosed;
-#ifdef ROCKSDB_ASSERT_STATUS_CHECKED  // Avoid unnecessary lock acquisition
-  rep_->CopyStatus().PermitUncheckedError();
-  rep_->CopyIOStatus().PermitUncheckedError();
-#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  rep_->GetIOStatus().PermitUncheckedError();
 }
 
 uint64_t BlockBasedTableBuilder::NumEntries() const {
@@ -2147,18 +2852,66 @@ bool BlockBasedTableBuilder::IsEmpty() const {
   return rep_->props.num_entries == 0 && rep_->props.num_range_deletions == 0;
 }
 
-uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; }
+uint64_t BlockBasedTableBuilder::PreCompressionSize() const {
+  return rep_->pre_compression_size;
+}
+
+uint64_t BlockBasedTableBuilder::FileSize() const {
+  return rep_->offset.LoadRelaxed();
+}
 
 uint64_t BlockBasedTableBuilder::EstimatedFileSize() const {
-  if (rep_->IsParallelCompressionEnabled()) {
-    // Use compression ratio so far and inflight uncompressed bytes to estimate
-    // final SST size.
-    return rep_->pc_rep->file_size_estimator.GetEstimatedFileSize();
+  if (rep_->IsParallelCompressionActive()) {
+    // Use upper bound on "inflight" data size to estimate
+    return FileSize() + rep_->pc_rep->estimated_inflight_size.LoadRelaxed();
   } else {
     return FileSize();
   }
 }
 
+uint64_t BlockBasedTableBuilder::EstimatedTailSize() const {
+  uint64_t estimated_tail_size = 0;
+
+  // 1. Estimate index size
+  if (rep_->table_options.index_type ==
+      BlockBasedTableOptions::kTwoLevelIndexSearch) {
+    assert(rep_->p_index_builder_);
+    estimated_tail_size += rep_->p_index_builder_->CurrentIndexSizeEstimate();
+  } else {
+    assert(rep_->index_builder);
+    estimated_tail_size += rep_->index_builder->CurrentIndexSizeEstimate();
+  }
+
+  // 2. Estimate filter size
+  if (rep_->filter_builder) {
+    estimated_tail_size += rep_->filter_builder->CurrentFilterSizeEstimate();
+  }
+
+  // 3. Estimate compression dictionary size
+  if (rep_->data_block_compressor) {
+    Slice dict = rep_->data_block_compressor->GetSerializedDict();
+    if (!dict.empty()) {
+      estimated_tail_size += dict.size();
+    }
+  }
+
+  // 4. Estimate range deletion block size
+  if (!rep_->range_del_block.empty()) {
+    estimated_tail_size += rep_->range_del_block.CurrentSizeEstimate();
+  }
+
+  // 5. Estimate properties block size conservatively (~1-2KB)
+  estimated_tail_size += 2048;
+
+  // 6. Estimate meta-index block size conservatively (~1KB)
+  estimated_tail_size += 1024;
+
+  // 7. Add footer size
+  estimated_tail_size += Footer::kMaxEncodedLength;
+
+  return estimated_tail_size;
+}
+
 uint64_t BlockBasedTableBuilder::GetTailSize() const { return rep_->tail_size; }
 
 bool BlockBasedTableBuilder::NeedCompact() const {
@@ -2201,7 +2954,4 @@ const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter.";
 const std::string BlockBasedTable::kPartitionedFilterBlockPrefix =
     "partitionedfilter.";
 
-#ifndef NDEBUG
-RelaxedAtomic<uint64_t> g_hack_mixed_compression_in_block_based_table{0};
-#endif  // !NDEBUG
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index 61f5ad78e5a5..0988f2b959ae 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -35,7 +35,6 @@ class WritableFile;
 struct BlockBasedTableOptions;
 
 extern const uint64_t kBlockBasedTableMagicNumber;
-extern const uint64_t kLegacyBlockBasedTableMagicNumber;
 
 class BlockBasedTableBuilder : public TableBuilder {
  public:
@@ -83,15 +82,21 @@ class BlockBasedTableBuilder : public TableBuilder {
 
   bool IsEmpty() const override;
 
+  uint64_t PreCompressionSize() const override;
+
   // Size of the file generated so far.  If invoked after a successful
   // Finish() call, returns the size of the final generated file.
   uint64_t FileSize() const override;
 
-  // Estimated size of the file generated so far. This is used when
-  // FileSize() cannot estimate final SST size, e.g. parallel compression
-  // is enabled.
+  // Estimated size of the file generated so far (based on data blocks, this
+  // estimate does not include meta blocks). This is used when FileSize() cannot
+  // estimate final SST size, e.g. parallel compression is enabled.
   uint64_t EstimatedFileSize() const override;
 
+  // Estimated tail size of the SST file generated so far. The "tail" refers to
+  // all blocks written after data blocks (index + filter).
+  uint64_t EstimatedTailSize() const override;
+
   // Get the size of the "tail" part of a SST file. "Tail" refers to
   // all blocks after data blocks till the end of the SST file.
   uint64_t GetTailSize() const override;
@@ -110,27 +115,41 @@ class BlockBasedTableBuilder : public TableBuilder {
   void SetSeqnoTimeTableProperties(const SeqnoToTimeMapping& relevant_mapping,
                                    uint64_t oldest_ancestor_time) override;
 
+  uint64_t GetWorkerCPUMicros() const override;
+
  private:
-  bool ok() const { return status().ok(); }
+  bool ok() const;
 
-  // Transition state from buffered to unbuffered. See `Rep::State` API comment
-  // for details of the states.
+  // Transition state from buffered to unbuffered if the conditions are met. See
+  // `Rep::State` API comment for details of the states.
   // REQUIRES: `rep_->state == kBuffered`
-  void EnterUnbuffered();
-
-  // Call block's Finish() method and then
-  // - in buffered mode, buffer the uncompressed block contents.
-  // - in unbuffered mode, write the compressed block contents to file.
-  void WriteBlock(BlockBuilder* block, BlockHandle* handle,
-                  BlockType blocktype);
-
-  // Compress and write block content to the file.
+  void MaybeEnterUnbuffered(const Slice* first_key_in_next_block);
+
+  // Try to keep some parallel-specific code separate to improve hot code
+  // locality for non-parallel case
+  void EmitBlock(std::string& uncompressed,
+                 const Slice& last_key_in_current_block,
+                 const Slice* first_key_in_next_block);
+  void EmitBlockForParallel(std::string& uncompressed,
+                            const Slice& last_key_in_current_block,
+                            const Slice* first_key_in_next_block);
+
+  // Compress and write block content to the file, from a single-threaded
+  // context
+  // @skip_delta_encoding : This is set to non null for data blocks, so that
+  //     caller would know whether the index entry of this data block should
+  //     skip delta encoding or not
   void WriteBlock(const Slice& block_contents, BlockHandle* handle,
-                  BlockType block_type);
+                  BlockType block_type, bool* skip_delta_encoding = nullptr);
   // Directly write data to the file.
-  void WriteMaybeCompressedBlock(
+  void WriteMaybeCompressedBlock(const Slice& block_contents, CompressionType,
+                                 BlockHandle* handle, BlockType block_type,
+                                 const Slice* uncompressed_block_data = nullptr,
+                                 bool* skip_delta_encoding = nullptr);
+  IOStatus WriteMaybeCompressedBlockImpl(
       const Slice& block_contents, CompressionType, BlockHandle* handle,
-      BlockType block_type, const Slice* uncompressed_block_data = nullptr);
+      BlockType block_type, const Slice* uncompressed_block_data = nullptr,
+      bool* skip_delta_encoding = nullptr);
 
   void SetupCacheKeyPrefix(const TableBuilderOptions& tbo);
 
@@ -158,58 +177,38 @@ class BlockBasedTableBuilder : public TableBuilder {
   struct Rep;
   class BlockBasedTablePropertiesCollectorFactory;
   class BlockBasedTablePropertiesCollector;
-  Rep* rep_;
-
+  std::unique_ptr<Rep> rep_;
+  struct WorkingAreaPair;
   struct ParallelCompressionRep;
 
   // Advanced operation: flush any buffered key/value pairs to file.
   // Can be used to ensure that two adjacent entries never live in
   // the same data block.  Most clients should not need to use this method.
   // REQUIRES: Finish(), Abandon() have not been called
-  void Flush();
+  void Flush(const Slice* first_key_in_next_block);
 
   // Some compression libraries fail when the uncompressed size is bigger than
   // int. If uncompressed size is bigger than kCompressionSizeLimit, don't
   // compress it
   const uint64_t kCompressionSizeLimit = std::numeric_limits<int>::max();
 
-  // Get blocks from mem-table walking thread, compress them and
-  // pass them to the write thread. Used in parallel compression mode only
-  void BGWorkCompression(const CompressionContext& compression_ctx,
-                         UncompressionContext* verify_ctx);
+  // Code for a "parallel compression" worker thread, which can really do SST
+  // writes and block compressions alternately.
+  void BGWorker(WorkingAreaPair& working_area);
 
   // Given uncompressed block content, try to compress it and return result and
   // compression type
-  void CompressAndVerifyBlock(const Slice& uncompressed_block_data,
-                              bool is_data_block,
-                              const CompressionContext& compression_ctx,
-                              UncompressionContext* verify_ctx,
-                              std::string* compressed_output,
-                              Slice* result_block_contents,
-                              CompressionType* result_compression_type,
-                              Status* out_status);
-
-  // Get compressed blocks from BGWorkCompression and write them into SST
-  void BGWorkWriteMaybeCompressedBlock();
-
-  // Initialize parallel compression context and
-  // start BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads
-  void StartParallelCompression();
-
-  // Stop BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads
-  void StopParallelCompression();
-};
+  Status CompressAndVerifyBlock(const Slice& uncompressed_block_data,
+                                bool is_data_block,
+                                WorkingAreaPair& working_area,
+                                GrowableBuffer* compressed_output,
+                                CompressionType* result_compression_type);
 
-Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
-                    CompressionType* type, uint32_t format_version,
-                    bool do_sample, std::string* compressed_output,
-                    std::string* sampled_output_fast,
-                    std::string* sampled_output_slow);
-
-#ifndef NDEBUG
-// 0 == disable the hack
-// > 0 => counter for rotating through compression types
-extern RelaxedAtomic<uint64_t> g_hack_mixed_compression_in_block_based_table;
-#endif
+  // If configured, start worker threads for parallel compression
+  void MaybeStartParallelCompression();
+
+  // Stop worker threads for parallel compression
+  void StopParallelCompression(bool abort);
+};
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 7add9fb16fcb..f90e95f36a06 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -20,11 +20,14 @@
 #include "options/options_helper.h"
 #include "port/port.h"
 #include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/flush_block_policy.h"
 #include "rocksdb/rocksdb_namespace.h"
 #include "rocksdb/table.h"
+#include "rocksdb/user_defined_index.h"
+#include "rocksdb/utilities/customizable_util.h"
 #include "rocksdb/utilities/options_type.h"
 #include "table/block_based/block_based_table_builder.h"
 #include "table/block_based/block_based_table_reader.h"
@@ -182,6 +185,12 @@ static std::unordered_map<std::string, BlockBasedTableOptions::IndexType>
         {"kBinarySearchWithFirstKey",
          BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}};
 
+static std::unordered_map<std::string, BlockBasedTableOptions::BlockSearchType>
+    block_base_table_index_search_type_string_map = {
+        {"kBinary", BlockBasedTableOptions::BlockSearchType::kBinary},
+        {"kInterpolation",
+         BlockBasedTableOptions::BlockSearchType::kInterpolation}};
+
 static std::unordered_map<std::string,
                           BlockBasedTableOptions::DataBlockIndexType>
     block_base_table_data_block_index_type_string_map = {
@@ -259,6 +268,10 @@ static struct BlockBasedTableTypeInfo {
         {"index_type", OptionTypeInfo::Enum<BlockBasedTableOptions::IndexType>(
                            offsetof(struct BlockBasedTableOptions, index_type),
                            &block_base_table_index_type_string_map)},
+        {"index_block_search_type",
+         OptionTypeInfo::Enum<BlockBasedTableOptions::BlockSearchType>(
+             offsetof(struct BlockBasedTableOptions, index_block_search_type),
+             &block_base_table_index_search_type_string_map)},
         {"hash_index_allow_collision",
          {0, OptionType::kBoolean, OptionVerificationType::kDeprecated}},
         {"data_block_index_type",
@@ -312,6 +325,11 @@ static struct BlockBasedTableTypeInfo {
          OptionTypeInfo::AsCustomSharedPtr<const FilterPolicy>(
              offsetof(struct BlockBasedTableOptions, filter_policy),
              OptionVerificationType::kByNameAllowFromNull)},
+        {"user_defined_index_factory",
+         OptionTypeInfo::AsCustomSharedPtr<UserDefinedIndexFactory>(
+             offsetof(struct BlockBasedTableOptions,
+                      user_defined_index_factory),
+             OptionVerificationType::kByNameAllowFromNull)},
         {"whole_key_filtering",
          {offsetof(struct BlockBasedTableOptions, whole_key_filtering),
           OptionType::kBoolean, OptionVerificationType::kNormal}},
@@ -357,6 +375,13 @@ static struct BlockBasedTableTypeInfo {
         {"block_align",
          {offsetof(struct BlockBasedTableOptions, block_align),
           OptionType::kBoolean, OptionVerificationType::kNormal}},
+        {"super_block_alignment_size",
+         {offsetof(struct BlockBasedTableOptions, super_block_alignment_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal}},
+        {"super_block_alignment_space_overhead_ratio",
+         {offsetof(struct BlockBasedTableOptions,
+                   super_block_alignment_space_overhead_ratio),
+          OptionType::kSizeT, OptionVerificationType::kNormal}},
         {"pin_top_level_index_and_filter",
          {offsetof(struct BlockBasedTableOptions,
                    pin_top_level_index_and_filter),
@@ -392,6 +417,9 @@ static struct BlockBasedTableTypeInfo {
          {offsetof(struct BlockBasedTableOptions,
                    num_file_reads_for_auto_readahead),
           OptionType::kUInt64T, OptionVerificationType::kNormal}},
+        {"fail_if_no_udi_on_open",
+         {offsetof(struct BlockBasedTableOptions, fail_if_no_udi_on_open),
+          OptionType::kBoolean, OptionVerificationType::kNormal}},
     };
   }
 } block_based_table_type_info;
@@ -427,10 +455,10 @@ void BlockBasedTableFactory::InitializeOptions() {
   if (table_options_.no_block_cache) {
     table_options_.block_cache.reset();
   } else if (table_options_.block_cache == nullptr) {
-    LRUCacheOptions co;
-    // 32MB, the recommended minimum size for 64 shards, to reduce contention
-    co.capacity = 32 << 20;
-    table_options_.block_cache = NewLRUCache(co);
+    // Now using AutoHCC by default, with existing default size of 32MB
+    // which is just one cache shard in HCC
+    HyperClockCacheOptions hcc_opts{size_t{32} << 20};
+    table_options_.block_cache = hcc_opts.MakeSharedCache();
   }
   if (table_options_.block_size_deviation < 0 ||
       table_options_.block_size_deviation > 100) {
@@ -467,6 +495,21 @@ void BlockBasedTableFactory::InitializeOptions() {
       options_overrides_iter->second.charged = options.charged;
     }
   }
+
+  if (table_options_.format_version < kMinSupportedBbtFormatVersionForWrite) {
+    // In TEST mode, allow writing format versions that are at least supported
+    // for reading (so that we have a way of testing the read side).
+    if (TEST_AllowUnsupportedFormatVersion()) {
+      if (table_options_.format_version <
+          kMinSupportedBbtFormatVersionForRead) {
+        table_options_.format_version = kMinSupportedBbtFormatVersionForWrite;
+      }
+    } else {
+      table_options_.format_version = kMinSupportedBbtFormatVersionForWrite;
+    }
+  }
+  // NOTE: do not sanitize too high format_version, so that it can be rejected
+  // in validation
 }
 
 Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) {
@@ -555,9 +598,11 @@ Status BlockBasedTableFactory::NewTableReader(
       file_size, table_reader_options.block_protection_bytes_per_key,
       table_reader, table_reader_options.tail_size,
       shared_state_->table_reader_cache_res_mgr,
-      table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache,
-      table_reader_options.skip_filters, table_reader_options.level,
-      table_reader_options.immortal, table_reader_options.largest_seqno,
+      table_reader_options.prefix_extractor,
+      table_reader_options.compression_manager,
+      prefetch_index_and_filter_in_cache, table_reader_options.skip_filters,
+      table_reader_options.level, table_reader_options.immortal,
+      table_reader_options.largest_seqno,
       table_reader_options.force_direct_prefetch,
       &shared_state_->tail_prefetch_stats,
       table_reader_options.block_cache_tracer,
@@ -582,6 +627,14 @@ Status BlockBasedTableFactory::ValidateOptions(
         "Hash index is specified for block-based "
         "table, but prefix_extractor is not given");
   }
+  if (table_options_.index_block_search_type ==
+      BlockBasedTableOptions::kInterpolation) {
+    // Interpolation search requires BytewiseComparator
+    if (cf_opts.comparator != BytewiseComparator()) {
+      return Status::InvalidArgument(
+          "Interpolation search requires BytewiseComparator");
+    }
+  }
   if (table_options_.cache_index_and_filter_blocks &&
       table_options_.no_block_cache) {
     return Status::InvalidArgument(
@@ -594,28 +647,71 @@ Status BlockBasedTableFactory::ValidateOptions(
         "Enable pin_l0_filter_and_index_blocks_in_cache, "
         ", but block cache is disabled");
   }
-  if (!IsSupportedFormatVersion(table_options_.format_version)) {
+  // In TEST mode, also allow writing
+  // (a) old format_versions that for users are only supported for reads
+  // (b) future "draft" format versions that are not yet published to users
+  if (!(IsSupportedFormatVersionForWrite(kBlockBasedTableMagicNumber,
+                                         table_options_.format_version) ||
+        (TEST_AllowUnsupportedFormatVersion() &&
+         table_options_.format_version >=
+             kMinSupportedBbtFormatVersionForRead))) {
     return Status::InvalidArgument(
         "Unsupported BlockBasedTable format_version. Please check "
         "include/rocksdb/table.h for more info");
   }
-  if (table_options_.block_align && (cf_opts.compression != kNoCompression)) {
-    return Status::InvalidArgument(
-        "Enable block_align, but compression "
-        "enabled");
-  }
-  if (table_options_.block_align &&
-      cf_opts.bottommost_compression != kDisableCompressionOption &&
-      cf_opts.bottommost_compression != kNoCompression) {
-    return Status::InvalidArgument(
-        "Enable block_align, but bottommost_compression enabled");
+  bool using_builtin_compatible_compression = true;
+  if (cf_opts.compression_manager &&
+      strcmp(cf_opts.compression_manager->CompatibilityName(),
+             GetBuiltinV2CompressionManager()->CompatibilityName()) != 0) {
+    if (FormatVersionUsesCompressionManagerName(
+            table_options_.format_version)) {
+      using_builtin_compatible_compression = false;
+    } else {
+      return Status::InvalidArgument(
+          "Using a CompressionManager incompatible with built-in (custom "
+          "CompatibilityName()) is not supported for format_version < 7");
+    }
   }
-  if (table_options_.block_align) {
-    for (auto level_compression : cf_opts.compression_per_level) {
-      if (level_compression != kDisableCompressionOption &&
-          level_compression != kNoCompression) {
+  auto validate_compression_type_fn = [&](CompressionType ctype,
+                                          const char* context) {
+    if (ctype == kNoCompression) {
+      return Status::OK();
+    }
+    if (ctype == kDisableCompressionOption) {
+      if (strcmp(context, "compression") == 0) {
         return Status::InvalidArgument(
-            "Enable block_align, but compression_per_level enabled");
+            "kDisableCompressionOption not permitted for option: "
+            "compression");
+      } else {
+        return Status::OK();
+      }
+    }
+    if (table_options_.block_align) {
+      return Status::InvalidArgument("Enable block_align, but " +
+                                     std::string(context) + " enabled");
+    }
+    if (ctype > kLastBuiltinCompression &&
+        using_builtin_compatible_compression) {
+      return Status::InvalidArgument(
+          "Using a CompressionType other than built-in ...");  // TODO
+    }
+    // Otherwise
+    return Status::OK();
+  };
+  {
+    Status s = validate_compression_type_fn(cf_opts.compression, "compression");
+    if (!s.ok()) {
+      return s;
+    }
+    s = validate_compression_type_fn(cf_opts.bottommost_compression,
+                                     "bottommost_compression");
+    if (!s.ok()) {
+      return s;
+    }
+    for (auto ctype : cf_opts.compression_per_level) {
+      s = validate_compression_type_fn(ctype, "compression_per_level");
+      if (!s.ok()) {
+        return s;
       }
     }
   }
@@ -628,6 +724,22 @@ Status BlockBasedTableFactory::ValidateOptions(
     return Status::InvalidArgument(
         "block size exceeds maximum number (4GiB) allowed");
   }
+  if ((table_options_.super_block_alignment_size &
+       (table_options_.super_block_alignment_size - 1))) {
+    return Status::InvalidArgument(
+        "Super Block alignment requested but super block alignment size is not "
+        "a power of 2");
+  }
+  if (table_options_.super_block_alignment_size >
+      std::numeric_limits<uint32_t>::max()) {
+    return Status::InvalidArgument(
+        "Super block alignment size exceeds maximum number (4GiB) allowed");
+  }
+  if (table_options_.super_block_alignment_space_overhead_ratio > 0 &&
+      table_options_.super_block_alignment_space_overhead_ratio < 4) {
+    return Status::InvalidArgument(
+        "Super block alignment space overhead is too high");
+  }
   if (table_options_.data_block_index_type ==
           BlockBasedTableOptions::kDataBlockBinaryAndHash &&
       table_options_.data_block_hash_table_util_ratio <= 0) {
@@ -635,6 +747,12 @@ Status BlockBasedTableFactory::ValidateOptions(
         "data_block_hash_table_util_ratio should be greater than 0 when "
         "data_block_index_type is set to kDataBlockBinaryAndHash");
   }
+  if (table_options_.user_defined_index_factory &&
+      (cf_opts.compression_opts.parallel_threads > 1 ||
+       cf_opts.bottommost_compression_opts.parallel_threads > 1)) {
+    return Status::InvalidArgument(
+        "user_defined_index_factory not supported with parallel compression");
+  }
   if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) {
     // TODO(myabandeh): support it
     return Status::InvalidArgument(
@@ -806,6 +924,14 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const {
                ? "nullptr"
                : table_options_.filter_policy->Name());
   ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  user_defined_index_factory: %s\n",
+           table_options_.user_defined_index_factory == nullptr
+               ? "nullptr"
+               : table_options_.user_defined_index_factory->Name());
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  fail_if_no_udi_on_open: %d\n",
+           table_options_.fail_if_no_udi_on_open);
+  ret.append(buffer);
   snprintf(buffer, kBufferSize, "  whole_key_filtering: %d\n",
            table_options_.whole_key_filtering);
   ret.append(buffer);
@@ -824,6 +950,15 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const {
   snprintf(buffer, kBufferSize, "  block_align: %d\n",
            table_options_.block_align);
   ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  super_block_alignment_size: %" ROCKSDB_PRIszt "\n",
+           table_options_.super_block_alignment_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  super_block_alignment_space_overhead_ratio: %" ROCKSDB_PRIszt
+           "\n",
+           table_options_.super_block_alignment_space_overhead_ratio);
+  ret.append(buffer);
   snprintf(buffer, kBufferSize,
            "  max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
            table_options_.max_auto_readahead_size);
@@ -950,6 +1085,13 @@ TableFactory* NewBlockBasedTableFactory(
   return new BlockBasedTableFactory(_table_options);
 }
 
+Status UserDefinedIndexFactory::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<UserDefinedIndexFactory>* factory) {
+  return LoadSharedObject<UserDefinedIndexFactory>(config_options, value,
+                                                   factory);
+}
+
 const std::string BlockBasedTablePropertyNames::kIndexType =
     "rocksdb.block.based.table.index.type";
 const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering =
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 3f55f82a77a5..e0e51469f6f3 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -37,6 +37,14 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
                                        bool async_prefetch) {
   // TODO(hx235): set `seek_key_prefix_for_readahead_trimming_`
   // even when `target == nullptr` that is when `SeekToFirst()` is called
+  if (!multi_scan_status_.ok()) {
+    return;
+  }
+  if (multi_scan_) {
+    SeekMultiScan(target);
+    return;
+  }
+
   if (target != nullptr && prefix_extractor_ &&
       read_options_.prefix_same_as_start) {
     const Slice& seek_user_key = ExtractUserKey(*target);
@@ -56,7 +64,7 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
   ResetBlockCacheLookupVar();
 
   bool autotune_readaheadsize =
-      is_first_pass && read_options_.auto_readahead_size &&
+      read_options_.auto_readahead_size &&
       (read_options_.iterate_upper_bound || read_options_.prefix_same_as_start);
 
   if (autotune_readaheadsize &&
@@ -181,6 +189,7 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
 }
 
 void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
+  multi_scan_.reset();
   direction_ = IterDirection::kBackward;
   ResetBlockCacheLookupVar();
   is_out_of_bound_ = false;
@@ -255,6 +264,7 @@ void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
 }
 
 void BlockBasedTableIterator::SeekToLast() {
+  multi_scan_.reset();
   direction_ = IterDirection::kBackward;
   ResetBlockCacheLookupVar();
   is_out_of_bound_ = false;
@@ -278,7 +288,9 @@ void BlockBasedTableIterator::SeekToLast() {
 }
 
 void BlockBasedTableIterator::Next() {
+  assert(Valid());
   if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
+    assert(!multi_scan_);
     return;
   }
   assert(block_iter_points_to_real_block_);
@@ -299,7 +311,9 @@ bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) {
 }
 
 void BlockBasedTableIterator::Prev() {
-  if (readahead_cache_lookup_ && !IsIndexAtCurr()) {
+  assert(!multi_scan_);
+  if ((readahead_cache_lookup_ && !IsIndexAtCurr()) || multi_scan_) {
+    multi_scan_.reset();
     // In case of readahead_cache_lookup_, index_iter_ has moved forward. So we
     // need to reseek the index_iter_ to point to current block by using
     // block_iter_'s key.
@@ -566,6 +580,10 @@ void BlockBasedTableIterator::FindKeyForward() {
 }
 
 void BlockBasedTableIterator::FindBlockForward() {
+  if (multi_scan_) {
+    FindBlockForwardInMultiScan();
+    return;
+  }
   // TODO the while loop inherits from two-level-iterator. We don't know
   // whether a block can be empty so it can be replaced by an "if".
   do {
@@ -749,7 +767,7 @@ void BlockBasedTableIterator::InitializeStartAndEndOffsets(
       // It can be when Reseek is from block cache (which doesn't clear the
       // buffers in FilePrefetchBuffer but clears block handles from queue) and
       // reseek also lies within the buffer. So Next will get data from
-      // exisiting buffers untill this callback is made to prefetch additional
+      // existing buffers until this callback is made to prefetch additional
       // data. All handles need to be added to the queue starting from
       // index_iter_.
       assert(index_iter_->Valid());
@@ -901,4 +919,505 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
   ResetPreviousBlockOffset();
 }
 
+// Note:
+// - Iterator should not be reused for multiple multiscans or mixing
+// multiscan with regular iterator usage.
+// - scan ranges should be non-overlapping, and have increasing start keys.
+// If a scan range's limit is not set, then there should only be one scan range.
+// - After Prepare(), the iterator expects Seek to be called on the start key
+// of each ScanOption in order. If any other Seek is done, an error status is
+// returned
+// - Whenever all blocks of a scan opt are exhausted, the iterator will become
+// invalid and UpperBoundCheckResult() will return kOutOfBound. So that the
+// upper layer (LevelIterator) will stop scanning instead thinking EOF is
+// reached and continue into the next file. The only exception is for the last
+// scan opt. If we reach the end of the last scan opt, UpperBoundCheckResult()
+// will return kUnknown instead of kOutOfBound. This mechanism requires that
+// scan opts are properly pruned such that there is no scan opt that is after
+// this file's key range.
+// FIXME: DBIter and MergingIterator may
+// internally do Seek() on child iterators, e.g. due to
+// ReadOptions::max_skippable_internal_keys or reseeking into range deletion
+// end key. These Seeks will be handled properly, as long as the target is
+// moving forward.
+void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
+  assert(!multi_scan_);
+  RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_CALLS);
+  StopWatch sw(table_->get_rep()->ioptions.clock, table_->GetStatistics(),
+               MULTISCAN_PREPARE_MICROS);
+
+  if (!index_iter_->status().ok()) {
+    multi_scan_status_ = index_iter_->status();
+    RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
+    return;
+  }
+  if (multi_scan_) {
+    multi_scan_.reset();
+    multi_scan_status_ = Status::InvalidArgument("Prepare already called");
+    RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
+    return;
+  }
+
+  index_iter_->Prepare(multiscan_opts);
+
+  std::vector<BlockHandle> scan_block_handles;
+  std::vector<std::string> data_block_separators;
+  std::vector<std::tuple<size_t, size_t>> block_index_ranges_per_scan;
+  const std::vector<ScanOptions>& scan_opts = multiscan_opts->GetScanRanges();
+  multi_scan_status_ =
+      CollectBlockHandles(scan_opts, &scan_block_handles,
+                          &block_index_ranges_per_scan, &data_block_separators);
+  if (!multi_scan_status_.ok()) {
+    RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
+    return;
+  }
+
+  // Calculate prefetch_max_idx (enforces max_prefetch_size)
+  size_t prefetch_max_idx = scan_block_handles.size();
+  if (multiscan_opts->max_prefetch_size > 0) {
+    uint64_t total_size = 0;
+    for (size_t i = 0; i < scan_block_handles.size(); ++i) {
+      total_size +=
+          BlockBasedTable::BlockSizeWithTrailer(scan_block_handles[i]);
+      if (total_size > multiscan_opts->max_prefetch_size) {
+        prefetch_max_idx = i;
+        break;
+      }
+    }
+  }
+
+  // Create block handles vector for IODispatcher (limited to prefetch_max_idx)
+  std::vector<BlockHandle> blocks_to_prefetch;
+  if (prefetch_max_idx > 0) {
+    blocks_to_prefetch.assign(scan_block_handles.begin(),
+                              scan_block_handles.begin() + prefetch_max_idx);
+  }
+
+  // Submit to IODispatcher
+  auto job = std::make_shared<IOJob>();
+  job->table = const_cast<BlockBasedTable*>(table_);
+  job->block_handles = std::move(blocks_to_prefetch);
+  job->job_options.io_coalesce_threshold =
+      multiscan_opts->io_coalesce_threshold;
+  job->job_options.read_options = read_options_;
+  job->job_options.read_options.async_io = multiscan_opts->use_async_io;
+
+  std::shared_ptr<ReadSet> read_set;
+  // IODispatcher should be provided by DBIter::Prepare() to enable sharing
+  // across all BlockBasedTableIterators in the scan. Create one if not
+  // provided (for direct calls to Prepare, e.g., in unit tests).
+  std::shared_ptr<IODispatcher> dispatcher = multiscan_opts->io_dispatcher;
+  if (!dispatcher) {
+    dispatcher.reset(NewIODispatcher());
+  }
+  multi_scan_status_ = dispatcher->SubmitJob(job, &read_set);
+  if (!multi_scan_status_.ok()) {
+    RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
+    return;
+  }
+
+  // Successful Prepare, init related states so the iterator reads from prepared
+  // blocks. Note: data_block_separators keeps full size for seek logic.
+  multi_scan_ = std::make_unique<MultiScanState>(
+      table_->get_rep()->ioptions.env->GetFileSystem(), multiscan_opts,
+      std::move(read_set), std::move(data_block_separators),
+      std::move(block_index_ranges_per_scan), prefetch_max_idx,
+      table_->GetStatistics());
+
+  is_index_at_curr_block_ = false;
+  block_iter_points_to_real_block_ = false;
+}
+
+void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
+  assert(multi_scan_ && multi_scan_status_.ok());
+  // This is a MultiScan and Prepare() has been called.
+
+  // Reset out of bound on seek, if it is out of bound again, it will be set
+  // properly later in the code path
+  is_out_of_bound_ = false;
+
+  // Validate seek key with scan options
+  if (!seek_target) {
+    // start key must be set for multi-scan
+    multi_scan_status_ = Status::InvalidArgument("No seek key for MultiScan");
+    RecordTick(table_->GetStatistics(), MULTISCAN_SEEK_ERRORS);
+    return;
+  }
+
+  // Check the case where there is no range prepared on this table
+  if (multi_scan_->scan_opts->size() == 0) {
+    // out of bound
+    MarkPreparedRangeExhausted();
+    return;
+  }
+
+  // Check whether seek key is moving forward.
+  if (multi_scan_->prev_seek_key_.empty() ||
+      icomp_.Compare(*seek_target, multi_scan_->prev_seek_key_) > 0) {
+    // If seek key is empty or is larger than previous seek key, update the
+    // previous seek key. Otherwise use the previous seek key as the adjusted
+    // seek target moving forward. This prevents seek target going backward,
+    // which would visit pages that have been unpinned.
+    // This issue is caused by sub-optimal range delete handling inside merge
+    // iterator.
+    // TODO xingbo issues:14068 : Optimize the handling of range delete iterator
+    // inside merge iterator, so that it doesn't move seek key backward. After
+    // that we could return error if the key moves backward here.
+    multi_scan_->prev_seek_key_ = seek_target->ToString();
+  } else {
+    // Seek key is adjusted to previous one, we can return here directly.
+    return;
+  }
+
+  // There are 3 different Cases we need to handle:
+  // The following diagram explain different seek targets seeking at various
+  // position on the table, while the next_scan_idx points to the PreparedRange
+  // 2.
+  //
+  // next_scan_idx: -------------------┐
+  //                                   ▼
+  // table:     : __[PreparedRange 1]__[PreparedRange 2]__[PreparedRange 3]__
+  // Seek target: <----- Case 1 ------>▲<------------- Case 2 -------------->
+  //                                   │
+  //                                 Case 3
+  //
+  // Case 1: seek before the start of next prepared ranges. This could happen
+  //    due to too many delete tomestone triggered reseek or delete range.
+  // Case 2: seek after the start of next prepared range.
+  //    This could happen due to seek key adjustment from delete range file.
+  //    E.g. LSM has 3 levels, each level has only 1 file:
+  //    L1 : key :              0---10
+  //    L2 : Delete range key : 0-5
+  //    L3 : key :              0---10
+  //    When a range 2-8 was prepared, the prepared key would be 2 on L3 file,
+  //    but the seek key would be 5, as the seek key was updated by the largest
+  //    key of delete range. This causes all of the cases above to be possible,
+  //    when the ranges are adjusted in the above examples.
+  // Case 3: seek at the beginning of a prepared range (expected case)
+
+  // Allow reseek on the start of the last prepared range due to too many
+  // tombstone
+  multi_scan_->next_scan_idx =
+      std::min(multi_scan_->next_scan_idx,
+               multi_scan_->block_index_ranges_per_scan.size() - 1);
+
+  auto user_seek_target = ExtractUserKey(*seek_target);
+
+  auto compare_next_scan_start_result =
+      user_comparator_.CompareWithoutTimestamp(
+          user_seek_target, /*a_has_ts=*/true,
+          multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
+              .range.start.value(),
+          /*b_has_ts=*/false);
+
+  if (compare_next_scan_start_result != 0) {
+    // The seek target is not exactly same as what was prepared.
+    if (compare_next_scan_start_result < 0) {
+      // Case 1:
+      if (multi_scan_->next_scan_idx == 0) {
+        // This should not happen, even when seek target is adjusted by delete
+        // range. The reason is that if the seek target is before the start key
+        // of the first prepared range, its end key needs to be >= the smallest
+        // key of this file, otherwise it is skipped in level iterator. If its
+        // end key is >= the smallest key of this file, then this range will be
+        // prepared for this file. As delete range could only adjust seek
+        // target forward, so it would never be before the start key of the
+        // first prepared range.
+        assert(false && "Seek target before the first prepared range");
+        MarkPreparedRangeExhausted();
+        return;
+      }
+      auto seek_target_before_previous_prepared_range =
+          user_comparator_.CompareWithoutTimestamp(
+              user_seek_target, /*a_has_ts=*/true,
+              multi_scan_->scan_opts
+                  ->GetScanRanges()[multi_scan_->next_scan_idx - 1]
+                  .range.start.value(),
+              /*b_has_ts=*/false) < 0;
+      // Not expected to happen
+      // This should never happen, the reason is that the
+      // multi_scan_->next_scan_idx is set to a non zero value is due to a seek
+      // target larger or equal to the start key of multi_scan_->next_scan_idx-1
+      // happened earlier. If a seek happens before the start key of
+      // multi_scan_->next_scan_idx-1, it would seek a key that is less than
+      // what was seeked before.
+      assert(!seek_target_before_previous_prepared_range);
+      if (seek_target_before_previous_prepared_range) {
+        multi_scan_status_ = Status::InvalidArgument(
+            "Seek target is before the previous prepared range at index " +
+            std::to_string(multi_scan_->next_scan_idx));
+        RecordTick(table_->GetStatistics(), MULTISCAN_SEEK_ERRORS);
+        return;
+      }
+      // It should only be possible to seek a key between the start of current
+      // prepared scan and start of next prepared range.
+      MultiScanUnexpectedSeekTarget(seek_target, &user_seek_target);
+    } else {
+      // Case 2:
+      MultiScanUnexpectedSeekTarget(seek_target, &user_seek_target);
+    }
+  } else {
+    // Case 2:
+    assert(multi_scan_->next_scan_idx <
+           multi_scan_->block_index_ranges_per_scan.size());
+
+    auto [cur_scan_start_idx, cur_scan_end_idx] =
+        multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx];
+    // We should have the data block already loaded
+    ++multi_scan_->next_scan_idx;
+    if (cur_scan_start_idx >= cur_scan_end_idx) {
+      // No blocks are prepared for this range at current file.
+      MarkPreparedRangeExhausted();
+      return;
+    }
+
+    // max_sequential_skip_in_iterations can trigger a reseek on the start
+    // key of a scan range, even though the multiscan is already past
+    // `cur_scan_start_idx` (e.g., a user key spans multiple data blocks).
+    size_t block_idx =
+        std::max(cur_scan_start_idx, multi_scan_->cur_data_block_idx);
+    MultiScanSeekTargetFromBlock(seek_target, block_idx);
+  }
+}
+
+void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
+    const Slice* seek_target, const Slice* user_seek_target) {
+  // linear search the block that contains the seek target, and unpin blocks
+  // that are before it.
+
+  // The logic here could be confusing when there is a delete range involved.
+  // E.g. we have an LSM with 3 levels, each level has only 1 file:
+  // L1: data file :    0---10
+  // L2: Delete range : 0-5
+  // L3: data file :    0---10
+  //
+  // MultiScan on ranges 1-2, 3-4, and 5-6.
+  // When user first do Seek(1), on level 2, due to delete range 0-5, the seek
+  // key is adjusted to 5 at level 3. Therefore, we will internally do Seek(5)
+  // and unpins all blocks until 5 at level 3. Then the next scan's blocks from
+  // 3-4 are unpinned at level 3. It is confusing that maybe block 3-4 should
+  // not be unpinned, as next scan would need it. But Seek(5) implies that these
+  // keys are all covered by some range deletion, so the next Seek(3) will also
+  // do Seek(5) internally, so the blocks from 3-4 could be safely unpinned.
+
+  // advance to the right prepared range
+  while (
+      multi_scan_->next_scan_idx <
+          multi_scan_->block_index_ranges_per_scan.size() &&
+      (user_comparator_.CompareWithoutTimestamp(
+           *user_seek_target, /*a_has_ts=*/true,
+           multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
+               .range.start.value(),
+           /*b_has_ts=*/false) >= 0)) {
+    multi_scan_->next_scan_idx++;
+  }
+
+  // next_scan_idx is guaranteed to be higher than 0. If the seek key is before
+  // the start key of first prepared range, it is already handled by caller
+  // SeekMultiScan. It is equal, it would not call this funciton. If it is
+  // after, next_scan_idx would be advanced by the loop above.
+  assert(multi_scan_->next_scan_idx > 0);
+  // Get the current range
+  auto cur_scan_idx = multi_scan_->next_scan_idx - 1;
+  auto [cur_scan_start_idx, cur_scan_end_idx] =
+      multi_scan_->block_index_ranges_per_scan[cur_scan_idx];
+
+  if (cur_scan_start_idx >= cur_scan_end_idx) {
+    // No blocks are prepared for this range at current file.
+    MarkPreparedRangeExhausted();
+    return;
+  }
+
+  // Unpin all the blocks from multi_scan_->cur_data_block_idx to
+  // cur_scan_start_idx - these are wasted (prefetched but skipped)
+  for (auto unpin_block_idx = multi_scan_->cur_data_block_idx;
+       unpin_block_idx < cur_scan_start_idx; unpin_block_idx++) {
+    // Count as wasted if it was prefetched
+    if (unpin_block_idx < multi_scan_->prefetch_max_idx) {
+      multi_scan_->wasted_blocks_count++;
+    }
+    multi_scan_->read_set->ReleaseBlock(unpin_block_idx);
+  }
+
+  // Take the max here to ensure we don't move backwards.
+  size_t block_idx =
+      std::max(cur_scan_start_idx, multi_scan_->cur_data_block_idx);
+  auto const& data_block_separators = multi_scan_->data_block_separators;
+  while (block_idx < data_block_separators.size() &&
+         (user_comparator_.CompareWithoutTimestamp(
+              *user_seek_target, /*a_has_ts=*/true,
+              data_block_separators[block_idx],
+              /*b_has_ts=*/false) > 0)) {
+    // Unpin the blocks that are passed - count as wasted if prefetched
+    if (block_idx < multi_scan_->prefetch_max_idx) {
+      multi_scan_->wasted_blocks_count++;
+    }
+    multi_scan_->read_set->ReleaseBlock(block_idx);
+    block_idx++;
+  }
+
+  if (block_idx >= data_block_separators.size()) {
+    // All of the prepared blocks for this file is exhausted.
+    MarkPreparedRangeExhausted();
+    return;
+  }
+
+  // The current block may contain the data for the target key
+  MultiScanSeekTargetFromBlock(seek_target, block_idx);
+}
+
+void BlockBasedTableIterator::MultiScanSeekTargetFromBlock(
+    const Slice* seek_target, size_t block_idx) {
+  assert(multi_scan_->cur_data_block_idx <= block_idx);
+
+  if (!block_iter_points_to_real_block_ ||
+      multi_scan_->cur_data_block_idx != block_idx) {
+    if (block_iter_points_to_real_block_) {
+      // Should be scan in increasing key range.
+      // All blocks before cur_data_block_idx_ are not pinned anymore.
+      assert(multi_scan_->cur_data_block_idx < block_idx);
+    }
+
+    ResetDataIter();
+
+    if (MultiScanLoadDataBlock(block_idx)) {
+      return;
+    }
+  }
+
+  // Move current data block index forward until block_idx, meantime, unpin all
+  // the blocks in between - these are wasted (prefetched but skipped)
+  while (multi_scan_->cur_data_block_idx < block_idx) {
+    // Count as wasted if it was prefetched
+    if (multi_scan_->cur_data_block_idx < multi_scan_->prefetch_max_idx) {
+      multi_scan_->wasted_blocks_count++;
+    }
+    multi_scan_->read_set->ReleaseBlock(multi_scan_->cur_data_block_idx);
+    multi_scan_->cur_data_block_idx++;
+  }
+  block_iter_points_to_real_block_ = true;
+  block_iter_.Seek(*seek_target);
+  FindKeyForward();
+  CheckOutOfBound();
+}
+
+void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
+  assert(multi_scan_);
+  assert(multi_scan_->next_scan_idx >= 1);
+  const auto cur_scan_end_idx = std::get<1>(
+      multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx - 1]);
+  do {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+
+    // If is_out_of_bound_ is true, upper layer (LevelIterator) considers this
+    // level has reached iterate_upper_bound_ and will not continue to iterate
+    // into the next file. When we are doing the last scan within a MultiScan
+    // for this file, it may need to continue to scan into the next file, so
+    // we do not set is_out_of_bound_ in this case.
+    if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
+      MarkPreparedRangeExhausted();
+      return;
+    }
+    // Move to the next pinned data block
+    ResetDataIter();
+    // Unpin previous block via ReadSet
+    multi_scan_->read_set->ReleaseBlock(multi_scan_->cur_data_block_idx);
+    ++multi_scan_->cur_data_block_idx;
+
+    if (MultiScanLoadDataBlock(multi_scan_->cur_data_block_idx)) {
+      return;
+    }
+
+    block_iter_points_to_real_block_ = true;
+    block_iter_.SeekToFirst();
+  } while (!block_iter_.Valid());
+}
+
+constexpr auto kVerbose = false;
+
+Status BlockBasedTableIterator::CollectBlockHandles(
+    const std::vector<ScanOptions>& scan_opts,
+    std::vector<BlockHandle>* scan_block_handles,
+    std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan,
+    std::vector<std::string>* data_block_separators) {
+  // print file name and level
+  if (UNLIKELY(kVerbose)) {
+    auto file_name = table_->get_rep()->file->file_name();
+    auto level = table_->get_rep()->level;
+    printf("file name : %s, level %d\n", file_name.c_str(), level);
+  }
+  for (const auto& scan_opt : scan_opts) {
+    size_t num_blocks = 0;
+    bool check_overlap = !scan_block_handles->empty();
+
+    InternalKey start_key;
+    const size_t timestamp_size =
+        user_comparator_.user_comparator()->timestamp_size();
+    if (timestamp_size == 0) {
+      start_key = InternalKey(scan_opt.range.start.value(), kMaxSequenceNumber,
+                              kValueTypeForSeek);
+    } else {
+      std::string seek_key;
+      AppendKeyWithMaxTimestamp(&seek_key, scan_opt.range.start.value(),
+                                timestamp_size);
+      start_key = InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek);
+    }
+    index_iter_->Seek(start_key.Encode());
+    while (index_iter_->status().ok() && index_iter_->Valid() &&
+           (!scan_opt.range.limit.has_value() ||
+            user_comparator_.CompareWithoutTimestamp(index_iter_->user_key(),
+                                                     /*a_has_ts*/ true,
+                                                     *scan_opt.range.limit,
+                                                     /*b_has_ts=*/false) < 0)) {
+      // Only add the block if the index separator is smaller than limit. When
+      // they are equal or larger, it will be handled later below.
+      if (check_overlap &&
+          scan_block_handles->back() == index_iter_->value().handle) {
+        // Skip the current block since it's already in the list
+      } else {
+        scan_block_handles->push_back(index_iter_->value().handle);
+        // clone the Slice to avoid the lifetime issue
+        data_block_separators->push_back(index_iter_->user_key().ToString());
+      }
+      ++num_blocks;
+      index_iter_->Next();
+      check_overlap = false;
+    }
+
+    if (!index_iter_->status().ok()) {
+      // Abort: index iterator error
+      return index_iter_->status();
+    }
+
+    if (index_iter_->Valid()) {
+      // Handle the last block when its separator is equal or larger than limit
+      if (check_overlap &&
+          scan_block_handles->back() == index_iter_->value().handle) {
+        // Skip adding the current block since it's already in the list
+      } else {
+        scan_block_handles->push_back(index_iter_->value().handle);
+        data_block_separators->push_back(index_iter_->user_key().ToString());
+      }
+      ++num_blocks;
+    }
+    block_index_ranges_per_scan->emplace_back(
+        scan_block_handles->size() - num_blocks, scan_block_handles->size());
+    if (UNLIKELY(kVerbose)) {
+      printf("separators :");
+      for (const auto& separator : *data_block_separators) {
+        printf("%s, ", separator.c_str());
+      }
+      printf("\nblock_index_ranges_per_scan :");
+      for (auto const& block_index_range : *block_index_ranges_per_scan) {
+        printf("[%zu, %zu], ", std::get<0>(block_index_range),
+               std::get<1>(block_index_range));
+      }
+      printf("\n");
+    }
+  }
+  return Status::OK();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index d49224de4ac2..d7c4d409305b 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -10,6 +10,7 @@
 #include <deque>
 
 #include "db/seqno_to_time_mapping.h"
+#include "rocksdb/io_dispatcher.h"
 #include "table/block_based/block_based_table_reader.h"
 #include "table/block_based/block_based_table_reader_impl.h"
 #include "table/block_based/block_prefetcher.h"
@@ -41,11 +42,13 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
             compaction_readahead_size,
             table_->get_rep()->table_options.initial_auto_readahead_size),
         allow_unprepared_value_(allow_unprepared_value),
-        block_iter_points_to_real_block_(false),
         check_filter_(check_filter),
         need_upper_bound_check_(need_upper_bound_check),
         async_read_in_progress_(false),
-        is_last_level_(table->IsLastLevel()) {}
+        is_last_level_(table->IsLastLevel()),
+        block_iter_points_to_real_block_(false) {
+    multi_scan_status_.PermitUncheckedError();
+  }
 
   ~BlockBasedTableIterator() override { ClearBlockHandles(); }
 
@@ -57,7 +60,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   bool NextAndGetResult(IterateResult* result) override;
   void Prev() override;
   bool Valid() const override {
-    return !is_out_of_bound_ &&
+    return !is_out_of_bound_ && multi_scan_status_.ok() &&
            (is_at_first_key_from_index_ ||
             (block_iter_points_to_real_block_ && block_iter_.Valid()));
   }
@@ -69,6 +72,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   Slice key() const override {
     assert(Valid());
     if (is_at_first_key_from_index_) {
+      assert(!multi_scan_);
       return index_iter_->value().first_internal_key;
     } else {
       return block_iter_.key();
@@ -135,17 +139,25 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     return block_iter_.value();
   }
   Status status() const override {
+    if (!multi_scan_status_.ok()) {
+      return multi_scan_status_;
+    }
     // In case of block cache readahead lookup, it won't add the block to
     // block_handles if it's index is invalid. So index_iter_->status check can
     // be skipped.
     // Prefix index set status to NotFound when the prefix does not exist.
     if (IsIndexAtCurr() && !index_iter_->status().ok() &&
         !index_iter_->status().IsNotFound()) {
+      assert(!multi_scan_);
       return index_iter_->status();
     } else if (block_iter_points_to_real_block_) {
+      // This is the common case.
       return block_iter_.status();
     } else if (async_read_in_progress_) {
+      assert(!multi_scan_);
       return Status::TryAgain("Async read in progress");
+    } else if (multi_scan_) {
+      return multi_scan_status_;
     } else {
       return Status::OK();
     }
@@ -157,6 +169,8 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     } else if (block_upper_bound_check_ ==
                BlockUpperBound::kUpperBoundBeyondCurBlock) {
       assert(!is_out_of_bound_);
+      // MultiScan does not do block level upper bound check yet.
+      assert(!multi_scan_);
       return IterBoundCheck::kInbound;
     } else {
       return IterBoundCheck::kUnknown;
@@ -222,12 +236,21 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     }
   }
 
+  void Prepare(const MultiScanArgs* scan_opts) override;
+
   FilePrefetchBuffer* prefetch_buffer() {
     return block_prefetcher_.prefetch_buffer();
   }
 
   std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_;
 
+  bool TEST_IsBlockPinnedByMultiScan(size_t block_idx) {
+    if (!multi_scan_ || !multi_scan_->read_set) {
+      return false;
+    }
+    return multi_scan_->read_set->IsBlockAvailable(block_idx);
+  }
+
  private:
   enum class IterDirection {
     kForward,
@@ -308,12 +331,20 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
 
   BlockPrefetcher block_prefetcher_;
 
+  // It stores all the block handles that are lookuped in cache ahead when
+  // BlockCacheLookupForReadAheadSize is called. Since index_iter_ may point to
+  // different blocks when readahead_size is calculated in
+  // BlockCacheLookupForReadAheadSize, to avoid index_iter_ reseek,
+  // block_handles_ is used.
+  // `block_handles_` is lazily constructed to save CPU when it is unused
+  std::unique_ptr<std::deque<BlockHandleInfo>> block_handles_;
+
+  // The prefix of the key called with SeekImpl().
+  // This is for readahead trimming so no data blocks containing keys of a
+  // different prefix are prefetched
+  std::string seek_key_prefix_for_readahead_trimming_ = "";
+
   const bool allow_unprepared_value_;
-  // True if block_iter_ is initialized and points to the same block
-  // as index iterator.
-  bool block_iter_points_to_real_block_;
-  // See InternalIteratorBase::IsOutOfBound().
-  bool is_out_of_bound_ = false;
   // How current data block's boundary key with the next block is compared with
   // iterate upper bound.
   BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown;
@@ -333,18 +364,6 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   // size based on cache hit and miss.
   bool readahead_cache_lookup_ = false;
 
-  // It stores all the block handles that are lookuped in cache ahead when
-  // BlockCacheLookupForReadAheadSize is called. Since index_iter_ may point to
-  // different blocks when readahead_size is calculated in
-  // BlockCacheLookupForReadAheadSize, to avoid index_iter_ reseek,
-  // block_handles_ is used.
-  // `block_handles_` is lazily constructed to save CPU when it is unused
-  std::unique_ptr<std::deque<BlockHandleInfo>> block_handles_;
-
-  // During cache lookup to find readahead size, index_iter_ is iterated and it
-  // can point to a different block. is_index_at_curr_block_ keeps track of
-  // that.
-  bool is_index_at_curr_block_ = true;
   bool is_index_out_of_bound_ = false;
 
   // Used in case of auto_readahead_size to disable the block_cache lookup if
@@ -353,10 +372,99 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   // is used to disable the lookup.
   IterDirection direction_ = IterDirection::kForward;
 
-  // The prefix of the key called with SeekImpl().
-  // This is for readahead trimming so no data blocks containing keys of a
-  // different prefix are prefetched
-  std::string seek_key_prefix_for_readahead_trimming_ = "";
+  //*** BEGIN States used by both regular scan and multiscan
+
+  // True if block_iter_ is initialized and points to the same block
+  // as index iterator.
+  bool block_iter_points_to_real_block_;
+  // See InternalIteratorBase::IsOutOfBound().
+  bool is_out_of_bound_ = false;
+
+  // Mark prepared ranges as exhausted for multiscan.
+  void MarkPreparedRangeExhausted() {
+    assert(multi_scan_ != nullptr);
+    if (multi_scan_->next_scan_idx <
+        multi_scan_->block_index_ranges_per_scan.size()) {
+      // If there are more prepared ranges, we don't ResetDataIter() here,
+      // because next scan might be reading from the same block. ResetDataIter()
+      // will free the underlying block cache handle and we don't want the
+      // block to be unpinned.
+      // Set out of bound to mark the current prepared range as exhausted.
+      is_out_of_bound_ = true;
+    } else {
+      // This is the last prepared range of this file, there might be more
+      // data on next file. Reset data iterator to indicate the iterator is
+      // no longer valid on this file. Let LevelIter advance to the next file
+      // instead of ending the scan.
+      ResetDataIter();
+    }
+  }
+
+  // During cache lookup to find readahead size, index_iter_ is iterated and it
+  // can point to a different block.
+  // If Prepare() is called, index_iter_ is used to prefetch data blocks for the
+  // multiscan, so is_index_at_curr_block_ will be false.
+  // Whether index is expected to match the current data_block_iter_.
+  bool is_index_at_curr_block_ = true;
+
+  // *** END States used by both regular scan and multiscan
+
+  // *** BEGIN MultiScan related states ***
+  struct MultiScanState {
+    // For Aborting async I/Os in destructor.
+    const std::shared_ptr<FileSystem> fs;
+    const MultiScanArgs* scan_opts;
+    // ReadSet owns pinned data blocks and handles async I/O
+    std::shared_ptr<ReadSet> read_set;
+    // The separator of each data block.
+    // Its size is same as the number of block handles submitted to
+    // IODispatcher. The value of separator is larger than or equal to the last
+    // key in the corresponding data block.
+    std::vector<std::string> data_block_separators;
+    // Track previously seeked key in multi-scan.
+    // This is used to ensure that the seek key is keep moving forward, as
+    // blocks that are smaller than the seek key are unpinned from memory.
+    std::string prev_seek_key_;
+
+    // Indicies into block handles for data blocks for each scan range.
+    // inclusive start, exclusive end
+    std::vector<std::tuple<size_t, size_t>> block_index_ranges_per_scan;
+    size_t next_scan_idx;
+    size_t cur_data_block_idx;
+    size_t prefetch_max_idx;
+
+    // For tracking wasted prefetch blocks (prefetched but never read)
+    Statistics* statistics;
+    size_t wasted_blocks_count;
+
+    MultiScanState(
+        const std::shared_ptr<FileSystem>& _fs, const MultiScanArgs* _scan_opts,
+        std::shared_ptr<ReadSet>&& _read_set,
+        std::vector<std::string>&& _data_block_separators,
+        std::vector<std::tuple<size_t, size_t>>&& _block_index_ranges_per_scan,
+        size_t _prefetch_max_idx, Statistics* _statistics)
+        : fs(_fs),
+          scan_opts(_scan_opts),
+          read_set(std::move(_read_set)),
+          data_block_separators(std::move(_data_block_separators)),
+          block_index_ranges_per_scan(std::move(_block_index_ranges_per_scan)),
+          next_scan_idx(0),
+          cur_data_block_idx(0),
+          prefetch_max_idx(_prefetch_max_idx),
+          statistics(_statistics),
+          wasted_blocks_count(0) {}
+
+    ~MultiScanState() {
+      if (statistics && wasted_blocks_count > 0) {
+        RecordTick(statistics, MULTISCAN_PREFETCH_BLOCKS_WASTED,
+                   wasted_blocks_count);
+      }
+    }
+  };
+
+  Status multi_scan_status_;
+  std::unique_ptr<MultiScanState> multi_scan_;
+  // *** END MultiScan related APIs and states ***
 
   void SeekSecondPass(const Slice* target);
 
@@ -472,5 +580,55 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
                                     uint64_t& end_updated_offset,
                                     size_t& prev_handles_size);
   // *** END APIs relevant to auto tuning of readahead_size ***
+
+  // *** BEGIN APIs relevant to multiscan ***
+
+  void SeekMultiScan(const Slice* target);
+
+  void FindBlockForwardInMultiScan();
+
+  void MultiScanSeekTargetFromBlock(const Slice* seek_target, size_t block_idx);
+  void MultiScanUnexpectedSeekTarget(const Slice* seek_target,
+                                     const Slice* user_seek_target);
+
+  // Return true, if there is an error, or end of file
+  bool MultiScanLoadDataBlock(size_t idx) {
+    if (idx >= multi_scan_->prefetch_max_idx) {
+      // TODO: Fix the max_prefetch_size support for multiple files.
+      // The goal is to limit the memory usage, prefetch could be done
+      // incrementally.
+      if (multi_scan_->scan_opts->max_prefetch_size == 0) {
+        // If max_prefetch_size is not set, treat this as end of file.
+        ResetDataIter();
+        assert(!is_out_of_bound_);
+        assert(!Valid());
+      } else {
+        // If max_prefetch_size is set, treat this as error.
+        multi_scan_status_ = Status::PrefetchLimitReached();
+      }
+      return true;
+    }
+
+    // Use ReadSet to get block (handles cache/async/sync transparently)
+    CachableEntry<Block> block_entry;
+    multi_scan_status_ = multi_scan_->read_set->ReadIndex(idx, &block_entry);
+    if (!multi_scan_status_.ok()) {
+      return true;
+    }
+
+    assert(block_entry.GetValue());
+    // Note that the block_iter_ takes ownership of the pinned data block
+    table_->NewDataBlockIterator<DataBlockIter>(read_options_, block_entry,
+                                                &block_iter_, Status::OK());
+    return false;
+  }
+
+  Status CollectBlockHandles(
+      const std::vector<ScanOptions>& scan_opts,
+      std::vector<BlockHandle>* scan_block_handles,
+      std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan,
+      std::vector<std::string>* data_block_boundary_keys);
+
+  // *** END APIs relevant to multiscan ***
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 103f687f812c..1de0096f4a72 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -46,6 +46,7 @@
 #include "rocksdb/table.h"
 #include "rocksdb/table_properties.h"
 #include "rocksdb/trace_record.h"
+#include "rocksdb/user_defined_index.h"
 #include "table/block_based/binary_search_index_reader.h"
 #include "table/block_based/block.h"
 #include "table/block_based/block_based_table_factory.h"
@@ -58,6 +59,7 @@
 #include "table/block_based/hash_index_reader.h"
 #include "table/block_based/partitioned_filter_block.h"
 #include "table/block_based/partitioned_index_reader.h"
+#include "table/block_based/user_defined_index_wrapper.h"
 #include "table/block_fetcher.h"
 #include "table/format.h"
 #include "table/get_context.h"
@@ -91,28 +93,33 @@ CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
 #define INSTANTIATE_BLOCKLIKE_TEMPLATES(T)                                     \
   template Status BlockBasedTable::RetrieveBlock<T>(                           \
       FilePrefetchBuffer * prefetch_buffer, const ReadOptions& ro,             \
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,  \
+      const BlockHandle& handle, UnownedPtr<Decompressor> decomp,              \
       CachableEntry<T>* out_parsed_block, GetContext* get_context,             \
       BlockCacheLookupContext* lookup_context, bool for_compaction,            \
       bool use_cache, bool async_read, bool use_block_cache_for_lookup) const; \
   template Status BlockBasedTable::MaybeReadBlockAndLoadToCache<T>(            \
       FilePrefetchBuffer * prefetch_buffer, const ReadOptions& ro,             \
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,  \
+      const BlockHandle& handle, UnownedPtr<Decompressor> decomp,              \
       bool for_compaction, CachableEntry<T>* block_entry,                      \
       GetContext* get_context, BlockCacheLookupContext* lookup_context,        \
       BlockContents* contents, bool async_read,                                \
       bool use_block_cache_for_lookup) const;                                  \
   template Status BlockBasedTable::LookupAndPinBlocksInCache<T>(               \
       const ReadOptions& ro, const BlockHandle& handle,                        \
+      CachableEntry<T>* out_parsed_block) const;                               \
+  template Status BlockBasedTable::CreateAndPinBlockInCache<T>(                \
+      const ReadOptions& ro, const BlockHandle& handle,                        \
+      UnownedPtr<Decompressor> decomp, BlockContents* block_contents,          \
       CachableEntry<T>* out_parsed_block) const;
 
 INSTANTIATE_BLOCKLIKE_TEMPLATES(ParsedFullFilterBlock);
-INSTANTIATE_BLOCKLIKE_TEMPLATES(UncompressionDict);
+INSTANTIATE_BLOCKLIKE_TEMPLATES(DecompressorDict);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kData);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kIndex);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kFilterPartitionIndex);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kRangeDeletion);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kMetaIndex);
+INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kUserDefinedIndex);
 
 }  // namespace ROCKSDB_NAMESPACE
 
@@ -195,7 +202,7 @@ Status ReadAndParseBlockFromFile(
     const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
     std::unique_ptr<TBlocklike>* result, const ImmutableOptions& ioptions,
     BlockCreateContext& create_context, bool maybe_compressed,
-    const UncompressionDict& uncompression_dict,
+    UnownedPtr<Decompressor> decomp,
     const PersistentCacheOptions& cache_options,
     MemoryAllocator* memory_allocator, bool for_compaction, bool async_read) {
   assert(result);
@@ -204,8 +211,8 @@ Status ReadAndParseBlockFromFile(
   BlockFetcher block_fetcher(
       file, prefetch_buffer, footer, options, handle, &contents, ioptions,
       /*do_uncompress*/ maybe_compressed, maybe_compressed,
-      TBlocklike::kBlockType, uncompression_dict, cache_options,
-      memory_allocator, nullptr, for_compaction);
+      TBlocklike::kBlockType, decomp, cache_options, memory_allocator, nullptr,
+      for_compaction);
   Status s;
   // If prefetch_buffer is not allocated, it will fallback to synchronous
   // reading of block contents.
@@ -562,6 +569,110 @@ Status GetGlobalSequenceNumber(const TableProperties& table_properties,
 
   return Status::OK();
 }
+
+Status GetDecompressor(const std::string& compression_name,
+                       UnownedPtr<CompressionManager> compression_manager,
+                       uint32_t table_format_version,
+                       std::shared_ptr<Decompressor>* out_decompressor) {
+  if (compression_name.empty()) {
+    // Very old file (before RocksDB 4.9.0) that might contain compressed
+    // blocks. Get a general decompressor (for all supported format_versions)
+    auto mgr_to_use = GetBuiltinV2CompressionManager();
+    *out_decompressor = mgr_to_use->GetDecompressor();
+    return Status::OK();
+  }
+  if (FormatVersionUsesCompressionManagerName(table_format_version)) {
+    constexpr char kFieldSep = ';';
+    size_t separator_pos = compression_name.find_first_of(kFieldSep);
+    if (separator_pos == std::string::npos) {
+      return Status::Corruption(
+          "Missing separator in compression_name property");
+    }
+    // Built with explicit CompressionManager and schema support for
+    // identifying its compatibility name, which is the first field here.
+    Slice compatibility_name(compression_name.data(), separator_pos);
+    std::shared_ptr<CompressionManager> mgr_to_use;
+    if (compression_manager) {
+      // First attempt to go through the compression manager configured for
+      // writing new files, for efficiency (usually correct) and not forcing
+      // use of ObjectLibrary registration (dependency injection).
+      mgr_to_use = compression_manager->FindCompatibleCompressionManager(
+          compatibility_name);
+    }
+    if (mgr_to_use == nullptr) {
+      ConfigOptions strict;
+      strict.ignore_unknown_options = false;
+      strict.ignore_unsupported_options = false;
+      Status s = CompressionManager::CreateFromString(
+          strict, compatibility_name.ToString(), &mgr_to_use);
+      // Even though we might be able to recover from "not found" if only
+      // built-in compression types are used (would be checked below), it
+      // would provide misleading or unreliable success to allow that to
+      // succeed.
+      if (!s.ok()) {
+        return s;
+      }
+      assert(mgr_to_use || compatibility_name == kNullptrString ||
+             compatibility_name.empty());
+    }
+
+    // Second field is set of compression types actually used in the file
+    size_t start_pos = separator_pos + 1;
+    separator_pos = compression_name.find_first_of(kFieldSep, start_pos);
+    if (UNLIKELY(separator_pos == std::string::npos)) {
+      return Status::Corruption("Missing second field from compression_name");
+    }
+    if (UNLIKELY((separator_pos - start_pos) & 1)) {
+      return Status::Corruption(
+          "Second field of compression_name has odd size");
+    }
+    size_t count = (separator_pos - start_pos) / 2;
+    auto ctypes = std::make_unique<CompressionType[]>(count);
+    const char* ptr = compression_name.data() + start_pos;
+    for (size_t i = 0; i < count; ++i) {
+      uint64_t val = 0;
+      bool success = ParseBaseChars<16>(&ptr, 2, &val);
+      if (UNLIKELY(!success || val == kNoCompression ||
+                   val >= kDisableCompressionOption)) {
+        return Status::Corruption(
+            "Error parsing second field of compression_name");
+      }
+      ctypes[i] = static_cast<CompressionType>(val);
+    }
+    if (mgr_to_use) {
+      *out_decompressor = mgr_to_use->GetDecompressorForTypes(
+          ctypes.get(), ctypes.get() + count);
+      assert(*out_decompressor || count == 0);
+    } else {
+      // Compression/decompression disabled
+      *out_decompressor = nullptr;
+      assert(count == 0);
+    }
+    // Can ignore possible additional future fields
+  } else {
+    // No explicit CompressionManager, e.g. legacy file support where
+    // decompressing with built-in CompressionManager works.
+    CompressionType saved_comp_type =
+        CompressionTypeFromString(compression_name);
+    if (saved_comp_type == kDisableCompressionOption) {
+      // Unrecognized. For RocksDB versions able to read format_version=7,
+      // this is considered an error so that we can continue to evolve the
+      // schema of the compression_name property and report good error
+      // messages.
+      return Status::Corruption("Unrecognized compression_name: " +
+                                compression_name);
+    } else if (saved_comp_type != kNoCompression) {
+      // Use built-in compression manager
+      auto mgr_to_use = GetBuiltinV2CompressionManager();
+      *out_decompressor =
+          mgr_to_use->GetDecompressorOptimizeFor(saved_comp_type);
+    } else {
+      // No compression -> decompressor not needed
+      *out_decompressor = nullptr;
+    }
+  }
+  return Status::OK();
+}
 }  // namespace
 
 void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties,
@@ -629,6 +740,7 @@ Status BlockBasedTable::Open(
     std::unique_ptr<TableReader>* table_reader, uint64_t tail_size,
     std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr,
     const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    UnownedPtr<CompressionManager> compression_manager,
     const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
     const int level, const bool immortal_table,
     const SequenceNumber largest_seqno, const bool force_direct_prefetch,
@@ -683,7 +795,8 @@ Status BlockBasedTable::Open(
   //    6. [meta block: index]
   //    7. [meta block: filter]
   IOOptions opts;
-  s = file->PrepareIOOptions(ro, opts);
+  IODebugContext dbg;
+  s = file->PrepareIOOptions(ro, opts, &dbg);
   if (s.ok()) {
     s = ReadFooterFromFile(opts, file.get(), *ioptions.fs,
                            prefetch_buffer.get(), file_size, &footer,
@@ -695,7 +808,9 @@ Status BlockBasedTable::Open(
     }
     return s;
   }
-  if (!IsSupportedFormatVersion(footer.format_version())) {
+  if (!IsSupportedFormatVersionForRead(kBlockBasedTableMagicNumber,
+                                       footer.format_version()) &&
+      !TEST_AllowUnsupportedFormatVersion()) {
     return Status::Corruption(
         "Unknown Footer version. Maybe this file was created with newer "
         "version of RocksDB?");
@@ -738,13 +853,19 @@ Status BlockBasedTable::Open(
     return s;
   }
 
+  // Read compression metadata and configure decompressor
+  s = GetDecompressor(
+      rep->table_properties ? rep->table_properties->compression_name
+                            : std::string{},
+      compression_manager, footer.format_version(), &rep->decompressor);
+  if (!s.ok()) {
+    return s;
+  }
+
   // Populate BlockCreateContext
-  bool blocks_definitely_zstd_compressed =
-      rep->table_properties && (rep->table_properties->compression_name ==
-                                CompressionTypeToString(kZSTD));
   rep->create_context = BlockCreateContext(
       &rep->table_options, &rep->ioptions, rep->ioptions.stats,
-      blocks_definitely_zstd_compressed, block_protection_bytes_per_key,
+      rep->decompressor.get(), block_protection_bytes_per_key,
       rep->internal_comparator.user_comparator(), rep->index_value_is_full,
       rep->index_has_first_key);
 
@@ -806,20 +927,18 @@ Status BlockBasedTable::Open(
     rep->table_prefix_extractor = prefix_extractor;
   } else {
     // Current prefix_extractor doesn't match table
-    if (rep->table_properties) {
-      //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions
-      // will need to use it
-      ConfigOptions config_options;
-      Status st = SliceTransform::CreateFromString(
-          config_options, rep->table_properties->prefix_extractor_name,
-          &(rep->table_prefix_extractor));
-      if (!st.ok()) {
-        //**TODO: Should this be error be returned or swallowed?
-        ROCKS_LOG_ERROR(rep->ioptions.logger,
-                        "Failed to create prefix extractor[%s]: %s",
-                        rep->table_properties->prefix_extractor_name.c_str(),
-                        st.ToString().c_str());
-      }
+    //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions
+    // will need to use it
+    ConfigOptions config_options;
+    Status st = SliceTransform::CreateFromString(
+        config_options, rep->table_properties->prefix_extractor_name,
+        &(rep->table_prefix_extractor));
+    if (!st.ok()) {
+      //**TODO: Should this be error be returned or swallowed?
+      ROCKS_LOG_ERROR(rep->ioptions.logger,
+                      "Failed to create prefix extractor[%s]: %s",
+                      rep->table_properties->prefix_extractor_name.c_str(),
+                      st.ToString().c_str());
     }
   }
 
@@ -914,6 +1033,7 @@ Status BlockBasedTable::PrefetchTail(
                      "TailPrefetchStats.",
                      file->file_name().c_str(), tail_prefetch_size);
     }
+    TEST_SYNC_POINT("BlockBasedTable::PrefetchTail::TaiSizeNotRecorded");
   }
   size_t prefetch_off;
   size_t prefetch_len;
@@ -933,7 +1053,8 @@ Status BlockBasedTable::PrefetchTail(
 #endif  // NDEBUG
 
   IOOptions opts;
-  Status s = file->PrepareIOOptions(ro, opts);
+  IODebugContext dbg;
+  Status s = file->PrepareIOOptions(ro, opts, &dbg);
   // Try file system prefetch
   if (s.ok() && !file->use_direct_io() && !force_direct_prefetch) {
     if (!file->Prefetch(opts, prefetch_off, prefetch_len).IsNotSupported()) {
@@ -963,89 +1084,72 @@ Status BlockBasedTable::ReadPropertiesBlock(
   BlockHandle handle;
   s = FindOptionalMetaBlock(meta_iter, kPropertiesBlockName, &handle);
 
+  if (!s.ok()) {
+    return s;
+  } else if (handle.IsNull()) {
+    return Status::Corruption("Cannot find Properties block from file.");
+  }
+
+  s = meta_iter->status();
+  std::unique_ptr<TableProperties> table_properties;
+  if (s.ok()) {
+    s = ReadTablePropertiesHelper(
+        ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer,
+        rep_->ioptions, &table_properties, nullptr /* memory_allocator */);
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  assert(table_properties != nullptr);
+  rep_->table_properties = std::move(table_properties);
+
+  s = rep_->seqno_to_time_mapping.DecodeFrom(
+      rep_->table_properties->seqno_to_time_mapping);
   if (!s.ok()) {
     ROCKS_LOG_WARN(rep_->ioptions.logger,
-                   "Error when seeking to properties block from file: %s",
+                   "Problem reading or processing seqno-to-time mapping: %s",
                    s.ToString().c_str());
-  } else if (!handle.IsNull()) {
-    s = meta_iter->status();
-    std::unique_ptr<TableProperties> table_properties;
-    if (s.ok()) {
-      s = ReadTablePropertiesHelper(
-          ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer,
-          rep_->ioptions, &table_properties, nullptr /* memory_allocator */);
-    }
-    IGNORE_STATUS_IF_ERROR(s);
+  }
 
-    if (!s.ok()) {
-      ROCKS_LOG_WARN(rep_->ioptions.logger,
-                     "Encountered error while reading data from properties "
-                     "block %s",
-                     s.ToString().c_str());
-    } else {
-      assert(table_properties != nullptr);
-      rep_->table_properties = std::move(table_properties);
+  // Read the table properties
+  rep_->whole_key_filtering &= IsFeatureSupported(
+      *(rep_->table_properties),
+      BlockBasedTablePropertyNames::kWholeKeyFiltering, rep_->ioptions.logger);
+  rep_->prefix_filtering &= IsFeatureSupported(
+      *(rep_->table_properties), BlockBasedTablePropertyNames::kPrefixFiltering,
+      rep_->ioptions.logger);
 
-      if (s.ok()) {
-        s = rep_->seqno_to_time_mapping.DecodeFrom(
-            rep_->table_properties->seqno_to_time_mapping);
-      }
-      if (!s.ok()) {
-        ROCKS_LOG_WARN(
-            rep_->ioptions.logger,
-            "Problem reading or processing seqno-to-time mapping: %s",
-            s.ToString().c_str());
-      }
-      rep_->blocks_maybe_compressed =
-          rep_->table_properties->compression_name !=
-          CompressionTypeToString(kNoCompression);
-    }
-  } else {
-    ROCKS_LOG_ERROR(rep_->ioptions.logger,
-                    "Cannot find Properties block from file.");
+  rep_->index_key_includes_seq =
+      rep_->table_properties->index_key_is_user_key == 0;
+  rep_->index_value_is_full =
+      rep_->table_properties->index_value_is_delta_encoded == 0;
+
+  // Read index_type from properties (required for format_version >= 2)
+  auto& props = rep_->table_properties->user_collected_properties;
+  auto index_type_pos = props.find(BlockBasedTablePropertyNames::kIndexType);
+  if (index_type_pos == props.end()) {
+    return Status::Corruption("Missing index type property");
+  }
+  rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
+      DecodeFixed32(index_type_pos->second.c_str()));
+  auto min_ts_pos = props.find("rocksdb.timestamp_min");
+  if (min_ts_pos != props.end()) {
+    rep_->min_timestamp = Slice(min_ts_pos->second);
+  }
+  auto max_ts_pos = props.find("rocksdb.timestamp_max");
+  if (max_ts_pos != props.end()) {
+    rep_->max_timestamp = Slice(max_ts_pos->second);
   }
 
-  // Read the table properties, if provided.
-  if (rep_->table_properties) {
-    rep_->whole_key_filtering &=
-        IsFeatureSupported(*(rep_->table_properties),
-                           BlockBasedTablePropertyNames::kWholeKeyFiltering,
-                           rep_->ioptions.logger);
-    rep_->prefix_filtering &= IsFeatureSupported(
-        *(rep_->table_properties),
-        BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger);
-
-    rep_->index_key_includes_seq =
-        rep_->table_properties->index_key_is_user_key == 0;
-    rep_->index_value_is_full =
-        rep_->table_properties->index_value_is_delta_encoded == 0;
-
-    // Update index_type with the true type.
-    // If table properties don't contain index type, we assume that the table
-    // is in very old format and has kBinarySearch index type.
-    auto& props = rep_->table_properties->user_collected_properties;
-    auto index_type_pos = props.find(BlockBasedTablePropertyNames::kIndexType);
-    if (index_type_pos != props.end()) {
-      rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
-          DecodeFixed32(index_type_pos->second.c_str()));
-    }
-    auto min_ts_pos = props.find("rocksdb.timestamp_min");
-    if (min_ts_pos != props.end()) {
-      rep_->min_timestamp = Slice(min_ts_pos->second);
-    }
-    auto max_ts_pos = props.find("rocksdb.timestamp_max");
-    if (max_ts_pos != props.end()) {
-      rep_->max_timestamp = Slice(max_ts_pos->second);
-    }
-
-    rep_->index_has_first_key =
-        rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
-
-    s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
-                                &(rep_->global_seqno));
-    if (!s.ok()) {
-      ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str());
-    }
+  rep_->index_has_first_key =
+      rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
+
+  s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
+                              &(rep_->global_seqno));
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str());
   }
   return s;
 }
@@ -1197,13 +1301,75 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
   if (!s.ok()) {
     return s;
   }
+  if (table_options.user_defined_index_factory != nullptr) {
+    std::string udi_name(table_options.user_defined_index_factory->Name());
+    BlockHandle udi_block_handle;
+
+    // Should we use FindOptionalMetaBlock here?
+    s = FindMetaBlock(meta_iter, kUserDefinedIndexPrefix + udi_name,
+                      &udi_block_handle);
+    if (!s.ok()) {
+      RecordTick(rep_->ioptions.statistics.get(),
+                 SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT);
+      if (table_options.fail_if_no_udi_on_open) {
+        ROCKS_LOG_ERROR(rep_->ioptions.logger,
+                        "Failed to find the the UDI block %s in file %s; %s",
+                        udi_name.c_str(), rep_->file->file_name().c_str(),
+                        s.ToString().c_str());
+        // MAke the status more informative
+        s = Status::Corruption(s.ToString(), rep_->file->file_name());
+        return s;
+      } else {
+        // Emit a warning, but ignore the error status
+        ROCKS_LOG_WARN(rep_->ioptions.logger,
+                       "Failed to find the the UDI block %s in file %s; %s",
+                       udi_name.c_str(), rep_->file->file_name().c_str(),
+                       s.ToString().c_str());
+        s = Status::OK();
+      }
+    }
+
+    // If the UDI block size is 0, that means there's effectively no user
+    // defined index. In that case, skip setting up the reader.
+    if (udi_block_handle.size() > 0) {
+      // Read the block, and allocate on heap or pin in cache. The UDI block is
+      // not compressed. RetrieveBlock will verify the checksum.
+      if (s.ok()) {
+        s = RetrieveBlock(prefetch_buffer, ro, udi_block_handle,
+                          rep_->decompressor.get(), &rep_->udi_block,
+                          /*get_context=*/nullptr, lookup_context,
+                          /*for_compaction=*/false, use_cache,
+                          /*async_read=*/false,
+                          /*use_block_cache_for_lookup=*/false);
+      }
+      if (s.ok()) {
+        assert(!rep_->udi_block.IsEmpty());
+
+        std::unique_ptr<UserDefinedIndexReader> udi_reader;
+        UserDefinedIndexOption udi_option;
+        udi_option.comparator = rep_->internal_comparator.user_comparator();
+        s = table_options.user_defined_index_factory->NewReader(
+            udi_option, rep_->udi_block.GetValue()->data, udi_reader);
+        if (s.ok()) {
+          if (udi_reader) {
+            index_reader = std::make_unique<UserDefinedIndexReaderWrapper>(
+                udi_name, std::move(index_reader), std::move(udi_reader));
+          } else {
+            s = Status::Corruption("Failed to create UDI reader for " +
+                                   udi_name + " in file " +
+                                   rep_->file->file_name());
+          }
+        }
+      }
+    }
+  }
 
   rep_->index_reader = std::move(index_reader);
 
   // The partitions of partitioned index are always stored in cache. They
   // are hence follow the configuration for pin and prefetch regardless of
   // the value of cache_index_and_filter_blocks
-  if (prefetch_all || pin_partition) {
+  if (s.ok() && (prefetch_all || pin_partition)) {
     s = rep_->index_reader->CacheDependencies(ro, pin_partition,
                                               prefetch_buffer);
   }
@@ -1238,7 +1404,10 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
     }
   }
 
-  if (!rep_->compression_dict_handle.IsNull()) {
+  // NOTE: before the fix to https://github.com/facebook/rocksdb/issues/12409, a
+  // file could have a (de)compression dictionary block without a configured
+  // compression, so we need to ignore the dictionary in that case.
+  if (!rep_->compression_dict_handle.IsNull() && rep_->decompressor) {
     std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
     s = UncompressionDictReader::Create(
         this, ro, prefetch_buffer, use_cache, prefetch_all || pin_unpartitioned,
@@ -1300,10 +1469,9 @@ Status BlockBasedTable::ReadMetaIndexBlock(
   Status s = ReadAndParseBlockFromFile(
       rep_->file.get(), prefetch_buffer, rep_->footer, ro,
       rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
-      rep_->create_context, true /*maybe_compressed*/,
-      UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
-      GetMemoryAllocator(rep_->table_options), false /* for_compaction */,
-      false /* async_read */);
+      rep_->create_context, true /*maybe_compressed*/, rep_->decompressor.get(),
+      rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options),
+      false /* for_compaction */, false /* async_read */);
 
   if (!s.ok()) {
     ROCKS_LOG_ERROR(rep_->ioptions.logger,
@@ -1342,7 +1510,7 @@ template <typename TBlocklike>
 WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::GetDataBlockFromCache(
     const Slice& cache_key, BlockCacheInterface<TBlocklike> block_cache,
     CachableEntry<TBlocklike>* out_parsed_block, GetContext* get_context,
-    const UncompressionDict* dict) const {
+    UnownedPtr<Decompressor> decomp) const {
   assert(out_parsed_block);
   assert(out_parsed_block->IsEmpty());
 
@@ -1351,12 +1519,24 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::GetDataBlockFromCache(
 
   // Lookup uncompressed cache first
   if (block_cache) {
-    BlockCreateContext create_ctx = rep_->create_context;
-    create_ctx.dict = dict;
     assert(!cache_key.empty());
-    auto cache_handle = block_cache.LookupFull(
-        cache_key, &create_ctx, GetCachePriority<TBlocklike>(), statistics,
-        rep_->ioptions.lowest_used_cache_tier);
+    typename BlockCacheInterface<TBlocklike>::TypedHandle* cache_handle;
+    if (decomp.get() != rep_->decompressor.get() && decomp) {
+      // `decomp` must be a dictionary-aware decompressor, which is only
+      // available in the block cache (so that dictionaries can be evicted
+      // from memory) and can't live in the table reader.
+      // NOTE: inefficient BlockCreateContext copy for dict-aware decompressor
+      // (see TODO in block_cache.h)
+      BlockCreateContext create_ctx = rep_->create_context;
+      create_ctx.decompressor = decomp.get();
+      cache_handle = block_cache.LookupFull(
+          cache_key, &create_ctx, GetCachePriority<TBlocklike>(), statistics,
+          rep_->ioptions.lowest_used_cache_tier);
+    } else {
+      cache_handle = block_cache.LookupFull(
+          cache_key, &rep_->create_context, GetCachePriority<TBlocklike>(),
+          statistics, rep_->ioptions.lowest_used_cache_tier);
+    }
 
     // Avoid updating metrics here if the handle is not complete yet. This
     // happens with MultiGet and secondary cache. So update the metrics only
@@ -1386,10 +1566,9 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::PutDataBlockToCache(
     CachableEntry<TBlocklike>* out_parsed_block,
     BlockContents&& uncompressed_block_contents,
     BlockContents&& compressed_block_contents, CompressionType block_comp_type,
-    const UncompressionDict& uncompression_dict,
-    MemoryAllocator* memory_allocator, GetContext* get_context) const {
+    UnownedPtr<Decompressor> decomp, MemoryAllocator* memory_allocator,
+    GetContext* get_context) const {
   const ImmutableOptions& ioptions = rep_->ioptions;
-  const uint32_t format_version = rep_->table_options.format_version;
   assert(out_parsed_block);
   assert(out_parsed_block->IsEmpty());
 
@@ -1401,12 +1580,10 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::PutDataBlockToCache(
       uncompressed_block_contents.data.empty()) {
     assert(compressed_block_contents.data.data());
     // Retrieve the uncompressed contents into a new buffer
-    UncompressionContext context(block_comp_type);
-    UncompressionInfo info(context, uncompression_dict, block_comp_type);
-    s = UncompressBlockData(info, compressed_block_contents.data.data(),
-                            compressed_block_contents.data.size(),
-                            &uncompressed_block_contents, format_version,
-                            ioptions, memory_allocator);
+    s = DecompressBlockData(
+        compressed_block_contents.data.data(),
+        compressed_block_contents.data.size(), block_comp_type, *decomp,
+        &uncompressed_block_contents, ioptions, memory_allocator);
     if (!s.ok()) {
       return s;
     }
@@ -1505,7 +1682,8 @@ IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
       rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats,
       /* total_order_seek */ true, rep->index_has_first_key,
       rep->index_key_includes_seq, rep->index_value_is_full,
-      block_contents_pinned, rep->user_defined_timestamps_persisted);
+      block_contents_pinned, rep->user_defined_timestamps_persisted,
+      nullptr /* prefix_index */, rep->table_options.index_block_search_type);
 }
 
 // Right now only called for Data blocks.
@@ -1519,15 +1697,18 @@ Status BlockBasedTable::LookupAndPinBlocksInCache(
   assert(block_cache);
 
   Status s;
-  CachableEntry<UncompressionDict> uncompression_dict;
+  CachableEntry<DecompressorDict> cached_dict;
   if (rep_->uncompression_dict_reader) {
     s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
         /* prefetch_buffer= */ nullptr, ro,
         /* get_context= */ nullptr, /* lookup_context= */ nullptr,
-        &uncompression_dict);
+        &cached_dict);
     if (!s.ok()) {
       return s;
     }
+    if (!cached_dict.GetValue()) {
+      return Status::Corruption("Success but no dictionary read");
+    }
   }
 
   // Do the lookup.
@@ -1536,14 +1717,20 @@ Status BlockBasedTable::LookupAndPinBlocksInCache(
 
   Statistics* statistics = rep_->ioptions.statistics.get();
 
-  BlockCreateContext create_ctx = rep_->create_context;
-  create_ctx.dict = uncompression_dict.GetValue()
-                        ? uncompression_dict.GetValue()
-                        : &UncompressionDict::GetEmptyDict();
-
-  auto cache_handle =
-      block_cache.LookupFull(key, &create_ctx, GetCachePriority<TBlocklike>(),
-                             statistics, rep_->ioptions.lowest_used_cache_tier);
+  typename BlockCacheInterface<TBlocklike>::TypedHandle* cache_handle;
+  if (cached_dict.GetValue()) {
+    // NOTE: inefficient BlockCreateContext copy for dict-aware decompressor
+    // (see TODO in block_cache.h)
+    BlockCreateContext create_ctx = rep_->create_context;
+    create_ctx.decompressor = cached_dict.GetValue()->decompressor_.get();
+    cache_handle = block_cache.LookupFull(
+        key, &create_ctx, GetCachePriority<TBlocklike>(), statistics,
+        rep_->ioptions.lowest_used_cache_tier);
+  } else {
+    cache_handle = block_cache.LookupFull(
+        key, &rep_->create_context, GetCachePriority<TBlocklike>(), statistics,
+        rep_->ioptions.lowest_used_cache_tier);
+  }
 
   if (!cache_handle) {
     UpdateCacheMissMetrics(TBlocklike::kBlockType, /* get_context = */ nullptr);
@@ -1563,6 +1750,59 @@ Status BlockBasedTable::LookupAndPinBlocksInCache(
   return s;
 }
 
+template <typename TBlocklike>
+Status BlockBasedTable::CreateAndPinBlockInCache(
+    const ReadOptions& ro, const BlockHandle& handle,
+    UnownedPtr<Decompressor> decomp, BlockContents* contents,
+    CachableEntry<TBlocklike>* out_parsed_block) const {
+  CompressionType compression_type = GetBlockCompressionType(*contents);
+  // If we don't own the contents and we don't need to decompress, copy
+  // the block to heap in order to have ownership. If decompression is
+  // needed, then the decompressor will allocate a buffer.
+  if (!contents->own_bytes() && compression_type == kNoCompression) {
+    Slice src = Slice(contents->data.data(), BlockSizeWithTrailer(handle));
+    *contents = BlockContents(
+        CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), src),
+        handle.size());
+#ifndef NDEBUG
+    contents->has_trailer = true;
+#endif
+  }
+
+  Status s;
+  if (ro.fill_cache) {
+    s = MaybeReadBlockAndLoadToCache(nullptr, ro, handle, decomp,
+                                     /*for_compaction=*/false, out_parsed_block,
+                                     nullptr, nullptr, contents,
+                                     /*async_read=*/false,
+                                     /*use_block_cache_for_lookup=*/true);
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // fill_cache could be false, or no block cache is configured. In that
+  // case, decompress if necessary and take ownership of the block
+  if (out_parsed_block->GetValue() == nullptr && contents != nullptr) {
+    BlockContents tmp_contents;
+    if (compression_type != kNoCompression) {
+      s = DecompressSerializedBlock(contents->data.data(), handle.size(),
+                                    compression_type, *decomp, &tmp_contents,
+                                    rep_->ioptions,
+                                    GetMemoryAllocator(rep_->table_options));
+    } else {
+      tmp_contents = std::move(*contents);
+    }
+    if (s.ok()) {
+      std::unique_ptr<TBlocklike> block_holder;
+      rep_->create_context.Create(&block_holder, std::move(tmp_contents));
+      out_parsed_block->SetOwnedValue(std::move(block_holder));
+    }
+  }
+  return s;
+}
+
 // If contents is nullptr, this function looks up the block caches for the
 // data block referenced by handle, and read the block from disk if necessary.
 // If contents is non-null, it skips the cache lookup and disk read, since
@@ -1572,7 +1812,7 @@ template <typename TBlocklike>
 WithBlocklikeCheck<Status, TBlocklike>
 BlockBasedTable::MaybeReadBlockAndLoadToCache(
     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
-    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+    const BlockHandle& handle, UnownedPtr<Decompressor> decomp,
     bool for_compaction, CachableEntry<TBlocklike>* out_parsed_block,
     GetContext* get_context, BlockCacheLookupContext* lookup_context,
     BlockContents* contents, bool async_read,
@@ -1596,7 +1836,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
     if (!contents) {
       if (use_block_cache_for_lookup) {
         s = GetDataBlockFromCache(key, block_cache, out_parsed_block,
-                                  get_context, &uncompression_dict);
+                                  get_context, decomp);
         // Value could still be null at this point, so check the cache handle
         // and update the read pattern for prefetching
         if (out_parsed_block->GetValue() ||
@@ -1624,9 +1864,8 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
         ro.fill_cache) {
       Statistics* statistics = rep_->ioptions.stats;
       const bool maybe_compressed =
-          TBlocklike::kBlockType != BlockType::kFilter &&
-          TBlocklike::kBlockType != BlockType::kCompressionDictionary &&
-          rep_->blocks_maybe_compressed;
+          BlockTypeMaybeCompressed(TBlocklike::kBlockType) &&
+          rep_->decompressor;
       // This flag, if true, tells BlockFetcher to return the uncompressed
       // block when ReadBlockContents() is called.
       const bool do_uncompress = maybe_compressed;
@@ -1650,8 +1889,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
         BlockFetcher block_fetcher(
             rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
             &tmp_contents, rep_->ioptions, do_uncompress, maybe_compressed,
-            TBlocklike::kBlockType, uncompression_dict,
-            rep_->persistent_cache_options,
+            TBlocklike::kBlockType, decomp, rep_->persistent_cache_options,
             GetMemoryAllocator(rep_->table_options),
             /*allocator=*/nullptr);
 
@@ -1666,7 +1904,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
           s = block_fetcher.ReadBlockContents();
         }
 
-        contents_comp_type = block_fetcher.get_compression_type();
+        contents_comp_type = block_fetcher.compression_type();
         if (get_context) {
           switch (TBlocklike::kBlockType) {
             case BlockType::kIndex:
@@ -1698,7 +1936,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
           // block in block_fetcher
           s = PutDataBlockToCache(
               key, block_cache, out_parsed_block, std::move(uncomp_contents),
-              std::move(comp_contents), contents_comp_type, uncompression_dict,
+              std::move(comp_contents), contents_comp_type, decomp,
               GetMemoryAllocator(rep_->table_options), get_context);
         }
       } else {
@@ -1714,7 +1952,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
           // the block to the cache.
           s = PutDataBlockToCache(
               key, block_cache, out_parsed_block, std::move(uncomp_contents),
-              std::move(comp_contents), contents_comp_type, uncompression_dict,
+              std::move(comp_contents), contents_comp_type, decomp,
               GetMemoryAllocator(rep_->table_options), get_context);
         }
       }
@@ -1770,6 +2008,7 @@ BlockBasedTable::SaveLookupContextOrTraceRecord(
       trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
       break;
     case BlockType::kIndex:
+    case BlockType::kUserDefinedIndex:
       trace_block_type = TraceType::kBlockTraceIndexBlock;
       break;
     default:
@@ -1829,7 +2068,7 @@ void BlockBasedTable::FinishTraceRecord(
 template <typename TBlocklike /*, auto*/>
 WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::RetrieveBlock(
     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
-    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+    const BlockHandle& handle, UnownedPtr<Decompressor> decomp,
     CachableEntry<TBlocklike>* out_parsed_block, GetContext* get_context,
     BlockCacheLookupContext* lookup_context, bool for_compaction,
     bool use_cache, bool async_read, bool use_block_cache_for_lookup) const {
@@ -1839,8 +2078,8 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::RetrieveBlock(
   Status s;
   if (use_cache) {
     s = MaybeReadBlockAndLoadToCache(
-        prefetch_buffer, ro, handle, uncompression_dict, for_compaction,
-        out_parsed_block, get_context, lookup_context,
+        prefetch_buffer, ro, handle, decomp, for_compaction, out_parsed_block,
+        get_context, lookup_context,
         /*contents=*/nullptr, async_read, use_block_cache_for_lookup);
 
     if (!s.ok()) {
@@ -1862,9 +2101,7 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::RetrieveBlock(
   }
 
   const bool maybe_compressed =
-      TBlocklike::kBlockType != BlockType::kFilter &&
-      TBlocklike::kBlockType != BlockType::kCompressionDictionary &&
-      rep_->blocks_maybe_compressed;
+      BlockTypeMaybeCompressed(TBlocklike::kBlockType) && rep_->decompressor;
   std::unique_ptr<TBlocklike> block;
 
   {
@@ -1873,9 +2110,9 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::RetrieveBlock(
     StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram);
     s = ReadAndParseBlockFromFile(
         rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
-        rep_->ioptions, rep_->create_context, maybe_compressed,
-        uncompression_dict, rep_->persistent_cache_options,
-        GetMemoryAllocator(rep_->table_options), for_compaction, async_read);
+        rep_->ioptions, rep_->create_context, maybe_compressed, decomp,
+        rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options),
+        for_compaction, async_read);
 
     if (get_context) {
       switch (TBlocklike::kBlockType) {
@@ -2445,7 +2682,7 @@ Status BlockBasedTable::Prefetch(const ReadOptions& read_options,
   }
   BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
   IndexBlockIter iiter_on_stack;
-  auto iiter = NewIndexIterator(read_options, /*need_upper_bound_check=*/false,
+  auto iiter = NewIndexIterator(read_options, /*disable_prefix_seek=*/false,
                                 &iiter_on_stack, /*get_context=*/nullptr,
                                 &lookup_context);
   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
@@ -2482,7 +2719,7 @@ Status BlockBasedTable::Prefetch(const ReadOptions& read_options,
     DataBlockIter biter;
     Status tmp_status;
     NewDataBlockIterator<DataBlockIter>(
-        read_options, block_handle, &biter, /*type=*/BlockType::kData,
+        read_options, block_handle, &biter, /*block_type=*/BlockType::kData,
         /*get_context=*/nullptr, &lookup_context,
         /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
         /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true);
@@ -2497,7 +2734,8 @@ Status BlockBasedTable::Prefetch(const ReadOptions& read_options,
 }
 
 Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
-                                       TableReaderCaller caller) {
+                                       TableReaderCaller caller,
+                                       bool meta_blocks_only) {
   Status s;
   // Check Meta blocks
   std::unique_ptr<Block> metaindex;
@@ -2512,6 +2750,9 @@ Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
   } else {
     return s;
   }
+  if (meta_blocks_only) {
+    return s;
+  }
   // Check Data blocks
   IndexBlockIter iiter_on_stack;
   BlockCacheLookupContext context{caller};
@@ -2557,8 +2798,8 @@ Status BlockBasedTable::VerifyChecksumInBlocks(
     BlockFetcher block_fetcher(
         rep_->file.get(), &prefetch_buffer, rep_->footer, read_options, handle,
         &contents, rep_->ioptions, false /* decompress */,
-        false /*maybe_compressed*/, BlockType::kData,
-        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
+        false /*maybe_compressed*/, BlockType::kData, nullptr /*decompressor*/,
+        rep_->persistent_cache_options);
     s = block_fetcher.ReadBlockContents();
     if (!s.ok()) {
       break;
@@ -2607,6 +2848,10 @@ BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
     return BlockType::kIndex;
   }
 
+  if (meta_block_name.starts_with(kUserDefinedIndexPrefix)) {
+    return BlockType::kUserDefinedIndex;
+  }
+
   if (meta_block_name.starts_with(kObsoleteFilterBlockPrefix)) {
     // Obsolete but possible in old files
     return BlockType::kInvalid;
@@ -2647,12 +2892,12 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks(
       // if it was checked on open.
     } else {
       // FIXME? Need to verify checksums of index and filter partitions?
-      s = BlockFetcher(
-              rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
-              read_options, handle, &contents, rep_->ioptions,
-              false /* decompress */, false /*maybe_compressed*/,
-              GetBlockTypeForMetaBlockByName(meta_block_name),
-              UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options)
+      s = BlockFetcher(rep_->file.get(), nullptr /* prefetch buffer */,
+                       rep_->footer, read_options, handle, &contents,
+                       rep_->ioptions, false /* decompress */,
+                       false /*maybe_compressed*/,
+                       GetBlockTypeForMetaBlockByName(meta_block_name),
+                       nullptr /*decompressor*/, rep_->persistent_cache_options)
               .ReadBlockContents();
     }
     if (!s.ok()) {
@@ -2703,7 +2948,7 @@ bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                       const Slice& key) {
   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
-      options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
+      options, /*disable_prefix_seek=*/false, /*input_iter=*/nullptr,
       /*get_context=*/nullptr, /*lookup_context=*/nullptr));
   iiter->Seek(key);
   assert(iiter->status().ok());
@@ -2792,12 +3037,7 @@ uint64_t BlockBasedTable::ApproximateDataOffsetOf(
 }
 
 uint64_t BlockBasedTable::GetApproximateDataSize() {
-  // Should be in table properties unless super old version
-  if (rep_->table_properties) {
-    return rep_->table_properties->data_size;
-  }
-  // Fall back to rough estimate from footer
-  return rep_->footer.metaindex_handle().offset();
+  return rep_->table_properties->data_size;
 }
 
 uint64_t BlockBasedTable::ApproximateOffsetOf(const ReadOptions& read_options,
@@ -2910,9 +3150,9 @@ bool BlockBasedTable::TEST_IndexBlockInCache() const {
 Status BlockBasedTable::GetKVPairsFromDataBlocks(
     const ReadOptions& read_options, std::vector<KVPairBlock>* kv_pair_blocks) {
   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
-      NewIndexIterator(read_options, /*need_upper_bound_check=*/false,
+      NewIndexIterator(read_options, /*disable_prefix_seek=*/false,
                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
-                       /*lookup_contex=*/nullptr));
+                       /*lookup_context=*/nullptr));
 
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
@@ -2932,7 +3172,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks(
     Status tmp_status;
     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
         read_options, blockhandles_iter->value().handle,
-        /*input_iter=*/nullptr, /*type=*/BlockType::kData,
+        /*input_iter=*/nullptr, /*block_type=*/BlockType::kData,
         /*get_context=*/nullptr, /*lookup_context=*/nullptr,
         /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
         /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true));
@@ -2964,7 +3204,8 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks(
   return Status::OK();
 }
 
-Status BlockBasedTable::DumpTable(WritableFile* out_file) {
+Status BlockBasedTable::DumpTable(WritableFile* out_file,
+                                  bool show_sequence_number_type) {
   WritableFileStringStreamAdapter out_file_wrapper(out_file);
   std::ostream out_stream(&out_file_wrapper);
   // Output Footer
@@ -2972,6 +3213,17 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) {
                 "--------------------------------------\n";
   out_stream << "  " << rep_->footer.ToString() << "\n";
 
+  // Output Checksum Type Legend
+  out_stream << "Block Checksum Type Legend:\n"
+                "--------------------------------------\n";
+  out_stream << "  0 = kNoChecksum\n";
+  out_stream << "  1 = kCRC32c\n";
+  out_stream << "  2 = kxxHash\n";
+  out_stream << "  3 = kxxHash64\n";
+  out_stream << "  4 = kXXH3\n";
+  out_stream << "  (This file uses checksum type: "
+             << static_cast<int>(rep_->footer.checksum_type()) << ")\n\n";
+
   // Output MetaIndex
   out_stream << "Metaindex Details:\n"
                 "--------------------------------------\n";
@@ -2982,25 +3234,47 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) {
   Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex,
                                 &metaindex_iter);
   if (s.ok()) {
+    // Print metaindex block checksum
+    DumpBlockChecksumInfo(rep_->footer.metaindex_handle(), ro,
+                          "Metaindex block", out_stream);
+
     for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
          metaindex_iter->Next()) {
       s = metaindex_iter->status();
       if (!s.ok()) {
         return s;
       }
+      // Parse block handle from metaindex value
+      BlockHandle block_handle;
+      Slice input = metaindex_iter->value();
+      Status handle_status = block_handle.DecodeFrom(&input);
+
+      if (!handle_status.ok()) {
+        out_stream << "  Skip the block with type "
+                   << metaindex_iter->key().ToString()
+                   << " due to error: " << handle_status.ToString() << "\n\n";
+        continue;
+      }
+
       if (metaindex_iter->key() == kPropertiesBlockName) {
         out_stream << "  Properties block handle: "
                    << metaindex_iter->value().ToString(true) << "\n";
+        DumpBlockChecksumInfo(block_handle, ro, "Properties block", out_stream);
       } else if (metaindex_iter->key() == kCompressionDictBlockName) {
         out_stream << "  Compression dictionary block handle: "
                    << metaindex_iter->value().ToString(true) << "\n";
+        DumpBlockChecksumInfo(block_handle, ro, "Compression dictionary block",
+                              out_stream);
       } else if (strstr(metaindex_iter->key().ToString().c_str(),
                         "filter.rocksdb.") != nullptr) {
         out_stream << "  Filter block handle: "
                    << metaindex_iter->value().ToString(true) << "\n";
+        DumpBlockChecksumInfo(block_handle, ro, "Filter block", out_stream);
       } else if (metaindex_iter->key() == kRangeDelBlockName) {
         out_stream << "  Range deletion block handle: "
                    << metaindex_iter->value().ToString(true) << "\n";
+        DumpBlockChecksumInfo(block_handle, ro, "Range deletion block",
+                              out_stream);
       }
     }
     out_stream << "\n";
@@ -3032,7 +3306,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) {
 
   // Output compression dictionary
   if (rep_->uncompression_dict_reader) {
-    CachableEntry<UncompressionDict> uncompression_dict;
+    CachableEntry<DecompressorDict> uncompression_dict;
     s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
         nullptr /* prefetch_buffer */, ro, nullptr /* get_context */,
         nullptr /* lookup_context */, &uncompression_dict);
@@ -3057,15 +3331,15 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) {
       out_stream << "Range deletions:\n"
                     "--------------------------------------\n";
       for (; range_del_iter->Valid(); range_del_iter->Next()) {
-        DumpKeyValue(range_del_iter->key(), range_del_iter->value(),
-                     out_stream);
+        DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_stream,
+                     show_sequence_number_type);
       }
       out_stream << "\n";
     }
     delete range_del_iter;
   }
   // Output Data blocks
-  s = DumpDataBlocks(out_stream);
+  s = DumpDataBlocks(out_stream, show_sequence_number_type);
 
   if (!s.ok()) {
     return s;
@@ -3077,15 +3351,65 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) {
   return Status::OK();
 }
 
+void BlockBasedTable::DumpBlockChecksumInfo(const BlockHandle& block_handle,
+                                            const ReadOptions& read_options,
+                                            const char* block_name,
+                                            std::ostream& out_stream) const {
+  if (rep_->footer.GetBlockTrailerSize() == 0) {
+    return;
+  }
+
+  size_t block_size = static_cast<size_t>(block_handle.size());
+  size_t block_size_with_trailer = block_size + kBlockTrailerSize;
+  std::unique_ptr<char[]> raw_block(new char[block_size_with_trailer]);
+  Slice raw_block_slice;
+  IOOptions opts;
+  IODebugContext dbg;
+  IOStatus io_s = rep_->file->PrepareIOOptions(read_options, opts, &dbg);
+  if (io_s.ok()) {
+    io_s = rep_->file->Read(opts, block_handle.offset(),
+                            block_size_with_trailer, &raw_block_slice,
+                            raw_block.get(), /*aligned_buf=*/nullptr, &dbg);
+  }
+  if (io_s.ok() && raw_block_slice.size() == block_size_with_trailer) {
+    const char* data = raw_block_slice.data();
+    uint8_t compression_type_byte = static_cast<uint8_t>(data[block_size]);
+    uint32_t stored_checksum = DecodeFixed32(data + block_size + 1);
+    uint32_t modifier = ChecksumModifierForContext(
+        rep_->footer.base_context_checksum(), block_handle.offset());
+    uint32_t actual_checksum = stored_checksum - modifier;
+    out_stream << "  " << block_name << " checksum type: "
+               << static_cast<int>(rep_->footer.checksum_type())
+               << "  checksum value: 0x" << std::hex << actual_checksum
+               << std::dec << "  offset: " << block_handle.offset()
+               << "  size: " << block_size << "  compression type: "
+               << static_cast<int>(compression_type_byte) << "\n";
+  } else {
+    out_stream << "  ERROR: Failed to read " << block_name << " checksum info";
+    if (!io_s.ok()) {
+      out_stream << " - " << io_s.ToString();
+    } else if (raw_block_slice.size() != block_size_with_trailer) {
+      out_stream << " - read " << raw_block_slice.size() << " bytes, expected "
+                 << block_size_with_trailer;
+    }
+    out_stream << "\n";
+  }
+}
+
 Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) {
   out_stream << "Index Details:\n"
                 "--------------------------------------\n";
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
+
+  // Print index block checksum information
+  DumpBlockChecksumInfo(rep_->index_handle, read_options, "Index block",
+                        out_stream);
+
   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
-      NewIndexIterator(read_options, /*need_upper_bound_check=*/false,
+      NewIndexIterator(read_options, /*disable_prefix_seek=*/false,
                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
-                       /*lookup_contex=*/nullptr));
+                       /*lookup_context=*/nullptr));
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
     out_stream << "Can not read Index Block \n\n";
@@ -3130,13 +3454,14 @@ Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) {
   return Status::OK();
 }
 
-Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
+Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream,
+                                       bool show_sequence_number_type) {
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
-      NewIndexIterator(read_options, /*need_upper_bound_check=*/false,
+      NewIndexIterator(read_options, /*disable_prefix_seek=*/false,
                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
-                       /*lookup_contex=*/nullptr));
+                       /*lookup_context=*/nullptr));
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
     out_stream << "Can not read Index Block \n\n";
@@ -3163,13 +3488,17 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
 
     out_stream << "Data Block # " << block_id << " @ "
                << blockhandles_iter->value().handle.ToString(true) << "\n";
+
+    // Read block checksum information
+    DumpBlockChecksumInfo(bh, read_options, "Data block", out_stream);
+
     out_stream << "--------------------------------------\n";
 
     std::unique_ptr<InternalIterator> datablock_iter;
     Status tmp_status;
     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
         read_options, blockhandles_iter->value().handle,
-        /*input_iter=*/nullptr, /*type=*/BlockType::kData,
+        /*input_iter=*/nullptr, /*block_type=*/BlockType::kData,
         /*get_context=*/nullptr, /*lookup_context=*/nullptr,
         /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
         /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true));
@@ -3187,7 +3516,8 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
         out_stream << "Error reading the block - Skipped \n";
         break;
       }
-      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream);
+      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream,
+                   show_sequence_number_type);
     }
     out_stream << "\n";
   }
@@ -3209,14 +3539,26 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
 }
 
 void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
-                                   std::ostream& out_stream) {
-  InternalKey ikey;
-  ikey.DecodeFrom(key);
+                                   std::ostream& out_stream,
+                                   bool show_sequence_number_type) {
+  ParsedInternalKey result;
+  auto s = ParseInternalKey(key, &result, true);
+  if (!s.ok()) {
+    out_stream << "Error parsing internal key - Skipped \n";
+    return;
+  }
 
-  out_stream << "  HEX    " << ikey.user_key().ToString(true) << ": "
-             << value.ToString(true) << "\n";
+  if (show_sequence_number_type) {
+    out_stream << "  HEX    " << result.user_key.ToString(true)
+               << "  seq: " << result.sequence
+               << "  type: " << std::to_string(result.type) << " : "
+               << value.ToString(true) << "\n";
+  } else {
+    out_stream << "  HEX    " << result.user_key.ToString(true) << ": "
+               << value.ToString(true) << "\n";
+  }
 
-  std::string str_key = ikey.user_key().ToString();
+  std::string str_key = result.user_key.ToString();
   std::string str_value = value.ToString();
   std::string res_key, res_value;
   char cspace = ' ';
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 513e517aa85a..4663a83d5721 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -34,6 +34,7 @@
 #include "table/two_level_iterator.h"
 #include "trace_replay/block_cache_tracer.h"
 #include "util/atomic.h"
+#include "util/cast_util.h"
 #include "util/coro_utils.h"
 #include "util/hash_containers.h"
 
@@ -105,6 +106,7 @@ class BlockBasedTable : public TableReader {
       std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr =
           nullptr,
       const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      UnownedPtr<CompressionManager> compression_manager = nullptr,
       bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false,
       int level = -1, const bool immortal_table = false,
       const SequenceNumber largest_seqno = 0,
@@ -206,10 +208,12 @@ class BlockBasedTable : public TableReader {
   size_t ApproximateMemoryUsage() const override;
 
   // convert SST file to a human readable form
-  Status DumpTable(WritableFile* out_file) override;
+  Status DumpTable(WritableFile* out_file,
+                   bool show_sequence_number_type = false) override;
 
   Status VerifyChecksum(const ReadOptions& readOptions,
-                        TableReaderCaller caller) override;
+                        TableReaderCaller caller,
+                        bool meta_blocks_only = false) override;
 
   void MarkObsolete(uint32_t uncache_aggressiveness) override;
 
@@ -226,11 +230,15 @@ class BlockBasedTable : public TableReader {
 
     // Create an iterator for index access. If iter is null, then a new object
     // is created on the heap, and the callee will have the ownership.
-    // If a non-null iter is passed in, it will be used, and the returned value
-    // is either the same as iter or a new on-heap object that
-    // wraps the passed iter. In the latter case the return value points
-    // to a different object then iter, and the callee has the ownership of the
-    // returned object.
+    // If a non-null iter is passed in, it may be used, and the returned value
+    // is either the same as iter or a new on-heap object.
+    // In the latter case the return value points to a different object then
+    // iter, and the callee has the ownership of the returned object.
+    //
+    // Under all circumstances, the caller MUST use the returned iterator
+    // for further operations. If the returned iterator != iter, then the
+    // caller MUST ensure that iter stays in scope until the returned
+    // iterator is destroyed.
     virtual InternalIteratorBase<IndexValue>* NewIterator(
         const ReadOptions& read_options, bool disable_prefix_seek,
         IndexBlockIter* iter, GetContext* get_context,
@@ -293,11 +301,21 @@ class BlockBasedTable : public TableReader {
   Status GetKVPairsFromDataBlocks(const ReadOptions& read_options,
                                   std::vector<KVPairBlock>* kv_pair_blocks);
 
+  // Look up the block cache for the specified block.
+  // out_parsed_block is set to nullptr if the block is not found in the cache.
   template <typename TBlocklike>
   Status LookupAndPinBlocksInCache(
       const ReadOptions& ro, const BlockHandle& handle,
       CachableEntry<TBlocklike>* out_parsed_block) const;
 
+  // Create the block given in `block_contents` and insert it into block cache.
+  // `out_parsed_block` points to the inserted block if successful.
+  template <typename TBlocklike>
+  Status CreateAndPinBlockInCache(
+      const ReadOptions& ro, const BlockHandle& handle,
+      UnownedPtr<Decompressor> decomp, BlockContents* block_contents,
+      CachableEntry<TBlocklike>* out_parsed_block) const;
+
   struct Rep;
 
   Rep* get_rep() { return rep_; }
@@ -364,7 +382,7 @@ class BlockBasedTable : public TableReader {
   template <typename TBlocklike>
   WithBlocklikeCheck<Status, TBlocklike> MaybeReadBlockAndLoadToCache(
       FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+      const BlockHandle& handle, UnownedPtr<Decompressor> decomp,
       bool for_compaction, CachableEntry<TBlocklike>* block_entry,
       GetContext* get_context, BlockCacheLookupContext* lookup_context,
       BlockContents* contents, bool async_read,
@@ -376,7 +394,7 @@ class BlockBasedTable : public TableReader {
   template <typename TBlocklike>
   WithBlocklikeCheck<Status, TBlocklike> RetrieveBlock(
       FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+      const BlockHandle& handle, UnownedPtr<Decompressor> decomp,
       CachableEntry<TBlocklike>* block_entry, GetContext* get_context,
       BlockCacheLookupContext* lookup_context, bool for_compaction,
       bool use_cache, bool async_read, bool use_block_cache_for_lookup) const;
@@ -397,7 +415,7 @@ class BlockBasedTable : public TableReader {
       const MultiGetRange* batch,
       const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
       Status* statuses, CachableEntry<Block_kData>* results, char* scratch,
-      const UncompressionDict& uncompression_dict, bool use_fs_scratch);
+      UnownedPtr<Decompressor> decomp, bool use_fs_scratch);
 
   // Get the iterator from the index reader.
   //
@@ -413,7 +431,7 @@ class BlockBasedTable : public TableReader {
   //  3. We disallowed any io to be performed, that is, read_options ==
   //     kBlockCacheTier
   InternalIteratorBase<IndexValue>* NewIndexIterator(
-      const ReadOptions& read_options, bool need_upper_bound_check,
+      const ReadOptions& read_options, bool disable_prefix_seek,
       IndexBlockIter* input_iter, GetContext* get_context,
       BlockCacheLookupContext* lookup_context) const;
 
@@ -429,7 +447,7 @@ class BlockBasedTable : public TableReader {
   WithBlocklikeCheck<Status, TBlocklike> GetDataBlockFromCache(
       const Slice& cache_key, BlockCacheInterface<TBlocklike> block_cache,
       CachableEntry<TBlocklike>* block, GetContext* get_context,
-      const UncompressionDict* dict) const;
+      UnownedPtr<Decompressor> decomp) const;
 
   // Put a maybe compressed block to the corresponding block caches.
   // This method will perform decompression against block_contents if needed
@@ -447,8 +465,7 @@ class BlockBasedTable : public TableReader {
       CachableEntry<TBlocklike>* cached_block,
       BlockContents&& uncompressed_block_contents,
       BlockContents&& compressed_block_contents,
-      CompressionType block_comp_type,
-      const UncompressionDict& uncompression_dict,
+      CompressionType block_comp_type, UnownedPtr<Decompressor> decomp,
       MemoryAllocator* memory_allocator, GetContext* get_context) const;
 
   // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
@@ -533,9 +550,15 @@ class BlockBasedTable : public TableReader {
 
   // Helper functions for DumpTable()
   Status DumpIndexBlock(std::ostream& out_stream);
-  Status DumpDataBlocks(std::ostream& out_stream);
+  Status DumpDataBlocks(std::ostream& out_stream,
+                        bool show_sequence_number_type = false);
   void DumpKeyValue(const Slice& key, const Slice& value,
-                    std::ostream& out_stream);
+                    std::ostream& out_stream,
+                    bool show_sequence_number_type = false);
+  void DumpBlockChecksumInfo(const BlockHandle& block_handle,
+                             const ReadOptions& read_options,
+                             const char* block_name,
+                             std::ostream& out_stream) const;
 
   // Returns false if prefix_extractor exists and is compatible with that used
   // in building the table file, otherwise true.
@@ -543,6 +566,12 @@ class BlockBasedTable : public TableReader {
 
   bool TimestampMayMatch(const ReadOptions& read_options) const;
 
+  bool BlockTypeMaybeCompressed(BlockType type) const {
+    return type != BlockType::kFilter &&
+           type != BlockType::kCompressionDictionary &&
+           type != BlockType::kUserDefinedIndex;
+  }
+
   // A cumulative data block file read in MultiGet lower than this size will
   // use a stack buffer
   static constexpr size_t kMultiGetReadStackBufSize = 8192;
@@ -550,6 +579,8 @@ class BlockBasedTable : public TableReader {
   friend class PartitionedFilterBlockReader;
   friend class PartitionedFilterBlockTest;
   friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
+  friend class ReadSet;
+  friend class IODispatcherTest;
 };
 
 // Maintaining state of a two-level iteration on a partitioned index structure.
@@ -589,7 +620,9 @@ struct BlockBasedTable::Rep {
         file_size(_file_size),
         level(_level),
         immortal_table(_immortal_table),
-        user_defined_timestamps_persisted(_user_defined_timestamps_persisted) {}
+        user_defined_timestamps_persisted(_user_defined_timestamps_persisted),
+        fs_prefetch_support(CheckFSFeatureSupport(
+            _ioptions.fs.get(), FSSupportedOps::kFSPrefetch)) {}
   ~Rep() { status.PermitUncheckedError(); }
   const ImmutableOptions& ioptions;
   const EnvOptions& env_options;
@@ -650,9 +683,11 @@ struct BlockBasedTable::Rep {
   Slice min_timestamp;
   Slice max_timestamp;
 
-  // If false, blocks in this file are definitely all uncompressed. Knowing this
-  // before reading individual blocks enables certain optimizations.
-  bool blocks_maybe_compressed = true;
+  // If blocks might be compressed, refers to a decompressor that can decompress
+  // them. (nullptr -> no blocks compressed)  However, if (data) blocks are
+  // dictionary compressed, a dictionary-aware decompressor is needed, which
+  // might live in the block cache.
+  std::shared_ptr<Decompressor> decompressor;
 
   // These describe how index is encoded.
   bool index_has_first_key = false;
@@ -676,6 +711,8 @@ struct BlockBasedTable::Rep {
   // `end_key` for range deletion entries.
   const bool user_defined_timestamps_persisted;
 
+  const bool fs_prefetch_support;
+
   // Set to >0 when the file is known to be obsolete and should have its block
   // cache entries evicted on close. NOTE: when the file becomes obsolete,
   // there could be multiple table cache references that all mark this file as
@@ -686,6 +723,8 @@ struct BlockBasedTable::Rep {
   std::unique_ptr<CacheReservationManager::CacheReservationHandle>
       table_reader_cache_res_handle = nullptr;
 
+  CachableEntry<Block_kUserDefinedIndex> udi_block;
+
   SequenceNumber get_global_seqno(BlockType block_type) const {
     return (block_type == BlockType::kFilterPartitionIndex ||
             block_type == BlockType::kCompressionDictionary)
diff --git a/table/block_based/block_based_table_reader_impl.h b/table/block_based/block_based_table_reader_impl.h
index fd0db73af1de..288d3035565f 100644
--- a/table/block_based/block_based_table_reader_impl.h
+++ b/table/block_based/block_based_table_reader_impl.h
@@ -60,34 +60,33 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
   }
 
   CachableEntry<Block> block;
-  if (rep_->uncompression_dict_reader && block_type == BlockType::kData) {
-    CachableEntry<UncompressionDict> uncompression_dict;
-    // For async scans, don't use the prefetch buffer since an async prefetch
-    // might already be under way and this would invalidate it. Also, the
-    // uncompression dict is typically at the end of the file and would
-    // most likely break the sequentiality of the access pattern.
-    // Same is with auto_readahead_size. It iterates over index to lookup for
-    // data blocks. And this could break the the sequentiality of the access
-    // pattern.
-    s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
-        ((ro.async_io || ro.auto_readahead_size) ? nullptr : prefetch_buffer),
-        ro, get_context, lookup_context, &uncompression_dict);
-    if (!s.ok()) {
-      iter->Invalidate(s);
-      return iter;
+  {
+    CachableEntry<DecompressorDict> dict;
+    Decompressor* decomp = rep_->decompressor.get();
+    if (rep_->uncompression_dict_reader && block_type == BlockType::kData) {
+      // For async scans, don't use the prefetch buffer since an async prefetch
+      // might already be under way and this would invalidate it. Also, the
+      // uncompression dict is typically at the end of the file and would
+      // most likely break the sequentiality of the access pattern.
+      // Same is with auto_readahead_size. It iterates over index to lookup for
+      // data blocks. And this could break the the sequentiality of the access
+      // pattern.
+      s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+          ((ro.async_io || ro.auto_readahead_size) ? nullptr : prefetch_buffer),
+          ro, get_context, lookup_context, &dict);
+      if (!s.ok()) {
+        iter->Invalidate(s);
+        return iter;
+      }
+      assert(dict.GetValue());
+      if (dict.GetValue()) {
+        decomp = dict.GetValue()->decompressor_.get();
+      }
     }
-    const UncompressionDict& dict = uncompression_dict.GetValue()
-                                        ? *uncompression_dict.GetValue()
-                                        : UncompressionDict::GetEmptyDict();
     s = RetrieveBlock(
-        prefetch_buffer, ro, handle, dict, &block.As<IterBlocklike>(),
+        prefetch_buffer, ro, handle, decomp, &block.As<IterBlocklike>(),
         get_context, lookup_context, for_compaction,
         /* use_cache */ true, async_read, use_block_cache_for_lookup);
-  } else {
-    s = RetrieveBlock(
-        prefetch_buffer, ro, handle, UncompressionDict::GetEmptyDict(),
-        &block.As<IterBlocklike>(), get_context, lookup_context, for_compaction,
-        /* use_cache */ true, async_read, use_block_cache_for_lookup);
   }
 
   if (s.IsTryAgain() && async_read) {
diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h
index 7ec152fc8e93..dc9e66214022 100644
--- a/table/block_based/block_based_table_reader_sync_and_async.h
+++ b/table/block_based/block_based_table_reader_sync_and_async.h
@@ -33,12 +33,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
 (const ReadOptions& options, const MultiGetRange* batch,
  const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
  Status* statuses, CachableEntry<Block_kData>* results, char* scratch,
- const UncompressionDict& uncompression_dict, bool use_fs_scratch) const {
+ UnownedPtr<Decompressor> decomp, bool use_fs_scratch) const {
   RandomAccessFileReader* file = rep_->file.get();
   const Footer& footer = rep_->footer;
   const ImmutableOptions& ioptions = rep_->ioptions;
-  size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
-  MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
 
   if (ioptions.allow_mmap_reads) {
     size_t idx_in_batch = 0;
@@ -51,7 +49,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
 
       // XXX: use_cache=true means double cache query?
       statuses[idx_in_batch] = RetrieveBlock(
-          nullptr, options, handle, uncompression_dict,
+          nullptr, options, handle, decomp,
           &results[idx_in_batch].As<Block_kData>(), mget_iter->get_context,
           /* lookup_context */ nullptr,
           /* for_compaction */ false, /* use_cache */ true,
@@ -138,17 +136,18 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
   AlignedBuf direct_io_buf;
   {
     IOOptions opts;
-    IOStatus s = file->PrepareIOOptions(options, opts);
+    IODebugContext dbg;
+    IOStatus s = file->PrepareIOOptions(options, opts, &dbg);
     if (s.ok()) {
 #if defined(WITH_COROUTINES)
       if (file->use_direct_io()) {
 #endif  // WITH_COROUTINES
         s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(),
-                            &direct_io_buf);
+                            &direct_io_buf, &dbg);
 #if defined(WITH_COROUTINES)
       } else {
         co_await batch->context()->reader().MultiReadAsync(
-            file, opts, &read_reqs[0], read_reqs.size(), &direct_io_buf);
+            file, opts, &read_reqs[0], read_reqs.size(), &direct_io_buf, &dbg);
       }
 #endif  // WITH_COROUTINES
     }
@@ -221,7 +220,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
         // in each read request. Checksum is stored in the block trailer,
         // beyond the payload size.
         s = VerifyBlockChecksum(footer, data, handle.size(),
-                                rep_->file->file_name(), handle.offset());
+                                rep_->file->file_name(), handle.offset(),
+                                BlockType::kData);
         RecordTick(ioptions.stats, BLOCK_CHECKSUM_COMPUTE_COUNT);
         if (!s.ok()) {
           RecordTick(ioptions.stats, BLOCK_CHECKSUM_MISMATCH_COUNT);
@@ -240,15 +240,17 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
           // its not a memory mapped file
           Slice result;
           IOOptions opts;
-          IOStatus io_s = file->PrepareIOOptions(options, opts);
+          IODebugContext dbg;
+          IOStatus io_s = file->PrepareIOOptions(options, opts, &dbg);
           opts.verify_and_reconstruct_read = true;
           io_s = file->Read(opts, handle.offset(), BlockSizeWithTrailer(handle),
-                            &result, const_cast<char*>(data), nullptr);
+                            &result, const_cast<char*>(data), nullptr, &dbg);
           if (io_s.ok()) {
             assert(result.data() == data);
             assert(result.size() == BlockSizeWithTrailer(handle));
             s = VerifyBlockChecksum(footer, data, handle.size(),
-                                    rep_->file->file_name(), handle.offset());
+                                    rep_->file->file_name(), handle.offset(),
+                                    BlockType::kData);
             if (s.ok()) {
               RecordTick(ioptions.stats,
                          FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
@@ -264,81 +266,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
     }
 
     if (s.ok()) {
-      // When the blocks share the same underlying buffer (scratch or direct io
-      // buffer), we may need to manually copy the block into heap if the
-      // serialized block has to be inserted into a cache. That falls into the
-      // following cases -
-      // 1. serialized block is not compressed, it needs to be inserted into
-      //    the uncompressed block cache if there is one
-      // 2. If the serialized block is compressed, it needs to be inserted
-      //    into the compressed block cache if there is one
-      //
-      // In all other cases, the serialized block is either uncompressed into a
-      // heap buffer or there is no cache at all.
-      CompressionType compression_type =
-          GetBlockCompressionType(serialized_block);
-      if ((use_fs_scratch || use_shared_buffer) &&
-          compression_type == kNoCompression) {
-        Slice serialized =
-            Slice(req.result.data() + req_offset, BlockSizeWithTrailer(handle));
-        serialized_block = BlockContents(
-            CopyBufferToHeap(GetMemoryAllocator(rep_->table_options),
-                             serialized),
-            handle.size());
-#ifndef NDEBUG
-        serialized_block.has_trailer = true;
-#endif
-      }
-    }
-
-    if (s.ok()) {
-      if (options.fill_cache) {
-        CachableEntry<Block_kData>* block_entry = &results[idx_in_batch];
-        // MaybeReadBlockAndLoadToCache will insert into the block caches if
-        // necessary. Since we're passing the serialized block contents, it
-        // will avoid looking up the block cache
-        s = MaybeReadBlockAndLoadToCache(
-            nullptr, options, handle, uncompression_dict,
-            /*for_compaction=*/false, block_entry, mget_iter->get_context,
-            /*lookup_context=*/nullptr, &serialized_block,
-            /*async_read=*/false, /*use_block_cache_for_lookup=*/true);
-
-        if (!s.ok()) {
-          statuses[idx_in_batch] = s;
-          continue;
-        }
-        // block_entry value could be null if no block cache is present, i.e
-        // BlockBasedTableOptions::no_block_cache is true and no compressed
-        // block cache is configured. In that case, fall
-        // through and set up the block explicitly
-        if (block_entry->GetValue() != nullptr) {
-          continue;
-        }
-      }
-
-      CompressionType compression_type =
-          GetBlockCompressionType(serialized_block);
-      BlockContents contents;
-      if (compression_type != kNoCompression) {
-        UncompressionContext context(compression_type);
-        UncompressionInfo info(context, uncompression_dict, compression_type);
-        s = UncompressSerializedBlock(
-            info, req.result.data() + req_offset, handle.size(), &contents,
-            footer.format_version(), rep_->ioptions, memory_allocator);
-      } else {
-        // There are two cases here:
-        // 1) caller uses the shared buffer (scratch or direct io buffer);
-        // 2) we use the requst buffer.
-        // If scratch buffer or direct io buffer is used, we ensure that
-        // all serialized blocks are copyed to the heap as single blocks. If
-        // scratch buffer is not used, we also have no combined read, so the
-        // serialized block can be used directly.
-        contents = std::move(serialized_block);
-      }
-      if (s.ok()) {
-        results[idx_in_batch].SetOwnedValue(std::make_unique<Block_kData>(
-            std::move(contents), read_amp_bytes_per_bit, ioptions.stats));
-      }
+      s = CreateAndPinBlockInCache(options, handle, decomp, &serialized_block,
+                                   &results[idx_in_batch]);
     }
     statuses[idx_in_batch] = s;
   }
@@ -421,10 +350,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
     {
       MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
                                      sst_file_range.end());
-      CachableEntry<UncompressionDict> uncompression_dict;
-      Status uncompression_dict_status;
-      uncompression_dict_status.PermitUncheckedError();
-      bool uncompression_dict_inited = false;
+      CachableEntry<DecompressorDict> dict;
+      Status dict_status;
+      dict_status.PermitUncheckedError();
+      bool dict_inited = false;
       size_t total_len = 0;
 
       // GetContext for any key will do, as the stats will be aggregated
@@ -466,26 +395,26 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
             continue;
           }
 
-          if (!uncompression_dict_inited && rep_->uncompression_dict_reader) {
-            uncompression_dict_status =
-                rep_->uncompression_dict_reader
-                    ->GetOrReadUncompressionDictionary(
-                        nullptr /* prefetch_buffer */, read_options,
-                        get_context, &metadata_lookup_context,
-                        &uncompression_dict);
-            uncompression_dict_inited = true;
+          if (!dict_inited && rep_->uncompression_dict_reader) {
+            dict_status = rep_->uncompression_dict_reader
+                              ->GetOrReadUncompressionDictionary(
+                                  nullptr /* prefetch_buffer */, read_options,
+                                  get_context, &metadata_lookup_context, &dict);
+            dict_inited = true;
           }
 
-          if (!uncompression_dict_status.ok()) {
-            assert(!uncompression_dict_status.IsNotFound());
-            *(miter->s) = uncompression_dict_status;
+          if (!dict_status.ok()) {
+            assert(!dict_status.IsNotFound());
+            *(miter->s) = dict_status;
             data_block_range.SkipKey(miter);
             sst_file_range.SkipKey(miter);
             continue;
+          } else {
+            assert(!dict_inited || dict.GetValue() != nullptr);
+          }
+          if (dict.GetValue()) {
+            create_ctx.decompressor = dict.GetValue()->decompressor_.get();
           }
-          create_ctx.dict = uncompression_dict.GetValue()
-                                ? uncompression_dict.GetValue()
-                                : &UncompressionDict::GetEmptyDict();
 
           if (v.handle.offset() == prev_offset) {
             // This key can reuse the previous block (later on).
@@ -565,11 +494,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
       if (total_len) {
         char* scratch = nullptr;
         bool use_fs_scratch = false;
-        const UncompressionDict& dict = uncompression_dict.GetValue()
-                                            ? *uncompression_dict.GetValue()
-                                            : UncompressionDict::GetEmptyDict();
-        assert(uncompression_dict_inited || !rep_->uncompression_dict_reader);
-        assert(uncompression_dict_status.ok());
+        assert(dict_inited || !rep_->uncompression_dict_reader);
+        assert(dict_status.ok());
 
         if (!rep_->file->use_direct_io()) {
           if (CheckFSFeatureSupport(rep_->ioptions.fs.get(),
@@ -589,7 +515,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
         // 3. If blocks are compressed and no compressed block cache, use
         //    stack buf
         if (!use_fs_scratch && !rep_->file->use_direct_io() &&
-            rep_->blocks_maybe_compressed) {
+            rep_->decompressor) {
           if (total_len <= kMultiGetReadStackBufSize) {
             scratch = stack_buf;
           } else {
@@ -599,7 +525,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
         }
         CO_AWAIT(RetrieveMultipleBlocks)
         (read_options, &data_block_range, &block_handles, &statuses[0],
-         &results[0], scratch, dict, use_fs_scratch);
+         &results[0], scratch,
+         dict.GetValue() ? dict.GetValue()->decompressor_.get()
+                         : rep_->decompressor.get(),
+         use_fs_scratch);
         if (get_context) {
           ++(get_context->get_context_stats_.num_sst_read);
         }
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 4a18b6fcda84..7b20759caa54 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -22,12 +22,16 @@
 #include "rocksdb/options.h"
 #include "table/block_based/block_based_table_builder.h"
 #include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_iterator.h"
 #include "table/block_based/partitioned_index_iterator.h"
 #include "table/format.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
 #include "util/random.h"
 
+// Enable io_uring support for this test
+extern "C" bool RocksDbIOUringEnable() { return true; }
+
 namespace ROCKSDB_NAMESPACE {
 
 class BlockBasedTableReaderBaseTest : public testing::Test {
@@ -49,7 +53,8 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
   // user defined timestamps and different sequence number to differentiate them
   static std::vector<std::pair<std::string, std::string>> GenerateKVMap(
       int num_block = 2, bool mixed_with_human_readable_string_value = false,
-      size_t ts_sz = 0, bool same_key_diff_ts = false) {
+      size_t ts_sz = 0, bool same_key_diff_ts = false,
+      const Comparator* comparator = BytewiseComparator()) {
     std::vector<std::pair<std::string, std::string>> kv;
 
     SequenceNumber seq_no = 0;
@@ -97,6 +102,10 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
         }
       }
     }
+    auto comparator_name = std::string(comparator->Name());
+    if (comparator_name.find("Reverse") != std::string::npos) {
+      std::reverse(kv.begin(), kv.end());
+    }
     return kv;
   }
 
@@ -125,6 +134,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
 
     InternalKeyComparator comparator(ioptions.user_comparator);
     ColumnFamilyOptions cf_options;
+    cf_options.comparator = ioptions.user_comparator;
     cf_options.prefix_extractor = options_.prefix_extractor;
     MutableCFOptions moptions(cf_options);
     CompressionOptions compression_opts;
@@ -163,16 +173,18 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
                                 bool user_defined_timestamps_persisted = true) {
     const MutableCFOptions moptions(options_);
     TableReaderOptions table_reader_options = TableReaderOptions(
-        ioptions, moptions.prefix_extractor, foptions, comparator,
-        0 /* block_protection_bytes_per_key */, false /* _skip_filters */,
-        false /* _immortal */, false /* _force_direct_prefetch */,
-        -1 /* _level */, nullptr /* _block_cache_tracer */,
+        ioptions, moptions.prefix_extractor, moptions.compression_manager.get(),
+        foptions, comparator, 0 /* block_protection_bytes_per_key */,
+        false /* _skip_filters */, false /* _immortal */,
+        false /* _force_direct_prefetch */, -1 /* _level */,
+        nullptr /* _block_cache_tracer */,
         0 /* _max_file_size_for_l0_meta_pin */, "" /* _cur_db_session_id */,
-        0 /* _cur_file_num */, {} /* _unique_id */, 0 /* _largest_seqno */,
-        0 /* _tail_size */, user_defined_timestamps_persisted);
+        table_num_++ /* _cur_file_num */, {} /* _unique_id */,
+        0 /* _largest_seqno */, 0 /* _tail_size */,
+        user_defined_timestamps_persisted);
 
     std::unique_ptr<RandomAccessFileReader> file;
-    NewFileReader(table_name, foptions, &file);
+    NewFileReader(table_name, foptions, &file, ioptions.statistics.get());
 
     uint64_t file_size = 0;
     ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size));
@@ -190,6 +202,8 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
 
     if (status) {
       *status = s;
+    } else {
+      ASSERT_OK(s);
     }
   }
 
@@ -199,6 +213,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
   Env* env_;
   std::shared_ptr<FileSystem> fs_;
   Options options_;
+  uint64_t table_num_{0};
 
  private:
   void WriteToFile(const std::string& content, const std::string& filename) {
@@ -219,15 +234,82 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
   }
 
   void NewFileReader(const std::string& filename, const FileOptions& opt,
-                     std::unique_ptr<RandomAccessFileReader>* reader) {
+                     std::unique_ptr<RandomAccessFileReader>* reader,
+                     Statistics* stats = nullptr) {
     std::string path = Path(filename);
     std::unique_ptr<FSRandomAccessFile> f;
     ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr));
     reader->reset(new RandomAccessFileReader(std::move(f), path,
-                                             env_->GetSystemClock().get()));
+                                             env_->GetSystemClock().get(),
+                                             /*io_tracer=*/nullptr,
+                                             /*stats=*/stats));
   }
 };
 
+struct BlockBasedTableReaderTestParam {
+  BlockBasedTableReaderTestParam(
+      CompressionType _compression_type, bool _use_direct_reads,
+      BlockBasedTableOptions::IndexType _index_type, bool _no_block_cache,
+      test::UserDefinedTimestampTestMode _udt_test_mode,
+      uint32_t _compression_parallel_threads, uint32_t _compression_dict_bytes,
+      bool _same_key_diff_ts, const Comparator* _comparator, bool _fill_cache,
+      bool _use_async_io, bool _block_align, size_t _super_block_alignment_size,
+      size_t _super_block_alignment_space_overhead_ratio)
+      : compression_type(_compression_type),
+        use_direct_reads(_use_direct_reads),
+        index_type(_index_type),
+        no_block_cache(_no_block_cache),
+        udt_test_mode(_udt_test_mode),
+        compression_parallel_threads(_compression_parallel_threads),
+        compression_dict_bytes(_compression_dict_bytes),
+        same_key_diff_ts(_same_key_diff_ts),
+        comparator(_comparator),
+        fill_cache(_fill_cache),
+        use_async_io(_use_async_io),
+        block_align(_block_align),
+        super_block_alignment_size(_super_block_alignment_size),
+        super_block_alignment_space_overhead_ratio(
+            _super_block_alignment_space_overhead_ratio) {}
+
+  CompressionType compression_type;
+  bool use_direct_reads;
+  BlockBasedTableOptions::IndexType index_type;
+  bool no_block_cache;
+  test::UserDefinedTimestampTestMode udt_test_mode;
+  uint32_t compression_parallel_threads;
+  uint32_t compression_dict_bytes;
+  bool same_key_diff_ts;
+  const Comparator* comparator;
+  bool fill_cache;
+  bool use_async_io;
+  bool block_align;
+  size_t super_block_alignment_size;
+  size_t super_block_alignment_space_overhead_ratio;
+};
+
+// Define operator<< for SpotLockManagerTestParam to stop valgrind from
+// complaining uinitialized value when printing SpotLockManagerTestParam.
+std::ostream& operator<<(std::ostream& os,
+                         const BlockBasedTableReaderTestParam& param) {
+  os << "compression_type: " << CompressionTypeToString(param.compression_type)
+     << " use_direct_reads: " << param.use_direct_reads
+     << " index_type: " << static_cast<int>(param.index_type)
+     << " no_block_cache: " << param.no_block_cache
+     << " udt_test_mode: " << static_cast<int>(param.udt_test_mode)
+     << " compression_parallel_threads: " << param.compression_parallel_threads
+     << " compression_dict_bytes: " << param.compression_dict_bytes
+     << " same_key_diff_ts: " << param.same_key_diff_ts
+     << " comparator: " << param.comparator->Name()
+     << " fill_cache: " << param.fill_cache
+     << " use_async_io: " << param.use_async_io
+     << " block_align: " << param.block_align
+     << " super_block_alignment_size: " << param.super_block_alignment_size
+     << " super_block_alignment_space_overhead_ratio: "
+     << param.super_block_alignment_space_overhead_ratio;
+
+  return os;
+}
+
 // Param 1: compression type
 // Param 2: whether to use direct reads
 // Param 3: Block Based Table Index type
@@ -244,28 +326,33 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
 //          generate keys with different user provided key, same user-defined
 //          timestamps (if udt enabled), same sequence number. This test mode is
 //          used for testing `Get`, `MultiGet`, and `NewIterator`.
+// Param 9: test both the default comparator and a reverse comparator.
 class BlockBasedTableReaderTest
     : public BlockBasedTableReaderBaseTest,
-      public testing::WithParamInterface<std::tuple<
-          CompressionType, bool, BlockBasedTableOptions::IndexType, bool,
-          test::UserDefinedTimestampTestMode, uint32_t, uint32_t, bool>> {
+      public testing::WithParamInterface<BlockBasedTableReaderTestParam> {
  protected:
   void SetUp() override {
-    compression_type_ = std::get<0>(GetParam());
-    use_direct_reads_ = std::get<1>(GetParam());
-    test::UserDefinedTimestampTestMode udt_test_mode = std::get<4>(GetParam());
+    auto param = GetParam();
+    compression_type_ = param.compression_type;
+    use_direct_reads_ = param.use_direct_reads;
+    test::UserDefinedTimestampTestMode udt_test_mode = param.udt_test_mode;
     udt_enabled_ = test::IsUDTEnabled(udt_test_mode);
     persist_udt_ = test::ShouldPersistUDT(udt_test_mode);
-    compression_parallel_threads_ = std::get<5>(GetParam());
-    compression_dict_bytes_ = std::get<6>(GetParam());
-    same_key_diff_ts_ = std::get<7>(GetParam());
+    compression_parallel_threads_ = param.compression_parallel_threads;
+    compression_dict_bytes_ = param.compression_dict_bytes;
+    same_key_diff_ts_ = param.same_key_diff_ts;
+    comparator_ = param.comparator;
     BlockBasedTableReaderBaseTest::SetUp();
   }
 
   void ConfigureTableFactory() override {
     BlockBasedTableOptions opts;
-    opts.index_type = std::get<2>(GetParam());
-    opts.no_block_cache = std::get<3>(GetParam());
+    auto param = GetParam();
+    opts.index_type = param.index_type;
+    opts.no_block_cache = param.no_block_cache;
+    opts.super_block_alignment_size = param.super_block_alignment_size;
+    opts.super_block_alignment_space_overhead_ratio =
+        param.super_block_alignment_space_overhead_ratio;
     opts.filter_policy.reset(NewBloomFilterPolicy(10, false));
     opts.partition_filters =
         opts.index_type ==
@@ -284,6 +371,7 @@ class BlockBasedTableReaderTest
   uint32_t compression_parallel_threads_;
   uint32_t compression_dict_bytes_;
   bool same_key_diff_ts_;
+  const Comparator* comparator_{};
 };
 
 class BlockBasedTableReaderGetTest : public BlockBasedTableReaderTest {};
@@ -987,61 +1075,924 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) {
   ASSERT_EQ(s.code(), Status::kCorruption);
 }
 
-// Param 1: compression type
-// Param 2: whether to use direct reads
-// Param 3: Block Based Table Index type, partitioned filters are also enabled
-//          when index type is kTwoLevelIndexSearch
-// Param 4: BBTO no_block_cache option
-// Param 5: test mode for the user-defined timestamp feature
-// Param 6: number of parallel compression threads
-// Param 7: CompressionOptions.max_dict_bytes and
-//          CompressionOptions.max_dict_buffer_bytes. This enable/disables
-//          compression dictionary.
-// Param 8: test mode to specify the pattern for generating key / value pairs.
+class BlockBasedTableReaderMultiScanTest : public BlockBasedTableReaderTest {
+ public:
+  void SetUp() override {
+    BlockBasedTableReaderTest::SetUp();
+    options_.comparator = comparator_;
+  }
+};
+
+class BlockBasedTableReaderMultiScanAsyncIOTest
+    : public BlockBasedTableReaderMultiScanTest {};
+
+// TODO: test no block cache case
+TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
+  auto param = GetParam();
+  auto fill_cache = param.fill_cache;
+  auto use_async_io = param.use_async_io;
+
+  options_.statistics = CreateDBStatistics();
+  std::shared_ptr<FileSystem> fs = options_.env->GetFileSystem();
+  ReadOptions read_opts;
+  read_opts.fill_cache = fill_cache;
+  size_t ts_sz = options_.comparator->timestamp_size();
+  std::vector<std::pair<std::string, std::string>> kv =
+      BlockBasedTableReaderBaseTest::GenerateKVMap(
+          100 /* num_block */,
+          true /* mixed_with_human_readable_string_value */, ts_sz,
+          same_key_diff_ts_, comparator_);
+  std::string table_name = "BlockBasedTableReaderTest_NewIterator" +
+                           CompressionTypeToString(compression_type_) +
+                           "_async" + std::to_string(use_async_io);
+  ImmutableOptions ioptions(options_);
+  // Only insert 60 out of 100 blocks
+  CreateTable(table_name, ioptions, compression_type_,
+              std::vector<std::pair<std::string, std::string>>{
+                  kv.begin() + 20 * kEntriesPerBlock,
+                  kv.begin() + 80 * kEntriesPerBlock},
+              compression_parallel_threads_, compression_dict_bytes_);
+
+  std::unique_ptr<BlockBasedTable> table;
+  FileOptions foptions;
+  foptions.use_direct_reads = use_direct_reads_;
+  InternalKeyComparator comparator(options_.comparator);
+  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
+                           true /* bool prefetch_index_and_filter_in_cache */,
+                           nullptr /* status */, persist_udt_);
+
+  // 1. Should coalesce into a single I/O
+  std::unique_ptr<InternalIterator> iter;
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  MultiScanArgs scan_options(comparator_);
+  scan_options.use_async_io = use_async_io;
+  scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[31 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[32 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[33 * kEntriesPerBlock].first));
+  auto read_count_before =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+
+  iter->Prepare(&scan_options);
+  iter->Seek(kv[30 * kEntriesPerBlock].first);
+  for (size_t i = 30 * kEntriesPerBlock; i <= 31 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->status().ok()) << iter->status().ToString();
+    ASSERT_TRUE(iter->Valid()) << i;
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  // Iter may still be valid after scan range. Upper layer (DBIter) handles
+  // exact upper bound checking. So we don't check !iter->Valid() here.
+  ASSERT_OK(iter->status());
+  iter->Seek(kv[32 * kEntriesPerBlock].first);
+  for (size_t i = 32 * kEntriesPerBlock; i < 33 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+  auto read_count_after =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  ASSERT_EQ(read_count_before + 1, read_count_after);
+
+  // 2. No IO coalesce, should do MultiRead/ReadAsync with 2 read requests.
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(ExtractUserKey(kv[40 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[45 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[75 * kEntriesPerBlock].first));
+
+  read_count_before =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  iter->Prepare(&scan_options);
+
+  iter->Seek(kv[40 * kEntriesPerBlock].first);
+  for (size_t i = 40 * kEntriesPerBlock; i < 45 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+  iter->Seek(kv[70 * kEntriesPerBlock].first);
+  for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+
+  read_count_after =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  ASSERT_EQ(read_count_before + 2, read_count_after);
+
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  // 3. Tests I/O excludes blocks already in cache.
+  // Reading blocks from 40-79
+  // From reads above, blocks 40-44 and 70-74 already in cache
+  // So we should read 45-69, 75-79 in two I/Os.
+  // If fill_cache is false, then we'll do one giant I/O.
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.use_async_io = use_async_io;
+  scan_options.insert(ExtractUserKey(kv[40 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[80 * kEntriesPerBlock].first));
+  read_count_before =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  iter->Prepare(&scan_options);
+  read_count_after =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  if (!use_async_io) {
+    if (!fill_cache) {
+      ASSERT_EQ(read_count_before + 1, read_count_after);
+    } else {
+      ASSERT_EQ(read_count_before + 2, read_count_after);
+    }
+  } else {
+    // stat is recorded in async callback which happens in Poll(), and
+    // Poll() happens during scanning.
+    ASSERT_EQ(read_count_before, read_count_after);
+  }
+
+  iter->Seek(kv[40 * kEntriesPerBlock].first);
+  for (size_t i = 40 * kEntriesPerBlock; i < 80 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  read_count_after =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  if (!fill_cache) {
+    ASSERT_EQ(read_count_before + 1, read_count_after);
+  } else {
+    ASSERT_EQ(read_count_before + 2, read_count_after);
+  }
+
+  // 4. Check cases when Seek key does not match start key in ScanOptions
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.use_async_io = use_async_io;
+  scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[40 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[60 * kEntriesPerBlock].first));
+  iter->Prepare(&scan_options);
+  // Match start key
+  iter->Seek(kv[30 * kEntriesPerBlock].first);
+  for (size_t i = 30 * kEntriesPerBlock; i < 40 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+
+  // Seek a key that is larger than next start key is allowed, as long as it is
+  // larger than the previous key
+  iter->Seek(kv[50 * kEntriesPerBlock + 1].first);
+  ASSERT_OK(iter->status());
+
+  // Check seek key going backward
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.use_async_io = use_async_io;
+  scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[31 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[32 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[33 * kEntriesPerBlock].first));
+  iter->Prepare(&scan_options);
+  iter->Seek(kv[32 * kEntriesPerBlock].first);
+  auto key = iter->key();
+  ASSERT_OK(iter->status());
+  iter->Seek(kv[30 * kEntriesPerBlock].first);
+  // When seek key goes backward, it is adjusted to the last seeked position.
+  // Assert the key read is same as before.
+  ASSERT_EQ(key, iter->key());
+  ASSERT_OK(iter->status());
+
+  // Test prefetch limit reached.
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.use_async_io = use_async_io;
+  scan_options.max_prefetch_size = 1024;  // less than block size
+  scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[40 * kEntriesPerBlock].first));
+  iter->Prepare(&scan_options);
+  iter->Seek(kv[31 * kEntriesPerBlock].first);
+  ASSERT_TRUE(iter->status().IsIncomplete());
+
+  // Randomly seek keys on the file, as long as the key is moving forward, it
+  // is allowed
+
+  if (use_async_io) {
+    // Skip following test when async io is enabled. There is some issue with
+    // IO_uring that I am still trying to root cause.
+    // TODO : enable the test again with async IO
+    return;
+  }
+  for (int i = 0; i < 100; i++) {
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+    scan_options = MultiScanArgs(comparator_);
+    scan_options.use_async_io = use_async_io;
+    scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[10 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[25 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[35 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[35 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[40 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[45 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[50 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[75 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[85 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[85 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[95 * kEntriesPerBlock].first));
+
+    iter->Prepare(&scan_options);
+
+    auto random_seed = static_cast<uint32_t>(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(
+            std::chrono::system_clock::now().time_since_epoch())
+            .count());
+    Random rnd(random_seed);
+    std::cout << random_seed << std::endl;
+    SCOPED_TRACE("Random seed " + std::to_string(random_seed));
+
+    // Search key always start from the start key of first prepared range.
+    int last_read_key_index = rnd.Uniform(100) + 5 * kEntriesPerBlock;
+    while (last_read_key_index < 100 * kEntriesPerBlock) {
+      iter->Seek(kv[last_read_key_index].first);
+      EXPECT_OK(iter->status());
+      // iterate for a few keys
+      while (iter->Valid()) {
+        iter->Next();
+        last_read_key_index++;
+        EXPECT_OK(iter->status());
+      }
+      last_read_key_index += rnd.Uniform(100);
+    }
+  }
+}
+
+TEST_P(BlockBasedTableReaderMultiScanTest, MultiScanPrefetchSizeLimit) {
+  if (compression_type_ != kNoCompression) {
+    // This test relies on block sizes to be close to what's set in option.
+    ROCKSDB_GTEST_BYPASS("This test assumes no compression.");
+    return;
+  }
+  ReadOptions read_opts;
+  size_t ts_sz = options_.comparator->timestamp_size();
+
+  // Generate data that spans multiple blocks
+  std::vector<std::pair<std::string, std::string>> kv =
+      BlockBasedTableReaderBaseTest::GenerateKVMap(
+          20 /* num_block */, true /* mixed_with_human_readable_string_value */,
+          ts_sz, same_key_diff_ts_, comparator_);
+
+  std::string table_name = "BlockBasedTableReaderTest_PrefetchSizeLimit" +
+                           CompressionTypeToString(compression_type_);
+
+  ImmutableOptions ioptions(options_);
+  CreateTable(table_name, ioptions, compression_type_, kv,
+              compression_parallel_threads_, compression_dict_bytes_);
+
+  std::unique_ptr<BlockBasedTable> table;
+  FileOptions foptions;
+  foptions.use_direct_reads = use_direct_reads_;
+  InternalKeyComparator comparator(options_.comparator);
+  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
+                           true /* bool prefetch_index_and_filter_in_cache */,
+                           nullptr /* status */, persist_udt_);
+
+  // Default block size is 4KB
+  //
+  // Tests when no block is loaded
+  {
+    std::unique_ptr<InternalIterator> iter;
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    MultiScanArgs scan_options(comparator_);
+    scan_options.max_prefetch_size = 1024;  // less than block size
+    scan_options.insert(ExtractUserKey(kv[0].first),
+                        ExtractUserKey(kv[5].first));
+
+    iter->Prepare(&scan_options);
+
+    // Should be able to scan the first block, but not more
+    iter->Seek(kv[0].first);
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
+  }
+
+  // Some blocks are loaded
+  {
+    std::unique_ptr<InternalIterator> iter;
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    MultiScanArgs scan_options(comparator_);
+    scan_options.max_prefetch_size = 9 * 1024;  // 9KB - 2 blocks with buffer
+    scan_options.insert(ExtractUserKey(kv[1 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[8 * kEntriesPerBlock].first));
+
+    iter->Prepare(&scan_options);
+    iter->Seek(kv[1 * kEntriesPerBlock].first);
+    size_t scanned_keys = 0;
+
+    // Should be able to scan up to 2 blocks worth of data
+    while (iter->Valid()) {
+      ASSERT_EQ(iter->key().ToString(),
+                kv[scanned_keys + 1 * kEntriesPerBlock].first);
+      iter->Next();
+      scanned_keys++;
+    }
+
+    ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
+    ASSERT_EQ(scanned_keys, 2 * kEntriesPerBlock);
+  }
+
+  // Tests with some block loaded in cache already:
+  // Blocks 1 and 2 are already in cache by the above test.
+  // Here we try blocks 0 - 5, with prefetch limit to 3 blocks, and expect to
+  // read 3 blocks.
+  {
+    std::unique_ptr<InternalIterator> iter;
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    MultiScanArgs scan_options(comparator_);
+    scan_options.max_prefetch_size = 3 * 4 * 1024 + 1024;  // 3 blocks + 1KB
+    scan_options.insert(ExtractUserKey(kv[0].first),
+                        ExtractUserKey(kv[5 * kEntriesPerBlock].first));
+
+    iter->Prepare(&scan_options);
+    iter->Seek(kv[0].first);
+    size_t scanned_keys = 0;
+    // Should only read 3 blocks (blocks 0, 1, 2)
+    // already cached.
+    while (iter->Valid()) {
+      ASSERT_EQ(iter->key().ToString(), kv[scanned_keys].first);
+      iter->Next();
+      scanned_keys++;
+    }
+    ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
+    ASSERT_EQ(scanned_keys, 3 * kEntriesPerBlock);
+  }
+
+  // Multiple scan ranges with prefetch limit
+  {
+    std::unique_ptr<InternalIterator> iter;
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    MultiScanArgs scan_options(comparator_);
+    scan_options.max_prefetch_size = 5 * 4 * 1024 + 1024;  // 5 blocks + 1KB
+    // Will read 5 entries from first scan range, and 4 blocks from the second
+    // scan range
+    scan_options.insert(ExtractUserKey(kv[0].first),
+                        ExtractUserKey(kv[5].first));
+    scan_options.insert(ExtractUserKey(kv[12 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[17 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[18 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[19 * kEntriesPerBlock].first));
+
+    iter->Prepare(&scan_options);
+
+    iter->Seek(kv[0].first);
+    size_t scanned_keys = 0;
+    size_t key_idx = 0;
+    while (iter->Valid()) {
+      ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
+      iter->Next();
+      scanned_keys++;
+      key_idx++;
+      if (key_idx == 5) {
+        iter->Seek(kv[12 * kEntriesPerBlock].first);
+        key_idx = 12 * kEntriesPerBlock;
+      }
+    }
+    ASSERT_EQ(scanned_keys, 5 + 4 * kEntriesPerBlock);
+    ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
+  }
+
+  // Prefetch limit is big enough for all scan ranges.
+  {
+    std::unique_ptr<InternalIterator> iter;
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    MultiScanArgs scan_options(comparator_);
+    scan_options.max_prefetch_size = 10 * 1024 * 1024;  // 10MB
+    scan_options.insert(ExtractUserKey(kv[0].first),
+                        ExtractUserKey(kv[5].first));
+    scan_options.insert(ExtractUserKey(kv[8 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[12 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[18 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[19 * kEntriesPerBlock].first));
+
+    iter->Prepare(&scan_options);
+
+    iter->Seek(kv[0].first);
+    size_t scanned_keys = 0;
+    size_t key_idx = 0;
+    // Scan first range
+    while (iter->Valid() && key_idx < 5) {
+      ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
+      iter->Next();
+      scanned_keys++;
+      key_idx++;
+    }
+    // Move to second range
+    iter->Seek(kv[8 * kEntriesPerBlock].first);
+    key_idx = 8 * kEntriesPerBlock;
+    while (iter->Valid() && key_idx < 12 * kEntriesPerBlock) {
+      ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
+      iter->Next();
+      scanned_keys++;
+      key_idx++;
+    }
+    // Move to third range
+    iter->Seek(kv[18 * kEntriesPerBlock].first);
+    key_idx = 18 * kEntriesPerBlock;
+    while (iter->Valid() && key_idx < 19 * kEntriesPerBlock) {
+      ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
+      iter->Next();
+      scanned_keys++;
+      key_idx++;
+    }
+    // Should not hit prefetch limit
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(scanned_keys, 5 + 4 * kEntriesPerBlock + 1 * kEntriesPerBlock);
+  }
+}
+
+TEST_P(BlockBasedTableReaderMultiScanTest, MultiScanUnpinPreviousBlocks) {
+  std::vector<std::pair<std::string, std::string>> kv =
+      BlockBasedTableReaderBaseTest::GenerateKVMap(
+          30 /* num_block */, true /* mixed_with_human_readable_string_value */,
+          comparator_->timestamp_size(), same_key_diff_ts_, comparator_);
+  std::string table_name = "BlockBasedTableReaderTest_UnpinPreviousBlocks" +
+                           CompressionTypeToString(compression_type_);
+  ImmutableOptions ioptions(options_);
+  CreateTable(table_name, ioptions, compression_type_, kv,
+              compression_parallel_threads_, compression_dict_bytes_);
+
+  std::unique_ptr<BlockBasedTable> table;
+  FileOptions foptions;
+  foptions.use_direct_reads = use_direct_reads_;
+  InternalKeyComparator comparator(options_.comparator);
+  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
+                           true /* bool prefetch_index_and_filter_in_cache */,
+                           nullptr /* status */, persist_udt_);
+
+  ReadOptions read_opts;
+  std::unique_ptr<InternalIterator> iter;
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  // Range 1: block 0-4, Range 2: block 4-4, Range 3: block 5-15
+  scan_options.insert(ExtractUserKey(kv[0 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[5 * kEntriesPerBlock - 5].first));
+  scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock - 4].first),
+                      ExtractUserKey(kv[5 * kEntriesPerBlock - 3].first));
+  scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock - 2].first),
+                      ExtractUserKey(kv[15 * kEntriesPerBlock - 1].first));
+
+  iter->Prepare(&scan_options);
+  auto* bbiter = dynamic_cast<BlockBasedTableIterator*>(iter.get());
+  ASSERT_TRUE(bbiter);
+  for (int block = 0; block < 15; ++block) {
+    ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
+  }
+
+  // MultiScan require seeks to be called in scan_option order
+  iter->Seek(kv[0 * kEntriesPerBlock].first);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+
+  // Seek to second range - should unpin blocks from first range
+  iter->Seek(kv[5 * kEntriesPerBlock - 4].first);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ(iter->key(), kv[5 * kEntriesPerBlock - 4].first);
+  ASSERT_EQ(iter->value(), kv[5 * kEntriesPerBlock - 4].second);
+
+  // The last block (block 4) is shared with the second range, so
+  // it's not unpinned yet.
+  for (int block = 0; block < 4; ++block) {
+    ASSERT_FALSE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
+  }
+  // Blocks from second range still in cache.
+  // We skip block 4 here since it's ownership is moved to the actual data
+  // block iter.
+  for (int block = 5; block < 15; ++block) {
+    ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
+  }
+
+  iter->Seek(kv[5 * kEntriesPerBlock - 2].first);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ(iter->key(), kv[5 * kEntriesPerBlock - 2].first);
+  ASSERT_EQ(iter->value(), kv[5 * kEntriesPerBlock - 2].second);
+
+  // Still pinned
+  for (int block = 5; block < 15; ++block) {
+    ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
+  }
+}
+
+// Test that fs_prefetch_support flag is correctly initialized during table
+// construction based on filesystem capabilities
+TEST_P(BlockBasedTableReaderTest, FSPrefetchSupportInitializedCorrectly) {
+  class ConfigurablePrefetchFS : public FileSystemWrapper {
+   public:
+    ConfigurablePrefetchFS(const std::shared_ptr<FileSystem>& target,
+                           bool support_prefetch)
+        : FileSystemWrapper(target), support_prefetch_(support_prefetch) {}
+
+    static const char* kClassName() { return "ConfigurablePrefetchFS"; }
+    const char* Name() const override { return kClassName(); }
+
+    void SupportedOps(int64_t& supported_ops) override {
+      target()->SupportedOps(supported_ops);
+      if (!support_prefetch_) {  // Disable prefetch support if requested
+        supported_ops &= ~(1 << FSSupportedOps::kFSPrefetch);
+      }
+    }
+
+   private:
+    bool support_prefetch_;
+  };
+
+  // Prepare test table
+  Options options;
+  options.persist_user_defined_timestamps = persist_udt_;
+  if (udt_enabled_) {
+    options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  }
+  size_t ts_sz = options.comparator->timestamp_size();
+  auto kv = BlockBasedTableReaderBaseTest::GenerateKVMap(5, true, ts_sz);
+  std::string table_name = "BlockBasedTableReaderTest_BlockPrefetcherTest" +
+                           CompressionTypeToString(compression_type_);
+  ImmutableOptions ioptions(options);
+  CreateTable(table_name, ioptions, compression_type_, kv,
+              compression_parallel_threads_, compression_dict_bytes_);
+
+  // Test Case 1: Filesystem supports prefetch, fs_prefetch_support should be
+  // true
+  {
+    auto fs_with_prefetch = std::make_shared<ConfigurablePrefetchFS>(
+        env_->GetFileSystem(), true /* support_prefetch */);
+    std::unique_ptr<Env> env_wrapper(
+        new CompositeEnvWrapper(env_, fs_with_prefetch));
+    options.env = env_wrapper.get();
+
+    FileOptions fopts;
+    fopts.use_direct_reads = use_direct_reads_;
+    InternalKeyComparator cmp(options.comparator);
+    ImmutableOptions iopts(options);
+
+    std::unique_ptr<BlockBasedTable> table;
+    NewBlockBasedTableReader(fopts, iopts, cmp, table_name, &table,
+                             false /* prefetch_index_and_filter_in_cache */,
+                             nullptr, persist_udt_);
+
+    ASSERT_TRUE(table->get_rep()->fs_prefetch_support);
+    ASSERT_TRUE(CheckFSFeatureSupport(fs_with_prefetch.get(),
+                                      FSSupportedOps::kFSPrefetch));
+  }
+
+  // Test Case 2: Filesystem doesn't support prefetch, fs_prefetch_support
+  // should be false
+  {
+    auto fs_without_prefetch = std::make_shared<ConfigurablePrefetchFS>(
+        env_->GetFileSystem(), false /* support_prefetch */);
+    std::unique_ptr<Env> env_wrapper(
+        new CompositeEnvWrapper(env_, fs_without_prefetch));
+    options.env = env_wrapper.get();
+
+    FileOptions fopts;
+    fopts.use_direct_reads = use_direct_reads_;
+    InternalKeyComparator cmp(options.comparator);
+    ImmutableOptions iopts(options);
+
+    std::unique_ptr<BlockBasedTable> table;
+    NewBlockBasedTableReader(fopts, iopts, cmp, table_name, &table,
+                             false /* prefetch_index_and_filter_in_cache */,
+                             nullptr, persist_udt_);
+
+    ASSERT_FALSE(table->get_rep()->fs_prefetch_support);
+    ASSERT_FALSE(CheckFSFeatureSupport(fs_without_prefetch.get(),
+                                       FSSupportedOps::kFSPrefetch));
+  }
+}
+std::vector<BlockBasedTableReaderTestParam> GenerateCombinedParameters(
+    const std::vector<CompressionType>& compression_types,
+    const std::vector<bool>& use_direct_read_flags,
+    const std::vector<BlockBasedTableOptions::IndexType>& index_types,
+    const std::vector<bool>& no_block_cache_flags,
+    const std::vector<test::UserDefinedTimestampTestMode>& udt_test_modes,
+    const std::vector<int>& parallel_compression_thread_counts,
+    const std::vector<uint32_t>& compression_dict_byte_counts,
+    const std::vector<bool>& same_key_diff_ts_flags,
+    const std::vector<const Comparator*>& comparators,
+    const std::vector<bool>& fill_cache_flags,
+    const std::vector<bool>& use_async_io_flags,
+    const std::vector<bool>& block_align_flags,
+    const std::vector<size_t>& super_block_alignment_sizes,
+    const std::vector<size_t>& super_block_alignment_space_overhead_ratios) {
+  std::vector<BlockBasedTableReaderTestParam> params;
+  for (const auto& compression_type : compression_types) {
+    for (auto use_direct_read : use_direct_read_flags) {
+      for (const auto& index_type : index_types) {
+        for (auto no_block_cache : no_block_cache_flags) {
+          for (const auto& udt_test_mode : udt_test_modes) {
+            for (auto parallel_compression_thread_count :
+                 parallel_compression_thread_counts) {
+              for (auto compression_dict_byte_count :
+                   compression_dict_byte_counts) {
+                for (auto same_key_diff_ts_flag : same_key_diff_ts_flags) {
+                  for (const auto& comparator : comparators) {
+                    for (auto fill_cache : fill_cache_flags) {
+                      for (auto use_async_io : use_async_io_flags) {
+                        for (auto block_align : block_align_flags) {
+                          for (auto super_block_alignment_size :
+                               super_block_alignment_sizes) {
+                            for (
+                                auto
+                                    super_block_alignment_space_overhead_ratio :
+                                super_block_alignment_space_overhead_ratios) {
+                              if (super_block_alignment_size == 0) {
+                                // Override padding size to 0 if alignment size
+                                // is 0, which means no super block alignment
+                                super_block_alignment_space_overhead_ratio = 0;
+                              }
+                              params.emplace_back(
+                                  compression_type, use_direct_read, index_type,
+                                  no_block_cache, udt_test_mode,
+                                  parallel_compression_thread_count,
+                                  compression_dict_byte_count,
+                                  same_key_diff_ts_flag, comparator, fill_cache,
+                                  use_async_io, block_align,
+                                  super_block_alignment_size,
+                                  super_block_alignment_space_overhead_ratio);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return params;
+}
+
+std::vector<bool> Bool() { return {true, false}; }
+
+struct BlockBasedTableReaderTestParamBuilder {
+  BlockBasedTableReaderTestParamBuilder() {
+    // Default values
+    compression_types = GetSupportedCompressions();
+    use_direct_read_flags = Bool();
+    index_types = {
+        BlockBasedTableOptions::IndexType::kBinarySearch,
+        BlockBasedTableOptions::IndexType::kHashSearch,
+        BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+        BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey};
+    no_block_cache_flags = {false};
+    udt_test_modes = {
+        test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp};
+    parallel_compression_thread_counts = {1, 2};
+    compression_dict_byte_counts = {0, 4096};
+    same_key_diff_ts_flags = {false};
+    comparators = {BytewiseComparator()};
+    fill_cache_flags = {true};
+    use_async_io_flags = {false};
+    block_align_flags = {false};
+    super_block_alignment_sizes = {0};
+    super_block_alignment_space_overhead_ratios = {128};
+  }
+
+  // builder methods for each member
+  BlockBasedTableReaderTestParamBuilder& WithCompressionTypes(
+      const std::vector<CompressionType>& _compression_types) {
+    compression_types = _compression_types;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithUseDirectReadFlags(
+      const std::vector<bool>& _use_direct_read_flags) {
+    use_direct_read_flags = _use_direct_read_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithIndexTypes(
+      const std::vector<BlockBasedTableOptions::IndexType>& _index_types) {
+    index_types = _index_types;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithNoBlockCacheFlags(
+      const std::vector<bool>& _no_block_cache_flags) {
+    no_block_cache_flags = _no_block_cache_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithUDTTestModes(
+      const std::vector<test::UserDefinedTimestampTestMode>& _udt_test_modes) {
+    udt_test_modes = _udt_test_modes;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithParallelCompressionThreadCounts(
+      const std::vector<int>& _parallel_compression_thread_counts) {
+    parallel_compression_thread_counts = _parallel_compression_thread_counts;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithCompressionDictByteCounts(
+      const std::vector<uint32_t>& _compression_dict_byte_counts) {
+    compression_dict_byte_counts = _compression_dict_byte_counts;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithSameKeyDiffTsFlags(
+      const std::vector<bool>& _same_key_diff_ts_flags) {
+    same_key_diff_ts_flags = _same_key_diff_ts_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithComparators(
+      const std::vector<const Comparator*>& _comparators) {
+    comparators = _comparators;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithFillCacheFlags(
+      const std::vector<bool>& _fill_cache_flags) {
+    fill_cache_flags = _fill_cache_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithUseAsyncIoFlags(
+      const std::vector<bool>& _use_async_io_flags) {
+    use_async_io_flags = _use_async_io_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithBlockAlignFlags(
+      const std::vector<bool>& _block_align_flags) {
+    block_align_flags = _block_align_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithSuperBlockAlignmentSizes(
+      const std::vector<size_t>& _super_block_alignment_sizes) {
+    super_block_alignment_sizes = _super_block_alignment_sizes;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder&
+  WithSuperBlockAlignmentSpaceOverheadRatios(
+      const std::vector<size_t>& _super_block_alignment_space_overhead_ratios) {
+    super_block_alignment_space_overhead_ratios =
+        _super_block_alignment_space_overhead_ratios;
+    return *this;
+  }
+
+  std::vector<BlockBasedTableReaderTestParam> build() {
+    return GenerateCombinedParameters(
+        compression_types, use_direct_read_flags, index_types,
+        no_block_cache_flags, udt_test_modes,
+        parallel_compression_thread_counts, compression_dict_byte_counts,
+        same_key_diff_ts_flags, comparators, fill_cache_flags,
+        use_async_io_flags, block_align_flags, super_block_alignment_sizes,
+        super_block_alignment_space_overhead_ratios);
+  }
+
+  std::vector<CompressionType> compression_types;
+  std::vector<bool> use_direct_read_flags;
+  std::vector<BlockBasedTableOptions::IndexType> index_types;
+  std::vector<bool> no_block_cache_flags;
+  std::vector<test::UserDefinedTimestampTestMode> udt_test_modes;
+  std::vector<int> parallel_compression_thread_counts;
+  std::vector<uint32_t> compression_dict_byte_counts;
+  std::vector<bool> same_key_diff_ts_flags;
+  std::vector<const Comparator*> comparators;
+  std::vector<bool> fill_cache_flags;
+  std::vector<bool> use_async_io_flags;
+  std::vector<bool> block_align_flags;
+  std::vector<size_t> super_block_alignment_sizes;
+  std::vector<size_t> super_block_alignment_space_overhead_ratios;
+};
+
+std::vector<bool> IOUringFlags() {
+#ifdef ROCKSDB_IOURING_PRESENT
+  return {false, true};
+#else
+  return {false};
+#endif
+}
+
 INSTANTIATE_TEST_CASE_P(
     BlockBasedTableReaderTest, BlockBasedTableReaderTest,
-    ::testing::Combine(
-        ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(),
-        ::testing::Values(
-            BlockBasedTableOptions::IndexType::kBinarySearch,
-            BlockBasedTableOptions::IndexType::kHashSearch,
-            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
-            BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey),
-        ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
-        ::testing::Values(1, 2), ::testing::Values(0, 4096),
-        ::testing::Values(false)));
+    ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder()
+                            .WithUDTTestModes(test::GetUDTTestModes())
+                            .build()));
+
+INSTANTIATE_TEST_CASE_P(
+    BlockBasedTableReaderMultiScanAsyncIOTest,
+    BlockBasedTableReaderMultiScanAsyncIOTest,
+    ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder()
+                            .WithComparators({BytewiseComparator(),
+                                              ReverseBytewiseComparator()})
+                            .WithFillCacheFlags(Bool())
+                            .WithUseAsyncIoFlags(IOUringFlags())
+                            .build()));
+
+INSTANTIATE_TEST_CASE_P(
+    BlockBasedTableReaderMultiScanTest, BlockBasedTableReaderMultiScanTest,
+    ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder()
+                            .WithComparators({BytewiseComparator(),
+                                              ReverseBytewiseComparator()})
+                            .build()));
+
 INSTANTIATE_TEST_CASE_P(
     BlockBasedTableReaderGetTest, BlockBasedTableReaderGetTest,
-    ::testing::Combine(
-        ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(),
-        ::testing::Values(
-            BlockBasedTableOptions::IndexType::kBinarySearch,
-            BlockBasedTableOptions::IndexType::kHashSearch,
-            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
-            BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey),
-        ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
-        ::testing::Values(1, 2), ::testing::Values(0, 4096),
-        ::testing::Values(false, true)));
+    ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder()
+                            .WithUDTTestModes(test::GetUDTTestModes())
+                            .WithSameKeyDiffTsFlags(Bool())
+                            .WithComparators({BytewiseComparator(),
+                                              ReverseBytewiseComparator()})
+                            .WithFillCacheFlags({false})
+                            .build()));
+
+INSTANTIATE_TEST_CASE_P(
+    BlockBasedTableReaderSuperBlockAlignTest, BlockBasedTableReaderGetTest,
+    ::testing::ValuesIn(
+        BlockBasedTableReaderTestParamBuilder()
+            .WithIndexTypes(
+                {BlockBasedTableOptions::IndexType::kBinarySearch,
+                 BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch})
+            .WithFillCacheFlags({false})
+            .WithBlockAlignFlags(Bool())
+            .WithSuperBlockAlignmentSizes({0, 32 * 1024, 16 * 1024})
+            .WithSuperBlockAlignmentSpaceOverheadRatios({0, 4, 256})
+            .build()));
+
 INSTANTIATE_TEST_CASE_P(
     StrictCapacityLimitReaderTest, StrictCapacityLimitReaderTest,
-    ::testing::Combine(
-        ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(),
-        ::testing::Values(
-            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch),
-        ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
-        ::testing::Values(1, 2), ::testing::Values(0),
-        ::testing::Values(false, true)));
+    ::testing::ValuesIn(
+        BlockBasedTableReaderTestParamBuilder()
+            .WithIndexTypes(
+                {BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch})
+            .WithUDTTestModes(test::GetUDTTestModes())
+            .WithCompressionDictByteCounts({0})
+            .WithSameKeyDiffTsFlags(Bool())
+            .WithFillCacheFlags({false})
+            .build()));
+
 INSTANTIATE_TEST_CASE_P(
     VerifyChecksum, BlockBasedTableReaderTestVerifyChecksum,
-    ::testing::Combine(
-        ::testing::ValuesIn(GetSupportedCompressions()),
-        ::testing::Values(false),
-        ::testing::Values(
-            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch),
-        ::testing::Values(true), ::testing::ValuesIn(test::GetUDTTestModes()),
-        ::testing::Values(1, 2), ::testing::Values(0),
-        ::testing::Values(false)));
-
+    ::testing::ValuesIn(
+        BlockBasedTableReaderTestParamBuilder()
+            .WithUseDirectReadFlags({false})
+            .WithIndexTypes(
+                {BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch})
+            .WithNoBlockCacheFlags({true})
+            .WithUDTTestModes(test::GetUDTTestModes())
+            .WithCompressionDictByteCounts({0})
+            .WithFillCacheFlags({false})
+            .build()));
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/table/block_based/block_builder.cc b/table/block_based/block_builder.cc
index e4950e4356bf..541ff6ea23da 100644
--- a/table/block_based/block_builder.cc
+++ b/table/block_based/block_builder.cc
@@ -21,15 +21,19 @@
 // An entry for a particular key-value pair has the form:
 //     shared_bytes: varint32
 //     unshared_bytes: varint32
-//     value_length: varint32
+//     value_length: varint32 (NOTE1)
 //     key_delta: char[unshared_bytes]
 //     value: char[value_length]
-// shared_bytes == 0 for restart points.
+// shared_bytes == 0 (explicitly stored) for restart points.
 //
 // The trailer of the block has the form:
 //     restarts: uint32[num_restarts]
 //     num_restarts: uint32
 // restarts[i] contains the offset within the block of the ith restart point.
+//
+// NOTE1: omitted for format_version >= 4 index blocks, because the value is
+// composed of one (shared_bytes > 0) or two (shared_bytes == 0) varints, whose
+// length is self-describing.
 
 #include "table/block_based/block_builder.h"
 
@@ -129,29 +133,28 @@ Slice BlockBuilder::Finish() {
     PutFixed32(&buffer_, restarts_[i]);
   }
 
-  uint32_t num_restarts = static_cast<uint32_t>(restarts_.size());
-  BlockBasedTableOptions::DataBlockIndexType index_type =
-      BlockBasedTableOptions::kDataBlockBinarySearch;
+  DataBlockFooter footer;
+  footer.num_restarts = static_cast<uint32_t>(restarts_.size());
+  footer.index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
   if (data_block_hash_index_builder_.Valid() &&
       CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) {
     data_block_hash_index_builder_.Finish(buffer_);
-    index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
+    footer.index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
   }
 
-  // footer is a packed format of data_block_index_type and num_restarts
-  uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts);
-
-  PutFixed32(&buffer_, block_footer);
+  footer.EncodeTo(&buffer_);
   finished_ = true;
   return Slice(buffer_);
 }
 
 void BlockBuilder::Add(const Slice& key, const Slice& value,
-                       const Slice* const delta_value) {
+                       const Slice* const delta_value,
+                       bool skip_delta_encoding) {
   // Ensure no unsafe mixing of Add and AddWithLastKey
   assert(!add_with_last_key_called_);
 
-  AddWithLastKeyImpl(key, value, last_key_, delta_value, buffer_.size());
+  AddWithLastKeyImpl(key, value, last_key_, delta_value, skip_delta_encoding,
+                     buffer_.size());
   if (use_delta_encoding_) {
     // Update state
     // We used to just copy the changed data, but it appears to be
@@ -162,7 +165,8 @@ void BlockBuilder::Add(const Slice& key, const Slice& value,
 
 void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value,
                                   const Slice& last_key_param,
-                                  const Slice* const delta_value) {
+                                  const Slice* const delta_value,
+                                  bool skip_delta_encoding) {
   // Ensure no unsafe mixing of Add and AddWithLastKey
   assert(last_key_.empty());
 #ifndef NDEBUG
@@ -181,17 +185,18 @@ void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value,
 
   Slice last_key(last_key_param.data(), last_key_size * (buffer_size > 0));
 
-  AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size);
+  AddWithLastKeyImpl(key, value, last_key, delta_value, skip_delta_encoding,
+                     buffer_size);
 }
 
 inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key,
                                              const Slice& value,
                                              const Slice& last_key,
                                              const Slice* const delta_value,
+                                             bool skip_delta_encoding,
                                              size_t buffer_size) {
   assert(!finished_);
   assert(counter_ <= block_restart_interval_);
-  assert(!use_value_delta_encoding_ || delta_value);
   std::string key_buf;
   std::string last_key_buf;
   const Slice key_to_persist = MaybeStripTimestampFromKey(&key_buf, key);
@@ -207,7 +212,7 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key,
     restarts_.push_back(static_cast<uint32_t>(buffer_size));
     estimate_ += sizeof(uint32_t);
     counter_ = 0;
-  } else if (use_delta_encoding_) {
+  } else if (use_delta_encoding_ && !skip_delta_encoding) {
     // See how much sharing to do with previous string
     shared = key_to_persist.difference_offset(last_key_persisted);
   }
@@ -231,6 +236,7 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key,
   // simplify the decoding, where it can figure which decoding to use simply by
   // looking at the shared bytes size.
   if (shared != 0 && use_value_delta_encoding_) {
+    assert(delta_value != nullptr);
     buffer_.append(delta_value->data(), delta_value->size());
   } else {
     buffer_.append(value.data(), value.size());
diff --git a/table/block_based/block_builder.h b/table/block_based/block_builder.h
index f167470bb5f5..6cc9d836ab31 100644
--- a/table/block_based/block_builder.h
+++ b/table/block_based/block_builder.h
@@ -46,7 +46,8 @@ class BlockBuilder {
   // AddWithLastKey() in contexts where previous added key is already known
   // and delta encoding might be used.
   void Add(const Slice& key, const Slice& value,
-           const Slice* const delta_value = nullptr);
+           const Slice* const delta_value = nullptr,
+           bool skip_delta_encoding = false);
 
   // A faster version of Add() if the previous key is already known for all
   // Add()s.
@@ -59,7 +60,8 @@ class BlockBuilder {
   // DO NOT mix with Add() between Resets.
   void AddWithLastKey(const Slice& key, const Slice& value,
                       const Slice& last_key,
-                      const Slice* const delta_value = nullptr);
+                      const Slice* const delta_value = nullptr,
+                      bool skip_delta_encoding = false);
 
   // Finish building the block and return a slice that refers to the
   // block contents.  The returned slice will remain valid for the
@@ -80,11 +82,13 @@ class BlockBuilder {
   // Return true iff no entries have been added since the last Reset()
   bool empty() const { return buffer_.empty(); }
 
+  std::string& MutableBuffer() { return buffer_; }
+
  private:
   inline void AddWithLastKeyImpl(const Slice& key, const Slice& value,
                                  const Slice& last_key,
                                  const Slice* const delta_value,
-                                 size_t buffer_size);
+                                 bool skip_delta_encoding, size_t buffer_size);
 
   inline const Slice MaybeStripTimestampFromKey(std::string* key_buf,
                                                 const Slice& key);
diff --git a/table/block_based/block_cache.cc b/table/block_based/block_cache.cc
index 08f5d2158dc5..28d181db5652 100644
--- a/table/block_based/block_cache.cc
+++ b/table/block_based/block_cache.cc
@@ -46,16 +46,22 @@ void BlockCreateContext::Create(std::unique_ptr<Block_kMetaIndex>* parsed_out,
       protection_bytes_per_key);
 }
 
+void BlockCreateContext::Create(
+    std::unique_ptr<Block_kUserDefinedIndex>* parsed_out,
+    BlockContents&& block) {
+  parsed_out->reset(new Block_kUserDefinedIndex(std::move(block)));
+}
+
 void BlockCreateContext::Create(
     std::unique_ptr<ParsedFullFilterBlock>* parsed_out, BlockContents&& block) {
   parsed_out->reset(new ParsedFullFilterBlock(
       table_options->filter_policy.get(), std::move(block)));
 }
 
-void BlockCreateContext::Create(std::unique_ptr<UncompressionDict>* parsed_out,
+void BlockCreateContext::Create(std::unique_ptr<DecompressorDict>* parsed_out,
                                 BlockContents&& block) {
-  parsed_out->reset(new UncompressionDict(
-      block.data, std::move(block.allocation), using_zstd));
+  parsed_out->reset(new DecompressorDict(
+      block.data, std::move(block.allocation), *decompressor));
 }
 
 namespace {
@@ -69,7 +75,7 @@ const std::array<const Cache::CacheItemHelper*,
         BlockCacheInterface<ParsedFullFilterBlock>::GetFullHelper(),
         BlockCacheInterface<Block_kFilterPartitionIndex>::GetFullHelper(),
         nullptr,  // kProperties
-        BlockCacheInterface<UncompressionDict>::GetFullHelper(),
+        BlockCacheInterface<DecompressorDict>::GetFullHelper(),
         BlockCacheInterface<Block_kRangeDeletion>::GetFullHelper(),
         nullptr,  // kHashIndexPrefixes
         nullptr,  // kHashIndexMetadata
@@ -86,7 +92,7 @@ const std::array<const Cache::CacheItemHelper*,
         BlockCacheInterface<ParsedFullFilterBlock>::GetBasicHelper(),
         BlockCacheInterface<Block_kFilterPartitionIndex>::GetBasicHelper(),
         nullptr,  // kProperties
-        BlockCacheInterface<UncompressionDict>::GetBasicHelper(),
+        BlockCacheInterface<DecompressorDict>::GetBasicHelper(),
         BlockCacheInterface<Block_kRangeDeletion>::GetBasicHelper(),
         nullptr,  // kHashIndexPrefixes
         nullptr,  // kHashIndexMetadata
diff --git a/table/block_based/block_cache.h b/table/block_based/block_cache.h
index d48a88f07137..564dcf0062db 100644
--- a/table/block_based/block_cache.h
+++ b/table/block_based/block_cache.h
@@ -67,19 +67,30 @@ class Block_kMetaIndex : public Block {
   static constexpr BlockType kBlockType = BlockType::kMetaIndex;
 };
 
+class Block_kUserDefinedIndex : public BlockContents {
+ public:
+  static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kIndexBlock;
+  static constexpr BlockType kBlockType = BlockType::kUserDefinedIndex;
+
+  explicit Block_kUserDefinedIndex(BlockContents&& other)
+      : BlockContents(std::move(other)) {}
+  const Slice& ContentSlice() const { return data; }
+};
+
 struct BlockCreateContext : public Cache::CreateContext {
   BlockCreateContext() {}
   BlockCreateContext(const BlockBasedTableOptions* _table_options,
                      const ImmutableOptions* _ioptions, Statistics* _statistics,
-                     bool _using_zstd, uint8_t _protection_bytes_per_key,
+                     Decompressor* _decompressor,
+                     uint8_t _protection_bytes_per_key,
                      const Comparator* _raw_ucmp,
                      bool _index_value_is_full = false,
                      bool _index_has_first_key = false)
       : table_options(_table_options),
         ioptions(_ioptions),
         statistics(_statistics),
+        decompressor(_decompressor),
         raw_ucmp(_raw_ucmp),
-        using_zstd(_using_zstd),
         protection_bytes_per_key(_protection_bytes_per_key),
         index_value_is_full(_index_value_is_full),
         index_has_first_key(_index_has_first_key) {}
@@ -87,10 +98,9 @@ struct BlockCreateContext : public Cache::CreateContext {
   const BlockBasedTableOptions* table_options = nullptr;
   const ImmutableOptions* ioptions = nullptr;
   Statistics* statistics = nullptr;
+  // TODO: refactor to avoid copying BlockCreateContext for dict in block cache
+  Decompressor* decompressor = nullptr;
   const Comparator* raw_ucmp = nullptr;
-  const UncompressionDict* dict = nullptr;
-  uint32_t format_version;
-  bool using_zstd = false;
   uint8_t protection_bytes_per_key = 0;
   bool index_value_is_full;
   bool index_has_first_key;
@@ -102,12 +112,10 @@ struct BlockCreateContext : public Cache::CreateContext {
                      CompressionType type, MemoryAllocator* alloc) {
     BlockContents uncompressed_block_contents;
     if (type != CompressionType::kNoCompression) {
-      assert(dict != nullptr);
-      UncompressionContext context(type);
-      UncompressionInfo info(context, *dict, type);
-      Status s = UncompressBlockData(
-          info, data.data(), data.size(), &uncompressed_block_contents,
-          table_options->format_version, *ioptions, alloc);
+      assert(decompressor != nullptr);
+      Status s =
+          DecompressBlockData(data.data(), data.size(), type, *decompressor,
+                              &uncompressed_block_contents, *ioptions, alloc);
       if (!s.ok()) {
         parsed_out->reset();
         return;
@@ -128,9 +136,11 @@ struct BlockCreateContext : public Cache::CreateContext {
               BlockContents&& block);
   void Create(std::unique_ptr<Block_kMetaIndex>* parsed_out,
               BlockContents&& block);
+  void Create(std::unique_ptr<Block_kUserDefinedIndex>* parsed_out,
+              BlockContents&& block);
   void Create(std::unique_ptr<ParsedFullFilterBlock>* parsed_out,
               BlockContents&& block);
-  void Create(std::unique_ptr<UncompressionDict>* parsed_out,
+  void Create(std::unique_ptr<DecompressorDict>* parsed_out,
               BlockContents&& block);
 };
 
diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc
index 52f0ef8fdfc2..bcebf5d36db0 100644
--- a/table/block_based/block_prefetcher.cc
+++ b/table/block_based/block_prefetcher.cc
@@ -39,16 +39,21 @@ void BlockPrefetcher::PrefetchIfNeeded(
         return;
       }
       IOOptions opts;
-      Status s = rep->file->PrepareIOOptions(read_options, opts);
+      IODebugContext dbg;
+      Status s = rep->file->PrepareIOOptions(read_options, opts, &dbg);
       if (!s.ok()) {
         return;
       }
-      s = rep->file->Prefetch(opts, offset, len + compaction_readahead_size_);
-      if (s.ok()) {
-        readahead_limit_ = offset + len + compaction_readahead_size_;
-        return;
-      } else if (!s.IsNotSupported()) {
-        return;
+      if (rep->fs_prefetch_support) {
+        s = rep->file->Prefetch(opts, offset, len + compaction_readahead_size_);
+        if (s.ok()) {
+          readahead_limit_ = offset + len + compaction_readahead_size_;
+          return;
+        } else if (!s.IsNotSupported()) {
+          return;
+        }
+        // If FS prefetch returned NotSupported despite feature bit being set,
+        // fall through to use internal prefetch buffer.
       }
     }
     // If FS prefetch is not supported, fall back to use internal prefetch
@@ -58,9 +63,10 @@ void BlockPrefetcher::PrefetchIfNeeded(
     // implicit_auto_readahead is set.
     readahead_params.initial_readahead_size = compaction_readahead_size_;
     readahead_params.max_readahead_size = compaction_readahead_size_;
-    rep->CreateFilePrefetchBufferIfNotExists(readahead_params,
-                                             &prefetch_buffer_,
-                                             /*readaheadsize_cb=*/nullptr);
+    rep->CreateFilePrefetchBufferIfNotExists(
+        readahead_params, &prefetch_buffer_,
+        /*readaheadsize_cb=*/nullptr,
+        /*usage=*/FilePrefetchBufferUsage::kCompactionPrefetch);
     return;
   }
 
@@ -140,19 +146,23 @@ void BlockPrefetcher::PrefetchIfNeeded(
   if (!s.ok()) {
     return;
   }
-  s = rep->file->Prefetch(
-      opts, handle.offset(),
-      BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_);
-  if (s.IsNotSupported()) {
-    rep->CreateFilePrefetchBufferIfNotExists(
-        readahead_params, &prefetch_buffer_, readaheadsize_cb,
-        /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch);
-    return;
-  }
 
-  readahead_limit_ = offset + len + readahead_size_;
-  // Keep exponentially increasing readahead size until
-  // max_auto_readahead_size.
-  readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2);
+  if (rep->fs_prefetch_support) {
+    s = rep->file->Prefetch(
+        opts, handle.offset(),
+        BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_);
+    if (s.ok()) {
+      readahead_limit_ = offset + len + readahead_size_;
+      // Keep exponentially increasing readahead size until
+      // max_auto_readahead_size.
+      readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2);
+      return;
+    }
+  }
+  // If FS prefetch is not supported or returned NotSupported, fall back to use
+  // internal prefetch buffer.
+  rep->CreateFilePrefetchBufferIfNotExists(
+      readahead_params, &prefetch_buffer_, readaheadsize_cb,
+      /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch);
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc
index b1a855263daa..49bec09084f6 100644
--- a/table/block_based/block_test.cc
+++ b/table/block_based/block_test.cc
@@ -33,10 +33,10 @@
 namespace ROCKSDB_NAMESPACE {
 
 std::string GenerateInternalKey(int primary_key, int secondary_key,
-                                int padding_size, Random *rnd,
+                                int padding_size, Random* rnd,
                                 size_t ts_sz = 0) {
   char buf[50];
-  char *p = &buf[0];
+  char* p = &buf[0];
   snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
   std::string k(p);
   if (padding_size) {
@@ -55,8 +55,8 @@ std::string GenerateInternalKey(int primary_key, int secondary_key,
 // Generate random key value pairs.
 // The generated key will be sorted. You can tune the parameters to generated
 // different kinds of test key/value pairs for different scenario.
-void GenerateRandomKVs(std::vector<std::string> *keys,
-                       std::vector<std::string> *values, const int from,
+void GenerateRandomKVs(std::vector<std::string>* keys,
+                       std::vector<std::string>* values, const int from,
                        const int len, const int step = 1,
                        const int padding_size = 0,
                        const int keys_share_prefix = 1, size_t ts_sz = 0) {
@@ -133,7 +133,7 @@ TEST_P(BlockTest, SimpleTest) {
 
   // read contents of block sequentially
   int count = 0;
-  InternalIterator *iter = reader.NewDataIterator(
+  InternalIterator* iter = reader.NewDataIterator(
       options.comparator, kDisableGlobalSequenceNumber, nullptr /* iter */,
       nullptr /* stats */, false /* block_contents_pinned */,
       shouldPersistUDT());
@@ -169,9 +169,9 @@ TEST_P(BlockTest, SimpleTest) {
 
 // return the block contents
 BlockContents GetBlockContents(
-    std::unique_ptr<BlockBuilder> *builder,
-    const std::vector<std::string> &keys,
-    const std::vector<std::string> &values, bool key_use_delta_encoding,
+    std::unique_ptr<BlockBuilder>* builder,
+    const std::vector<std::string>& keys,
+    const std::vector<std::string>& values, bool key_use_delta_encoding,
     size_t ts_sz, bool should_persist_udt, const int /*prefix_group_size*/ = 1,
     BlockBasedTableOptions::DataBlockIndexType dblock_index_type =
         BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch) {
@@ -194,8 +194,8 @@ BlockContents GetBlockContents(
 }
 
 void CheckBlockContents(BlockContents contents, const int max_key,
-                        const std::vector<std::string> &keys,
-                        const std::vector<std::string> &values,
+                        const std::vector<std::string>& keys,
+                        const std::vector<std::string>& values,
                         bool is_udt_enabled, bool should_persist_udt) {
   const size_t prefix_size = 6;
   // create block reader
@@ -356,8 +356,8 @@ class BlockReadAmpBitmapSlowAndAccurate {
 TEST_F(BlockTest, BlockReadAmpBitmap) {
   uint32_t pin_offset = 0;
   SyncPoint::GetInstance()->SetCallBack(
-      "BlockReadAmpBitmap:rnd", [&pin_offset](void *arg) {
-        pin_offset = *(static_cast<uint32_t *>(arg));
+      "BlockReadAmpBitmap:rnd", [&pin_offset](void* arg) {
+        pin_offset = *(static_cast<uint32_t*>(arg));
       });
   SyncPoint::GetInstance()->EnableProcessing();
   std::vector<size_t> block_sizes = {
@@ -414,7 +414,7 @@ TEST_F(BlockTest, BlockReadAmpBitmap) {
 
     for (size_t i = 0; i < random_entries.size(); i++) {
       read_amp_slow_and_accurate.ResetCheckSequence();
-      auto &current_entry = random_entries[rnd.Next() % random_entries.size()];
+      auto& current_entry = random_entries[rnd.Next() % random_entries.size()];
 
       read_amp_bitmap.Mark(static_cast<uint32_t>(current_entry.first),
                            static_cast<uint32_t>(current_entry.second));
@@ -465,7 +465,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
 
     // read contents of block sequentially
     size_t read_bytes = 0;
-    DataBlockIter *iter = reader.NewDataIterator(
+    DataBlockIter* iter = reader.NewDataIterator(
         options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get());
     for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
       iter->value();
@@ -496,7 +496,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
     Block reader(std::move(contents), kBytesPerBit, stats.get());
 
     size_t read_bytes = 0;
-    DataBlockIter *iter = reader.NewDataIterator(
+    DataBlockIter* iter = reader.NewDataIterator(
         options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get());
     for (int i = 0; i < num_records; i++) {
       Slice k(keys[i]);
@@ -530,7 +530,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
     Block reader(std::move(contents), kBytesPerBit, stats.get());
 
     size_t read_bytes = 0;
-    DataBlockIter *iter = reader.NewDataIterator(
+    DataBlockIter* iter = reader.NewDataIterator(
         options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get());
     std::unordered_set<int> read_keys;
     for (int i = 0; i < num_records; i++) {
@@ -576,10 +576,29 @@ TEST_F(BlockTest, ReadAmpBitmapPow2) {
   ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32u);
 }
 
+void AddIndexBlockEntry(BlockBuilder& builder, const Slice& key,
+                        const BlockHandle& bh, const BlockHandle* prev,
+                        bool include_first_key,
+                        const Slice& first_internal_key = Slice()) {
+  IndexValue entry(bh, first_internal_key);
+  std::string encoded_entry;
+  entry.EncodeTo(&encoded_entry, include_first_key, nullptr);
+  std::string delta_encoded_entry;
+  if (prev) {
+    entry.EncodeTo(&delta_encoded_entry, include_first_key, prev);
+  }
+  const Slice delta_slice(delta_encoded_entry);
+  builder.Add(key, encoded_entry, &delta_slice);
+}
+
+enum class KeyDistribution { kUniform, kNonUniform };
+
 class IndexBlockTest
     : public testing::Test,
       public testing::WithParamInterface<
-          std::tuple<bool, bool, bool, test::UserDefinedTimestampTestMode>> {
+          std::tuple<bool, bool, bool, test::UserDefinedTimestampTestMode,
+                     BlockBasedTableOptions::BlockSearchType, int, int, int,
+                     int, KeyDistribution>> {
  public:
   IndexBlockTest() = default;
 
@@ -592,25 +611,52 @@ class IndexBlockTest
   bool shouldPersistUDT() const {
     return test::ShouldPersistUDT(std::get<3>(GetParam()));
   }
+  BlockBasedTableOptions::BlockSearchType indexSearchType() const {
+    return isUDTEnabled() ? BlockBasedTableOptions::kBinary
+                          : std::get<4>(GetParam());
+  }
+  int numRecords() const {
+    return std::min(1 << keyLength(), std::get<5>(GetParam()));
+  }
+  int indexBlockRestartInterval() const { return std::get<6>(GetParam()); }
+  int keyLength() const { return std::get<7>(GetParam()); }
+  int prefixLength() const { return std::get<8>(GetParam()); }
+  KeyDistribution keyDistribution() const { return std::get<9>(GetParam()); }
 };
 
-// Similar to GenerateRandomKVs but for index block contents.
-void GenerateRandomIndexEntries(std::vector<std::string> *separators,
-                                std::vector<BlockHandle> *block_handles,
-                                std::vector<std::string> *first_keys,
-                                const int len, size_t ts_sz = 0,
-                                bool zero_seqno = false) {
+// Similar to GenerateRandomKVs but for index block contents. Keys always
+// contain a 0-sequence number, callers may extract the user key if needed.
+void GenerateRandomIndexEntries(
+    std::vector<std::string>* separators,
+    std::vector<BlockHandle>* block_handles,
+    std::vector<std::string>* first_keys, const int len, size_t ts_sz = 0,
+    int key_length = 12, int prefix_length = 0,
+    KeyDistribution distribution = KeyDistribution::kUniform) {
   Random rnd(42);
+  std::string prefix(prefix_length, 'x');
 
   // For each of `len` blocks, we need to generate a first and last key.
-  // Let's generate n*2 random keys, sort them, group into consecutive pairs.
+  // Generate n*2 random keys, sort them, group into consecutive pairs.
   std::set<std::string> keys;
+
+  // Two clusters with shared prefixes of effective_key_length - 2. This
+  // stresses interpolation search's uniform distribution assumption.
+  int cluster_prefix_len = std::max(0, key_length - 5);
+  std::string cluster1_prefix = prefix + rnd.RandomString(cluster_prefix_len);
+  std::string cluster2_prefix = prefix + rnd.RandomString(cluster_prefix_len);
+
   while ((int)keys.size() < len * 2) {
-    // Keys need to be at least 8 bytes long to look like internal keys.
-    std::string new_key = test::RandomKey(&rnd, 12);
-    if (zero_seqno) {
-      AppendInternalKeyFooter(&new_key, 0 /* seqno */, kTypeValue);
+    std::string new_key;
+    if (distribution == KeyDistribution::kNonUniform) {
+      int remaining = key_length - cluster_prefix_len;
+      const std::string& cp =
+          (keys.size() % 2 == 0) ? cluster1_prefix : cluster2_prefix;
+      new_key = cp + rnd.RandomString(std::max(1, remaining));
+    } else {
+      new_key = prefix + test::RandomKey(&rnd, key_length);
     }
+
+    AppendInternalKeyFooter(&new_key, 0 /* seqno */, kTypeValue);
     if (ts_sz > 0) {
       std::string key;
       PadInternalKeyWithMinTimestamp(&key, new_key, ts_sz);
@@ -643,15 +689,17 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) {
   std::vector<BlockHandle> block_handles;
   std::vector<std::string> first_keys;
   const bool kUseDeltaEncoding = true;
-  BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding(),
+  BlockBuilder builder(indexBlockRestartInterval(), kUseDeltaEncoding,
+                       useValueDeltaEncoding(),
                        BlockBasedTableOptions::kDataBlockBinarySearch,
                        0.75 /* data_block_hash_table_util_ratio */, ts_sz,
                        shouldPersistUDT(), !keyIncludesSeq());
 
-  int num_records = 100;
+  int num_records = numRecords();
 
   GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
-                             num_records, ts_sz, false /* zero_seqno */);
+                             num_records, ts_sz, keyLength(), prefixLength(),
+                             keyDistribution());
   BlockHandle last_encoded_handle;
   for (int i = 0; i < num_records; i++) {
     std::string first_key_to_persist_buf;
@@ -661,23 +709,13 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) {
                                     ts_sz);
       first_internal_key = first_key_to_persist_buf;
     }
-    IndexValue entry(block_handles[i], first_internal_key);
-    std::string encoded_entry;
-    std::string delta_encoded_entry;
-    entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr);
-    if (useValueDeltaEncoding() && i > 0) {
-      entry.EncodeTo(&delta_encoded_entry, includeFirstKey(),
-                     &last_encoded_handle);
-    }
-    last_encoded_handle = entry.handle;
-    const Slice delta_encoded_entry_slice(delta_encoded_entry);
-
-    if (keyIncludesSeq()) {
-      builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice);
-    } else {
-      const Slice user_key = ExtractUserKey(separators[i]);
-      builder.Add(user_key, encoded_entry, &delta_encoded_entry_slice);
-    }
+    const BlockHandle* prev =
+        (useValueDeltaEncoding() && i > 0) ? &last_encoded_handle : nullptr;
+    Slice add_key =
+        keyIncludesSeq() ? Slice(separators[i]) : ExtractUserKey(separators[i]);
+    AddIndexBlockEntry(builder, add_key, block_handles[i], prev,
+                       includeFirstKey(), first_internal_key);
+    last_encoded_handle = block_handles[i];
   }
 
   // read serialized contents of the block
@@ -689,14 +727,14 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) {
   Block reader(std::move(contents));
 
   const bool kTotalOrderSeek = true;
-  IndexBlockIter *kNullIter = nullptr;
-  Statistics *kNullStats = nullptr;
+  IndexBlockIter* kNullIter = nullptr;
+  Statistics* kNullStats = nullptr;
   // read contents of block sequentially
-  InternalIteratorBase<IndexValue> *iter = reader.NewIndexIterator(
+  InternalIteratorBase<IndexValue>* iter = reader.NewIndexIterator(
       options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats,
       kTotalOrderSeek, includeFirstKey(), keyIncludesSeq(),
       !useValueDeltaEncoding(), false /* block_contents_pinned */,
-      shouldPersistUDT());
+      shouldPersistUDT(), nullptr /* prefix_index */, indexSearchType());
   iter->SeekToFirst();
   for (int index = 0; index < num_records; ++index) {
     ASSERT_TRUE(iter->Valid());
@@ -724,7 +762,7 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) {
       options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats,
       kTotalOrderSeek, includeFirstKey(), keyIncludesSeq(),
       !useValueDeltaEncoding(), false /* block_contents_pinned */,
-      shouldPersistUDT());
+      shouldPersistUDT(), nullptr /* prefix_index */, indexSearchType());
   for (int i = 0; i < num_records * 2; i++) {
     // find a random key in the lookaside array
     int index = rnd.Uniform(num_records);
@@ -753,10 +791,205 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) {
 // Param 1: use value delta encoding
 // Param 2: include first key
 // Param 3: user-defined timestamp test mode
+// Param 4: index search type (binary search or interpolation search)
+// Param 5: number of records
+// Param 6: index block restart interval
+// Param 7: key length
+// Param 8: prefix length
+// Param 9: key distribution (uniform or non-uniform)
 INSTANTIATE_TEST_CASE_P(
     P, IndexBlockTest,
-    ::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
-                       ::testing::ValuesIn(test::GetUDTTestModes())));
+    ::testing::Combine(
+        ::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
+        ::testing::ValuesIn(test::GetUDTTestModes()),
+        ::testing::Values(
+            BlockBasedTableOptions::BlockSearchType::kBinary,
+            BlockBasedTableOptions::BlockSearchType::kInterpolation),
+        ::testing::Values(1, 100),    // num_records
+        ::testing::Values(1, 16),     // index_block_restart_interval
+        ::testing::Values(1, 8, 12),  // key_length
+        ::testing::Values(0, 50),     // prefix_length
+        ::testing::Values(KeyDistribution::kUniform,
+                          KeyDistribution::kNonUniform)));
+
+TEST(IndexBlockTest, InterpolationSearchPrefixBoundary) {
+  const bool kIncludeFirstKey = false;
+  const bool kUseValueDeltaEncoding = true;
+  const uint64_t kBlockSize = 50;
+
+  // 20 user keys sharing prefix "ABCDEFGHIJ" with evenly spaced suffixes.
+  const std::string kPrefix = "ABCDEFGHIJ";
+  const int kNumKeys = 20;
+  std::vector<std::string> keys;
+  keys.reserve(kNumKeys);
+  for (int i = 0; i < kNumKeys; i++) {
+    std::string suffix = std::to_string(i);
+    char formatted_suffix[4];
+    snprintf(formatted_suffix, sizeof(formatted_suffix), "%03d", i);
+    keys.push_back(kPrefix + formatted_suffix);
+  }
+
+  std::vector<BlockHandle> handles;
+  handles.reserve(kNumKeys);
+  for (int i = 0; i < kNumKeys; i++) {
+    handles.emplace_back(i * (kBlockSize + BlockBasedTable::kBlockTrailerSize),
+                         kBlockSize);
+  }
+
+  BlockBuilder builder(
+      1 /* restart_interval */, true /* use_delta_encoding */,
+      kUseValueDeltaEncoding, BlockBasedTableOptions::kDataBlockBinarySearch,
+      0.75 /* data_block_hash_table_util_ratio */, 0 /* ts_sz */,
+      false /* persist_udt */, true /* is_user_key */);
+
+  for (int i = 0; i < kNumKeys; i++) {
+    BlockHandle* prev = i > 0 ? &handles[i - 1] : nullptr;
+    AddIndexBlockEntry(builder, keys[i], handles[i], prev, kIncludeFirstKey);
+  }
+
+  Slice rawblock = builder.Finish();
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents));
+
+  // Seek targets must be internal keys since SeekImpl calls ExtractUserKey().
+  auto make_target = [](const std::string& user_key) {
+    std::string target = user_key;
+    AppendInternalKeyFooter(&target, kMaxSequenceNumber, kValueTypeForSeek);
+    return target;
+  };
+
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iter(
+      reader.NewIndexIterator(
+          BytewiseComparator(), kDisableGlobalSequenceNumber,
+          nullptr /* iter */, nullptr /* stats */, true /* total_order_seek */,
+          kIncludeFirstKey, false /* key_includes_seq */,
+          !kUseValueDeltaEncoding /* value_is_full */,
+          false /* block_contents_pinned */,
+          true /* user_defined_timestamps_persisted */,
+          nullptr /* prefix_index */,
+          BlockBasedTableOptions::BlockSearchType::kInterpolation));
+
+  // Case 1: target prefix < shared prefix
+  iter->Seek(make_target("AAAAAA"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  iter->Seek(make_target(""));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  // Case 2: target prefix > shared prefix
+  iter->Seek(make_target("ABCDEFGHZZ"));
+  ASSERT_FALSE(iter->Valid());
+
+  // Case 3: target is the prefix
+  iter->Seek(make_target("ABCDEFGHIJ"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  // Case 4: target a subset of the prefix
+  iter->Seek(make_target("ABCDEFG"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+}
+
+// Like the above test, but extend the shared prefix into internal bytes
+TEST(IndexBlockTest, InterpolationSearchPrefixBoundary2) {
+  const bool kIncludeFirstKey = false;
+  const bool kUseValueDeltaEncoding = true;
+  const uint64_t kBlockSize = 50;
+
+  // 20 internal keys with the same user key but decreasing sequence numbers
+  // (which is ascending InternalKeyComparator order).
+  const std::string kUserKey = "ABCDEFGHIJ";
+  const int kNumKeys = 20;
+  std::vector<std::string> keys;
+  keys.reserve(kNumKeys);
+  for (int i = 0; i < kNumKeys; i++) {
+    std::string ikey = kUserKey;
+    SequenceNumber seq = static_cast<SequenceNumber>(kNumKeys - i);
+    AppendInternalKeyFooter(&ikey, seq, kTypeValue);
+    keys.push_back(ikey);
+  }
+
+  std::vector<BlockHandle> handles;
+  handles.reserve(kNumKeys);
+  for (int i = 0; i < kNumKeys; i++) {
+    handles.emplace_back(i * (kBlockSize + BlockBasedTable::kBlockTrailerSize),
+                         kBlockSize);
+  }
+
+  BlockBuilder builder(
+      1 /* restart_interval */, true /* use_delta_encoding */,
+      kUseValueDeltaEncoding, BlockBasedTableOptions::kDataBlockBinarySearch,
+      0.75 /* data_block_hash_table_util_ratio */, 0 /* ts_sz */,
+      false /* persist_udt */, false /* is_user_key */);
+
+  for (int i = 0; i < kNumKeys; i++) {
+    BlockHandle* prev = i > 0 ? &handles[i - 1] : nullptr;
+    AddIndexBlockEntry(builder, keys[i], handles[i], prev, kIncludeFirstKey);
+  }
+
+  Slice rawblock = builder.Finish();
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents));
+
+  auto make_target = [&](const std::string& user_key,
+                         SequenceNumber seq = kMaxSequenceNumber) {
+    std::string target = user_key;
+    AppendInternalKeyFooter(&target, seq, kTypeValue);
+    return target;
+  };
+
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iter(
+      reader.NewIndexIterator(
+          BytewiseComparator(), kDisableGlobalSequenceNumber,
+          nullptr /* iter */, nullptr /* stats */, true /* total_order_seek */,
+          kIncludeFirstKey, true /* key_includes_seq */,
+          !kUseValueDeltaEncoding /* value_is_full */,
+          false /* block_contents_pinned */,
+          true /* user_defined_timestamps_persisted */,
+          nullptr /* prefix_index */,
+          BlockBasedTableOptions::BlockSearchType::kInterpolation));
+
+  // Seek to each existing sequence number
+  for (int i = 0; i < kNumKeys; i++) {
+    SequenceNumber seq = static_cast<SequenceNumber>(kNumKeys - i);
+    iter->Seek(make_target(kUserKey, seq));
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(iter->key(), keys[i]);
+  }
+
+  // Case 1: target prefix < shared prefix
+  iter->Seek(make_target("AAAAAA"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  iter->Seek(make_target(""));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  // Case 2: target prefix > shared prefix
+  iter->Seek(make_target("ABCDEFGHZZ"));
+  ASSERT_FALSE(iter->Valid());
+
+  // Case 3: target has the same user key with kMaxSequenceNumber
+  iter->Seek(make_target("ABCDEFGHIJ"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  // Case 4: target a subset of the prefix
+  iter->Seek(make_target("ABCDEFG"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  // Case 5: target key is a prefix that also extends into the internal bytes
+  // footer
+  iter->Seek(make_target("ABCDEFGHIJ" + std::string(1, kTypeValue)));
+  ASSERT_FALSE(iter->Valid());
+}
 
 class BlockPerKVChecksumTest : public DBTestBase {
  public:
@@ -764,8 +997,8 @@ class BlockPerKVChecksumTest : public DBTestBase {
       : DBTestBase("block_per_kv_checksum", /*env_do_fsync=*/false) {}
 
   template <typename TBlockIter>
-  void TestIterateForward(std::unique_ptr<TBlockIter> &biter,
-                          size_t &verification_count) {
+  void TestIterateForward(std::unique_ptr<TBlockIter>& biter,
+                          size_t& verification_count) {
     while (biter->Valid()) {
       verification_count = 0;
       biter->Next();
@@ -776,8 +1009,8 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 
   template <typename TBlockIter>
-  void TestIterateBackward(std::unique_ptr<TBlockIter> &biter,
-                           size_t &verification_count) {
+  void TestIterateBackward(std::unique_ptr<TBlockIter>& biter,
+                           size_t& verification_count) {
     while (biter->Valid()) {
       verification_count = 0;
       biter->Prev();
@@ -788,8 +1021,8 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 
   template <typename TBlockIter>
-  void TestSeekToFirst(std::unique_ptr<TBlockIter> &biter,
-                       size_t &verification_count) {
+  void TestSeekToFirst(std::unique_ptr<TBlockIter>& biter,
+                       size_t& verification_count) {
     verification_count = 0;
     biter->SeekToFirst();
     ASSERT_GE(verification_count, 1);
@@ -797,8 +1030,8 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 
   template <typename TBlockIter>
-  void TestSeekToLast(std::unique_ptr<TBlockIter> &biter,
-                      size_t &verification_count) {
+  void TestSeekToLast(std::unique_ptr<TBlockIter>& biter,
+                      size_t& verification_count) {
     verification_count = 0;
     biter->SeekToLast();
     ASSERT_GE(verification_count, 1);
@@ -806,8 +1039,8 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 
   template <typename TBlockIter>
-  void TestSeekForPrev(std::unique_ptr<TBlockIter> &biter,
-                       size_t &verification_count, std::string k) {
+  void TestSeekForPrev(std::unique_ptr<TBlockIter>& biter,
+                       size_t& verification_count, const std::string& k) {
     verification_count = 0;
     biter->SeekForPrev(k);
     ASSERT_GE(verification_count, 1);
@@ -815,16 +1048,16 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 
   template <typename TBlockIter>
-  void TestSeek(std::unique_ptr<TBlockIter> &biter, size_t &verification_count,
-                std::string k) {
+  void TestSeek(std::unique_ptr<TBlockIter>& biter, size_t& verification_count,
+                const std::string& k) {
     verification_count = 0;
     biter->Seek(k);
     ASSERT_GE(verification_count, 1);
     TestIterateForward(biter, verification_count);
   }
 
-  bool VerifyChecksum(uint32_t checksum_len, const char *checksum_ptr,
-                      const Slice &key, const Slice &val) {
+  bool VerifyChecksum(uint32_t checksum_len, const char* checksum_ptr,
+                      const Slice& key, const Slice& val) {
     if (!checksum_len) {
       return checksum_ptr == nullptr;
     }
@@ -833,6 +1066,18 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 };
 
+namespace {
+const BlockBasedTableOptions* kTableOptions() {
+  static BlockBasedTableOptions opts{};
+  return &opts;
+}
+Decompressor* kDecompressor() {
+  static auto mgr = GetBuiltinV2CompressionManager();
+  static auto decomp = mgr->GetDecompressor();
+  return decomp.get();
+}
+}  // namespace
+
 TEST_F(BlockPerKVChecksumTest, EmptyBlock) {
   // Tests that empty block code path is not broken by per kv checksum.
   BlockBuilder builder(
@@ -845,14 +1090,11 @@ TEST_F(BlockPerKVChecksumTest, EmptyBlock) {
 
   std::unique_ptr<Block_kData> data_block;
   Options options = Options();
-  BlockBasedTableOptions tbo;
   uint8_t protection_bytes_per_key = 8;
-  BlockCreateContext create_context{&tbo,
-                                    nullptr,
-                                    nullptr /* statistics */,
-                                    false /* using_zstd */,
-                                    protection_bytes_per_key,
-                                    options.comparator};
+  BlockCreateContext create_context{
+      kTableOptions(),          nullptr,
+      nullptr /* statistics */, kDecompressor(),
+      protection_bytes_per_key, options.comparator};
   create_context.Create(&data_block, std::move(contents));
   std::unique_ptr<DataBlockIter> biter{data_block->NewDataIterator(
       options.comparator, kDisableGlobalSequenceNumber)};
@@ -885,14 +1127,10 @@ TEST_F(BlockPerKVChecksumTest, InitializeProtectionInfo) {
   // Make sure that the checksum construction code path does not break
   // when the block is itself already corrupted.
   Options options = Options();
-  BlockBasedTableOptions tbo;
   uint8_t protection_bytes_per_key = 8;
-  BlockCreateContext create_context{&tbo,
-                                    nullptr /* ioptions */,
-                                    nullptr /* statistics */,
-                                    false /* using_zstd */,
-                                    protection_bytes_per_key,
-                                    options.comparator};
+  BlockCreateContext create_context{
+      kTableOptions(), nullptr /* ioptions */,   nullptr /* statistics */,
+      kDecompressor(), protection_bytes_per_key, options.comparator};
 
   {
     std::string invalid_content = "1";
@@ -950,20 +1188,19 @@ TEST_F(BlockPerKVChecksumTest, ApproximateMemory) {
   };
 
   Options options = Options();
-  BlockBasedTableOptions tbo;
   uint8_t protection_bytes_per_key = 8;
   BlockCreateContext with_checksum_create_context{
-      &tbo,
+      kTableOptions(),
       nullptr /* ioptions */,
       nullptr /* statistics */,
-      false /* using_zstd */,
+      kDecompressor(),
       protection_bytes_per_key,
       options.comparator,
       true /* index_value_is_full */};
-  BlockCreateContext create_context{&tbo,
+  BlockCreateContext create_context{kTableOptions(),
                                     nullptr /* ioptions */,
                                     nullptr /* statistics */,
-                                    false /* using_zstd */,
+                                    kDecompressor(),
                                     0,
                                     options.comparator,
                                     true /* index_value_is_full */};
@@ -1052,15 +1289,11 @@ class DataBlockKVChecksumTest
   bool GetUseDeltaEncoding() const { return std::get<3>(GetParam()); }
 
   std::unique_ptr<Block_kData> GenerateDataBlock(
-      std::vector<std::string> &keys, std::vector<std::string> &values,
+      std::vector<std::string>& keys, std::vector<std::string>& values,
       int num_record) {
-    BlockBasedTableOptions tbo;
-    BlockCreateContext create_context{&tbo,
-                                      nullptr /* statistics */,
-                                      nullptr /* ioptions */,
-                                      false /* using_zstd */,
-                                      GetChecksumLen(),
-                                      Options().comparator};
+    BlockCreateContext create_context{
+        kTableOptions(), nullptr /* statistics */, nullptr /* ioptions */,
+        kDecompressor(), GetChecksumLen(),         Options().comparator};
     builder_ = std::make_unique<BlockBuilder>(
         static_cast<int>(GetRestartInterval()),
         GetUseDeltaEncoding() /* use_delta_encoding */,
@@ -1089,9 +1322,9 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(0, 1, 2, 4, 8) /* protection_bytes_per_key */,
         ::testing::Values(1, 2, 3, 8, 16) /* restart_interval */,
         ::testing::Values(false, true)) /* delta_encoding */,
-    [](const testing::TestParamInfo<std::tuple<
-           BlockBasedTableOptions::DataBlockIndexType, uint8_t, uint32_t, bool>>
-           &args) {
+    [](const testing::TestParamInfo<
+        std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
+                   uint32_t, bool>>& args) {
       std::ostringstream oss;
       oss << GetDataBlockIndexTypeStr(std::get<0>(args.param))
           << "ProtectionPerKey" << std::to_string(std::get<1>(args.param))
@@ -1114,7 +1347,7 @@ TEST_P(DataBlockKVChecksumTest, ChecksumConstructionAndVerification) {
     std::unique_ptr<Block_kData> data_block =
         GenerateDataBlock(keys, values, kNumRecords);
 
-    const char *checksum_ptr = data_block->TEST_GetKVChecksum();
+    const char* checksum_ptr = data_block->TEST_GetKVChecksum();
     // Check checksum of correct length is generated
     for (int i = 0; i < kNumRecords; i++) {
       ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key,
@@ -1132,8 +1365,8 @@ TEST_P(DataBlockKVChecksumTest, ChecksumConstructionAndVerification) {
     // that case (see Block::VerifyChecksum()).
     SyncPoint::GetInstance()->SetCallBack(
         "Block::VerifyChecksum::checksum_len",
-        [&verification_count, protection_bytes_per_key](void *checksum_len) {
-          ASSERT_EQ((*static_cast<uint8_t *>(checksum_len)),
+        [&verification_count, protection_bytes_per_key](void* checksum_len) {
+          ASSERT_EQ((*static_cast<uint8_t*>(checksum_len)),
                     protection_bytes_per_key);
           ++verification_count;
         });
@@ -1177,17 +1410,16 @@ class IndexBlockKVChecksumTest
   bool IncludeFirstKey() const { return std::get<4>(GetParam()); }
 
   std::unique_ptr<Block_kIndex> GenerateIndexBlock(
-      std::vector<std::string> &separators,
-      std::vector<BlockHandle> &block_handles,
-      std::vector<std::string> &first_keys, int num_record) {
+      std::vector<std::string>& separators,
+      std::vector<BlockHandle>& block_handles,
+      std::vector<std::string>& first_keys, int num_record) {
     Options options = Options();
-    BlockBasedTableOptions tbo;
     uint8_t protection_bytes_per_key = GetChecksumLen();
     BlockCreateContext create_context{
-        &tbo,
+        kTableOptions(),
         nullptr /* ioptions */,
         nullptr /* statistics */,
-        false /* _using_zstd */,
+        kDecompressor(),
         protection_bytes_per_key,
         options.comparator,
         !UseValueDeltaEncoding() /* value_is_full */,
@@ -1236,7 +1468,7 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(true, false), ::testing::Values(true, false)),
     [](const testing::TestParamInfo<
         std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
-                   uint32_t, bool, bool>> &args) {
+                   uint32_t, bool, bool>>& args) {
       std::ostringstream oss;
       oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes"
           << std::to_string(std::get<1>(args.param)) << "RestartInterval"
@@ -1260,13 +1492,12 @@ TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
       std::vector<BlockHandle> block_handles;
       std::vector<std::string> first_keys;
       GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
-                                 kNumRecords, 0 /* ts_sz */,
-                                 seqno != kDisableGlobalSequenceNumber);
+                                 kNumRecords, 0 /* ts_sz */);
       SyncPoint::GetInstance()->DisableProcessing();
       std::unique_ptr<Block_kIndex> index_block = GenerateIndexBlock(
           separators, block_handles, first_keys, kNumRecords);
-      IndexBlockIter *kNullIter = nullptr;
-      Statistics *kNullStats = nullptr;
+      IndexBlockIter* kNullIter = nullptr;
+      Statistics* kNullStats = nullptr;
       // read contents of block sequentially
       std::unique_ptr<IndexBlockIter> biter{index_block->NewIndexIterator(
           options.comparator, seqno, kNullIter, kNullStats,
@@ -1277,7 +1508,7 @@ TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
           true /* user_defined_timestamps_persisted */,
           nullptr /* prefix_index */)};
       biter->SeekToFirst();
-      const char *checksum_ptr = index_block->TEST_GetKVChecksum();
+      const char* checksum_ptr = index_block->TEST_GetKVChecksum();
       // Check checksum of correct length is generated
       for (int i = 0; i < kNumRecords; i++) {
         // Obtaining the actual content written as value to index block is not
@@ -1297,8 +1528,8 @@ TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
       // assert checking on checksum_len here.
       SyncPoint::GetInstance()->SetCallBack(
           "Block::VerifyChecksum::checksum_len",
-          [&verification_count, protection_bytes_per_key](void *checksum_len) {
-            ASSERT_EQ((*static_cast<uint8_t *>(checksum_len)),
+          [&verification_count, protection_bytes_per_key](void* checksum_len) {
+            ASSERT_EQ((*static_cast<uint8_t*>(checksum_len)),
                       protection_bytes_per_key);
             ++verification_count;
           });
@@ -1321,17 +1552,13 @@ class MetaIndexBlockKVChecksumTest
   uint32_t GetRestartInterval() const { return 1; }
 
   std::unique_ptr<Block_kMetaIndex> GenerateMetaIndexBlock(
-      std::vector<std::string> &keys, std::vector<std::string> &values,
+      std::vector<std::string>& keys, std::vector<std::string>& values,
       int num_record) {
     Options options = Options();
-    BlockBasedTableOptions tbo;
     uint8_t protection_bytes_per_key = GetChecksumLen();
-    BlockCreateContext create_context{&tbo,
-                                      nullptr /* ioptions */,
-                                      nullptr /* statistics */,
-                                      false /* using_zstd */,
-                                      protection_bytes_per_key,
-                                      options.comparator};
+    BlockCreateContext create_context{
+        kTableOptions(), nullptr /* ioptions */,   nullptr /* statistics */,
+        kDecompressor(), protection_bytes_per_key, options.comparator};
     builder_ =
         std::make_unique<BlockBuilder>(static_cast<int>(GetRestartInterval()));
     // add a bunch of records to a block
@@ -1351,7 +1578,7 @@ class MetaIndexBlockKVChecksumTest
 
 INSTANTIATE_TEST_CASE_P(P, MetaIndexBlockKVChecksumTest,
                         ::testing::Values(0, 1, 2, 4, 8),
-                        [](const testing::TestParamInfo<uint8_t> &args) {
+                        [](const testing::TestParamInfo<uint8_t>& args) {
                           std::ostringstream oss;
                           oss << "ProtBytes" << std::to_string(args.param);
                           return oss.str();
@@ -1359,14 +1586,10 @@ INSTANTIATE_TEST_CASE_P(P, MetaIndexBlockKVChecksumTest,
 
 TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
   Options options = Options();
-  BlockBasedTableOptions tbo;
   uint8_t protection_bytes_per_key = GetChecksumLen();
-  BlockCreateContext create_context{&tbo,
-                                    nullptr /* ioptions */,
-                                    nullptr /* statistics */,
-                                    false /* using_zstd */,
-                                    protection_bytes_per_key,
-                                    options.comparator};
+  BlockCreateContext create_context{
+      kTableOptions(), nullptr /* ioptions */,   nullptr /* statistics */,
+      kDecompressor(), protection_bytes_per_key, options.comparator};
   std::vector<int> num_restart_intervals = {1, 16};
   for (const auto num_restart_interval : num_restart_intervals) {
     const int kNumRecords = num_restart_interval * GetRestartInterval();
@@ -1377,7 +1600,7 @@ TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
     SyncPoint::GetInstance()->DisableProcessing();
     std::unique_ptr<Block_kMetaIndex> meta_block =
         GenerateMetaIndexBlock(keys, values, kNumRecords);
-    const char *checksum_ptr = meta_block->TEST_GetKVChecksum();
+    const char* checksum_ptr = meta_block->TEST_GetKVChecksum();
     // Check checksum of correct length is generated
     for (int i = 0; i < kNumRecords; i++) {
       ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key,
@@ -1392,8 +1615,8 @@ TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
     // checking on checksum_len here.
     SyncPoint::GetInstance()->SetCallBack(
         "Block::VerifyChecksum::checksum_len",
-        [&verification_count, protection_bytes_per_key](void *checksum_len) {
-          ASSERT_EQ((*static_cast<uint8_t *>(checksum_len)),
+        [&verification_count, protection_bytes_per_key](void* checksum_len) {
+          ASSERT_EQ((*static_cast<uint8_t*>(checksum_len)),
                     protection_bytes_per_key);
           ++verification_count;
         });
@@ -1413,7 +1636,7 @@ class DataBlockKVChecksumCorruptionTest : public DataBlockKVChecksumTest {
   DataBlockKVChecksumCorruptionTest() = default;
 
   std::unique_ptr<DataBlockIter> GenerateDataBlockIter(
-      std::vector<std::string> &keys, std::vector<std::string> &values,
+      std::vector<std::string>& keys, std::vector<std::string>& values,
       int num_record) {
     // During Block construction, we may create block iter to initialize per kv
     // checksum. Disable syncpoint that may be created for block iter methods.
@@ -1439,15 +1662,15 @@ TEST_P(DataBlockKVChecksumCorruptionTest, CorruptEntry) {
     GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */,
                       24 /* padding_size */);
     SyncPoint::GetInstance()->SetCallBack(
-        "BlockIter::UpdateKey::value", [](void *arg) {
-          char *value = static_cast<char *>(arg);
+        "BlockIter::UpdateKey::value", [](void* arg) {
+          char* value = static_cast<char*>(arg);
           // values generated by GenerateRandomKVs are of length 100
           ++value[10];
         });
 
     // Purely for reducing the number of lines of code.
     typedef std::unique_ptr<DataBlockIter> IterPtr;
-    typedef void(IterAPI)(IterPtr & iter, std::string &);
+    typedef void(IterAPI)(IterPtr & iter, std::string&);
 
     std::string seek_key = keys[kNumRecords / 2];
     auto test_seek = [&](IterAPI iter_api) {
@@ -1458,14 +1681,14 @@ TEST_P(DataBlockKVChecksumCorruptionTest, CorruptEntry) {
       ASSERT_TRUE(biter->status().IsCorruption());
     };
 
-    test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
-    test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
-    test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
-    test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); });
-    test_seek([](IterPtr &iter, std::string &k) { iter->SeekForGet(k); });
+    test_seek([](IterPtr& iter, std::string&) { iter->SeekToFirst(); });
+    test_seek([](IterPtr& iter, std::string&) { iter->SeekToLast(); });
+    test_seek([](IterPtr& iter, std::string& k) { iter->Seek(k); });
+    test_seek([](IterPtr& iter, std::string& k) { iter->SeekForPrev(k); });
+    test_seek([](IterPtr& iter, std::string& k) { iter->SeekForGet(k); });
 
     typedef void (DataBlockIter::*IterStepAPI)();
-    auto test_step = [&](IterStepAPI iter_api, std::string &k) {
+    auto test_step = [&](IterStepAPI iter_api, std::string& k) {
       IterPtr biter = GenerateDataBlockIter(keys, values, kNumRecords);
       SyncPoint::GetInstance()->DisableProcessing();
       biter->Seek(k);
@@ -1494,9 +1717,9 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(4, 8) /* block_protection_bytes_per_key */,
         ::testing::Values(1, 3, 8, 16) /* restart_interval */,
         ::testing::Values(false, true)),
-    [](const testing::TestParamInfo<std::tuple<
-           BlockBasedTableOptions::DataBlockIndexType, uint8_t, uint32_t, bool>>
-           &args) {
+    [](const testing::TestParamInfo<
+        std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
+                   uint32_t, bool>>& args) {
       std::ostringstream oss;
       oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes"
           << std::to_string(std::get<1>(args.param)) << "RestartInterval"
@@ -1510,9 +1733,9 @@ class IndexBlockKVChecksumCorruptionTest : public IndexBlockKVChecksumTest {
   IndexBlockKVChecksumCorruptionTest() = default;
 
   std::unique_ptr<IndexBlockIter> GenerateIndexBlockIter(
-      std::vector<std::string> &separators,
-      std::vector<BlockHandle> &block_handles,
-      std::vector<std::string> &first_keys, int num_record,
+      std::vector<std::string>& separators,
+      std::vector<BlockHandle>& block_handles,
+      std::vector<std::string>& first_keys, int num_record,
       SequenceNumber seqno) {
     SyncPoint::GetInstance()->DisableProcessing();
     block_ =
@@ -1545,7 +1768,7 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(true, false), ::testing::Values(true, false)),
     [](const testing::TestParamInfo<
         std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
-                   uint32_t, bool, bool>> &args) {
+                   uint32_t, bool, bool>>& args) {
       std::ostringstream oss;
       oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes"
           << std::to_string(std::get<1>(args.param)) << "RestartInterval"
@@ -1567,18 +1790,17 @@ TEST_P(IndexBlockKVChecksumCorruptionTest, CorruptEntry) {
       std::vector<BlockHandle> block_handles;
       std::vector<std::string> first_keys;
       GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
-                                 kNumRecords, 0 /* ts_sz */,
-                                 seqno != kDisableGlobalSequenceNumber);
+                                 kNumRecords, 0 /* ts_sz */);
       SyncPoint::GetInstance()->SetCallBack(
-          "BlockIter::UpdateKey::value", [](void *arg) {
-            char *value = static_cast<char *>(arg);
+          "BlockIter::UpdateKey::value", [](void* arg) {
+            char* value = static_cast<char*>(arg);
             // value can be delta-encoded with different lengths, so we corrupt
             // first bytes here to be safe
             ++value[0];
           });
 
       typedef std::unique_ptr<IndexBlockIter> IterPtr;
-      typedef void(IterAPI)(IterPtr & iter, std::string &);
+      typedef void(IterAPI)(IterPtr & iter, std::string&);
       std::string seek_key = first_keys[kNumRecords / 2];
       auto test_seek = [&](IterAPI iter_api) {
         std::unique_ptr<IndexBlockIter> biter = GenerateIndexBlockIter(
@@ -1588,12 +1810,12 @@ TEST_P(IndexBlockKVChecksumCorruptionTest, CorruptEntry) {
         ASSERT_FALSE(biter->Valid());
         ASSERT_TRUE(biter->status().IsCorruption());
       };
-      test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
-      test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
-      test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
+      test_seek([](IterPtr& iter, std::string&) { iter->SeekToFirst(); });
+      test_seek([](IterPtr& iter, std::string&) { iter->SeekToLast(); });
+      test_seek([](IterPtr& iter, std::string& k) { iter->Seek(k); });
 
       typedef void (IndexBlockIter::*IterStepAPI)();
-      auto test_step = [&](IterStepAPI iter_api, std::string &k) {
+      auto test_step = [&](IterStepAPI iter_api, std::string& k) {
         std::unique_ptr<IndexBlockIter> biter = GenerateIndexBlockIter(
             separators, block_handles, first_keys, kNumRecords, seqno);
         SyncPoint::GetInstance()->DisableProcessing();
@@ -1619,7 +1841,7 @@ class MetaIndexBlockKVChecksumCorruptionTest
   MetaIndexBlockKVChecksumCorruptionTest() = default;
 
   std::unique_ptr<MetaBlockIter> GenerateMetaIndexBlockIter(
-      std::vector<std::string> &keys, std::vector<std::string> &values,
+      std::vector<std::string>& keys, std::vector<std::string>& values,
       int num_record) {
     SyncPoint::GetInstance()->DisableProcessing();
     block_ = GenerateMetaIndexBlock(keys, values, num_record);
@@ -1636,7 +1858,7 @@ class MetaIndexBlockKVChecksumCorruptionTest
 INSTANTIATE_TEST_CASE_P(
     P, MetaIndexBlockKVChecksumCorruptionTest,
     ::testing::Values(4, 8) /* block_protection_bytes_per_key */,
-    [](const testing::TestParamInfo<uint8_t> &args) {
+    [](const testing::TestParamInfo<uint8_t>& args) {
       std::ostringstream oss;
       oss << "ProtBytes" << std::to_string(args.param);
       return oss.str();
@@ -1653,14 +1875,14 @@ TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) {
     GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */,
                       24 /* padding_size */);
     SyncPoint::GetInstance()->SetCallBack(
-        "BlockIter::UpdateKey::value", [](void *arg) {
-          char *value = static_cast<char *>(arg);
+        "BlockIter::UpdateKey::value", [](void* arg) {
+          char* value = static_cast<char*>(arg);
           // values generated by GenerateRandomKVs are of length 100
           ++value[10];
         });
 
     typedef std::unique_ptr<MetaBlockIter> IterPtr;
-    typedef void(IterAPI)(IterPtr & iter, std::string &);
+    typedef void(IterAPI)(IterPtr & iter, std::string&);
     typedef void (MetaBlockIter::*IterStepAPI)();
     std::string seek_key = keys[kNumRecords / 2];
     auto test_seek = [&](IterAPI iter_api) {
@@ -1671,12 +1893,12 @@ TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) {
       ASSERT_TRUE(biter->status().IsCorruption());
     };
 
-    test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
-    test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
-    test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
-    test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); });
+    test_seek([](IterPtr& iter, std::string&) { iter->SeekToFirst(); });
+    test_seek([](IterPtr& iter, std::string&) { iter->SeekToLast(); });
+    test_seek([](IterPtr& iter, std::string& k) { iter->Seek(k); });
+    test_seek([](IterPtr& iter, std::string& k) { iter->SeekForPrev(k); });
 
-    auto test_step = [&](IterStepAPI iter_api, const std::string &k) {
+    auto test_step = [&](IterStepAPI iter_api, const std::string& k) {
       IterPtr biter = GenerateMetaIndexBlockIter(keys, values, kNumRecords);
       SyncPoint::GetInstance()->DisableProcessing();
       biter->Seek(k);
@@ -1696,7 +1918,7 @@ TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) {
 }
 }  // namespace ROCKSDB_NAMESPACE
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/table/block_based/block_type.h b/table/block_based/block_type.h
index a9d6a1a773b4..b96f27385493 100644
--- a/table/block_based/block_type.h
+++ b/table/block_based/block_type.h
@@ -27,8 +27,39 @@ enum class BlockType : uint8_t {
   kHashIndexMetadata,
   kMetaIndex,
   kIndex,
+  kUserDefinedIndex,
   // Note: keep kInvalid the last value when adding new enum values.
   kInvalid
 };
 
+inline const char* BlockTypeToString(BlockType block_type) {
+  switch (block_type) {
+    case BlockType::kData:
+      return "Data";
+    case BlockType::kFilter:
+      return "Filter";
+    case BlockType::kFilterPartitionIndex:
+      return "FilterPartitionIndex";
+    case BlockType::kProperties:
+      return "Properties";
+    case BlockType::kCompressionDictionary:
+      return "CompressionDictionary";
+    case BlockType::kRangeDeletion:
+      return "RangeDeletion";
+    case BlockType::kHashIndexPrefixes:
+      return "HashIndexPrefixes";
+    case BlockType::kHashIndexMetadata:
+      return "HashIndexMetadata";
+    case BlockType::kMetaIndex:
+      return "MetaIndex";
+    case BlockType::kIndex:
+      return "Index";
+    case BlockType::kUserDefinedIndex:
+      return "UserDefinedIndex";
+    case BlockType::kInvalid:
+      return "Invalid";
+  }
+  return "Unknown";
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/data_block_footer.cc b/table/block_based/data_block_footer.cc
index 5d5d8ed55e4e..24a31c0d52b5 100644
--- a/table/block_based/data_block_footer.cc
+++ b/table/block_based/data_block_footer.cc
@@ -9,51 +9,55 @@
 
 #include "table/block_based/data_block_footer.h"
 
-#include "rocksdb/table.h"
+#include "util/coding.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-const int kDataBlockIndexTypeBitShift = 31;
+// Hash index bit (bit 31)
+constexpr uint32_t kHashIndexBit = 1u << 31;
 
-// 0x7FFFFFFF
-const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u;
+void DataBlockFooter::EncodeTo(std::string* dst) const {
+  assert(num_restarts <= kMaxNumRestarts);
 
-// 0x7FFFFFFF
-const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u;
-
-uint32_t PackIndexTypeAndNumRestarts(
-    BlockBasedTableOptions::DataBlockIndexType index_type,
-    uint32_t num_restarts) {
-  if (num_restarts > kMaxNumRestarts) {
-    assert(0);  // mute travis "unused" warning
-  }
-
-  uint32_t block_footer = num_restarts;
+  uint32_t packed = num_restarts;
   if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) {
-    block_footer |= 1u << kDataBlockIndexTypeBitShift;
-  } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) {
-    assert(0);
+    packed |= kHashIndexBit;
+  } else {
+    assert(index_type == BlockBasedTableOptions::kDataBlockBinarySearch);
   }
 
-  return block_footer;
+  PutFixed32(dst, packed);
 }
 
-void UnPackIndexTypeAndNumRestarts(
-    uint32_t block_footer,
-    BlockBasedTableOptions::DataBlockIndexType* index_type,
-    uint32_t* num_restarts) {
-  if (index_type) {
-    if (block_footer & 1u << kDataBlockIndexTypeBitShift) {
-      *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
-    } else {
-      *index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
-    }
+Status DataBlockFooter::DecodeFrom(Slice* input) {
+  if (input->size() < kMinEncodedLength) {
+    return Status::Corruption("Block too small for footer");
   }
 
-  if (num_restarts) {
-    *num_restarts = block_footer & kNumRestartsMask;
-    assert(*num_restarts <= kMaxNumRestarts);
+  // Decode from the end of the input
+  const char* footer_ptr = input->data() + input->size() - kMinEncodedLength;
+  uint32_t packed = DecodeFixed32(footer_ptr);
+
+  if (packed & kHashIndexBit) {
+    index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
+    packed &= ~kHashIndexBit;
+  } else {
+    index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
   }
+
+  // Check for reserved/unrecognized feature bits (anything beyond
+  // kMaxNumRestarts)
+  if (packed > kMaxNumRestarts) {
+    return Status::Corruption(
+        "Unrecognized feature in block footer (reserved bits set)");
+  }
+
+  num_restarts = packed;
+
+  // Remove the footer from the input slice
+  input->remove_suffix(kMinEncodedLength);
+
+  return Status::OK();
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/data_block_footer.h b/table/block_based/data_block_footer.h
index c1cfd473099a..74301d0e0a1a 100644
--- a/table/block_based/data_block_footer.h
+++ b/table/block_based/data_block_footer.h
@@ -9,17 +9,63 @@
 
 #pragma once
 
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
 #include "rocksdb/table.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-uint32_t PackIndexTypeAndNumRestarts(
-    BlockBasedTableOptions::DataBlockIndexType index_type,
-    uint32_t num_restarts);
+// DataBlockFooter represents the footer of a data block, containing metadata
+// about the block's structure and features.
+//
+// Current encoding (may expand in future format versions):
+// - A single uint32_t where:
+//   - The low 28 bits store the number of restart points (num_restarts)
+//   - The high 4 bits are reserved for metadata/features:
+//     - Bit 31: Hash index present (kDataBlockBinaryAndHash)
+//     - Bits 28-30: Reserved for future features
+//
+// When any unrecognized reserved bit is set, DecodeFrom() returns an error,
+// allowing older versions to fail gracefully on newer formats.
+//
+// The encoding size is not fixed - future format versions may expand it.
+// Use kMaxEncodedLength for buffer sizing.
+struct DataBlockFooter {
+  // Maximum number of restarts that can be stored (2^28 - 1 = 268,435,455).
+  // This reserves the top 4 bits for metadata (bit 31 for hash index, bits
+  // 28-30 for future features). For historical compatibility purposes, the
+  // limit is adequate because a 4GiB block (maximum due to 32-bit block size)
+  // with restart_interval=1 and minimum entries (12 bytes: 3 varint bytes +
+  // 9-byte internal key + empty value) plus 4-byte restart offsets = 16 bytes
+  // per restart, fits at most (2^32 - 4) / 16 ≈ 268 million restarts.
+  static constexpr uint32_t kMaxNumRestarts = (1u << 28) - 1;
+
+  // Maximum encoded length of a DataBlockFooter (for buffer sizing)
+  // Currently 4 bytes, but may grow in future format versions.
+  static constexpr uint32_t kMaxEncodedLength = sizeof(uint32_t);
+
+  // Minimum encoded length (for current format version)
+  static constexpr uint32_t kMinEncodedLength = sizeof(uint32_t);
+
+  BlockBasedTableOptions::DataBlockIndexType index_type =
+      BlockBasedTableOptions::kDataBlockBinarySearch;
+  uint32_t num_restarts = 0;
+
+  DataBlockFooter() = default;
+  DataBlockFooter(BlockBasedTableOptions::DataBlockIndexType _index_type,
+                  uint32_t _num_restarts)
+      : index_type(_index_type), num_restarts(_num_restarts) {}
+
+  // Appends the encoded footer to dst.
+  void EncodeTo(std::string* dst) const;
 
-void UnPackIndexTypeAndNumRestarts(
-    uint32_t block_footer,
-    BlockBasedTableOptions::DataBlockIndexType* index_type,
-    uint32_t* num_restarts);
+  // Decodes a footer from the end of input (consumes bytes from the end).
+  // Returns an error if reserved/unrecognized feature bits are set.
+  // On success, advances input to exclude the consumed footer bytes.
+  Status DecodeFrom(Slice* input);
+};
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc
index 7970ca1d9f9b..5bf0faa14ab0 100644
--- a/table/block_based/data_block_hash_index_test.cc
+++ b/table/block_based/data_block_hash_index_test.cc
@@ -582,7 +582,8 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2,
   const bool kSkipFilters = true;
   const bool kImmortal = true;
   ASSERT_OK(moptions.table_factory->NewTableReader(
-      TableReaderOptions(ioptions, moptions.prefix_extractor, soptions,
+      TableReaderOptions(ioptions, moptions.prefix_extractor,
+                         nullptr /* compression_manager */, soptions,
                          internal_comparator,
                          0 /* block_protection_bytes_per_key */, !kSkipFilters,
                          !kImmortal, level_),
diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h
index 6f502cc0e59b..e0c0d094554e 100644
--- a/table/block_based/filter_block.h
+++ b/table/block_based/filter_block.h
@@ -68,6 +68,18 @@ class FilterBlockBuilder {
   // For reporting stats on how many entries the builder considered unique
   virtual size_t EstimateEntriesAdded() = 0;
 
+  // Returns an estimate of the current filter size based on the builder's
+  // state. Implementations should cache the estimate and update it via
+  // UpdateFilterSizeEstimate() to avoid recalculating on every key add.
+  //
+  // Can be called at any time during table construction, even before calling
+  // Finish(). Used during table construction to determine when to cut files.
+  virtual size_t CurrentFilterSizeEstimate() = 0;
+
+  // Provides a hook for filter builder when a data block is finalized, such as
+  // to update cached filter size estimates.
+  virtual void OnDataBlockFinalized(uint64_t /* num_data_blocks */) {}
+
   // When using AddWithPrevKey, this must be called before Finish(). (May also
   // be called without AddWithPrevKey, but prev_key_without_ts must be
   // accurate regardless.)
@@ -110,6 +122,11 @@ class FilterBlockBuilder {
     return filter;
   }
 #endif  // NDEBUG
+
+ protected:
+  // Update cached filter size estimate. Subclasses should override to update
+  // estimates based on their internal state.
+  virtual void UpdateFilterSizeEstimate(uint64_t /* num_data_blocks */) {}
 };
 
 // A FilterBlockReader is used to parse filter from SST table.
diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc
index 343e9406b571..32c43ac09f3c 100644
--- a/table/block_based/filter_block_reader_common.cc
+++ b/table/block_based/filter_block_reader_common.cc
@@ -30,8 +30,7 @@ Status FilterBlockReaderCommon<TBlocklike>::ReadFilterBlock(
 
   const Status s = table->RetrieveBlock(
       prefetch_buffer, read_options, rep->filter_handle,
-      UncompressionDict::GetEmptyDict(), filter_block, get_context,
-      lookup_context,
+      /* decomp */ nullptr, filter_block, get_context, lookup_context,
       /* for_compaction */ false, use_cache,
       /* async_read */ false, /* use_block_cache_for_lookup */ true);
 
diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc
index 3df973aa4ca8..cdc4c144c369 100644
--- a/table/block_based/filter_policy.cc
+++ b/table/block_based/filter_policy.cc
@@ -17,7 +17,6 @@
 #include <limits>
 #include <memory>
 
-#include "cache/cache_entry_roles.h"
 #include "cache/cache_reservation_manager.h"
 #include "logging/logging.h"
 #include "port/lang.h"
@@ -29,8 +28,8 @@
 #include "table/block_based/block_based_table_reader.h"
 #include "table/block_based/filter_policy_internal.h"
 #include "table/block_based/full_filter_block.h"
+#include "util/atomic.h"
 #include "util/bloom_impl.h"
-#include "util/coding.h"
 #include "util/hash.h"
 #include "util/math.h"
 #include "util/ribbon_config.h"
@@ -61,7 +60,7 @@ Slice FinishAlwaysTrue(std::unique_ptr<const char[]>* /*buf*/) {
 
 // Base class for filter builders using the XXH3 preview hash,
 // also known as Hash64 or GetSliceHash64.
-class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
+class XXPH3FilterBitsBuilder : public FilterBitsBuilder {
  public:
   explicit XXPH3FilterBitsBuilder(
       std::atomic<int64_t>* aggregate_rounding_balance,
@@ -126,8 +125,11 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
     }
   }
 
+  // Returns an estimate of the number of entries added to the
+  // filter. This method is thread-safe and can be safely called
+  // from background threads during parallel compression.
   size_t EstimateEntriesAdded() override {
-    return hash_entries_info_.entries.size();
+    return hash_entries_info_.entries_count.LoadRelaxed();
   }
 
   Status MaybePostVerify(const Slice& filter_content) override;
@@ -147,6 +149,7 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
       hash_entries_info_.xor_checksum ^= hash;
     }
     hash_entries_info_.entries.push_back(hash);
+    hash_entries_info_.entries_count.FetchAddRelaxed(1);
     if (cache_res_mgr_ &&
         // Traditional rounding to whole bucket size
         ((hash_entries_info_.entries.size() %
@@ -314,6 +317,10 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
     // and has near-minimal peak memory use.
     std::deque<uint64_t> entries;
 
+    // Tracks the number of entries added for thread-safe
+    // size estimation.
+    RelaxedAtomic<size_t> entries_count{0};
+
     // If cache_res_mgr_ != nullptr,
     // it manages cache charge for buckets of hash entries in (new) Bloom
     // or Ribbon Filter construction.
@@ -332,6 +339,8 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
     void Swap(HashEntriesInfo* other) {
       assert(other != nullptr);
       std::swap(entries, other->entries);
+      entries_count.StoreRelaxed(
+          other->entries_count.ExchangeRelaxed(entries_count.LoadRelaxed()));
       std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles);
       std::swap(xor_checksum, other->xor_checksum);
       std::swap(prev_alt_hash, other->prev_alt_hash);
@@ -339,6 +348,7 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
 
     void Reset() {
       entries.clear();
+      entries_count.StoreRelaxed(0);
       cache_res_bucket_handles.clear();
       xor_checksum = 0;
       prev_alt_hash = {};
@@ -1012,9 +1022,6 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder {
   FastLocalBloomBitsBuilder bloom_fallback_;
 };
 
-// for the linker, at least with DEBUG_LEVEL=2
-constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries;
-
 class Standard128RibbonBitsReader : public BuiltinFilterBitsReader {
  public:
   Standard128RibbonBitsReader(const char* data, size_t len_bytes,
@@ -1069,7 +1076,7 @@ class Standard128RibbonBitsReader : public BuiltinFilterBitsReader {
 
 using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>;
 
-class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder {
+class LegacyBloomBitsBuilder : public FilterBitsBuilder {
  public:
   explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log);
 
diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h
index a823bf059732..3e6df57194dc 100644
--- a/table/block_based/filter_policy_internal.h
+++ b/table/block_based/filter_policy_internal.h
@@ -90,6 +90,19 @@ class FilterBitsBuilder {
   // <= the specified number of bytes. Callers (including RocksDB) should
   // only use this result for optimizing performance and not as a guarantee.
   virtual size_t ApproximateNumEntries(size_t bytes) = 0;
+
+  // Calculate number of bytes needed for a new filter, including
+  // metadata. Passing the result to ApproximateNumEntries should
+  // (ideally, usually) return >= the num_entry passed in.
+  // When optimize_filters_for_memory is enabled, this function
+  // is not authoritative but represents a target size that should
+  // be close to the average size.
+  virtual size_t CalculateSpace(size_t num_entries) = 0;
+
+  // Returns an estimate of the FP rate of the returned filter if
+  // `num_entries` keys are added and the filter returned by Finish
+  // is `bytes` bytes.
+  virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0;
 };
 
 // A class that checks if a key can be in filter
@@ -109,24 +122,6 @@ class FilterBitsReader {
   }
 };
 
-// Exposes any extra information needed for testing built-in
-// FilterBitsBuilders
-class BuiltinFilterBitsBuilder : public FilterBitsBuilder {
- public:
-  // Calculate number of bytes needed for a new filter, including
-  // metadata. Passing the result to ApproximateNumEntries should
-  // (ideally, usually) return >= the num_entry passed in.
-  // When optimize_filters_for_memory is enabled, this function
-  // is not authoritative but represents a target size that should
-  // be close to the average size.
-  virtual size_t CalculateSpace(size_t num_entries) = 0;
-
-  // Returns an estimate of the FP rate of the returned filter if
-  // `num_entries` keys are added and the filter returned by Finish
-  // is `bytes` bytes.
-  virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0;
-};
-
 // Base class for RocksDB built-in filter reader with
 // extra useful functionalities for inernal.
 class BuiltinFilterBitsReader : public FilterBitsReader {
diff --git a/table/block_based/flush_block_policy.cc b/table/block_based/flush_block_policy.cc
index d5cc310013f2..f01315ceb970 100644
--- a/table/block_based/flush_block_policy.cc
+++ b/table/block_based/flush_block_policy.cc
@@ -19,7 +19,7 @@
 namespace ROCKSDB_NAMESPACE {
 
 // Flush block by size
-class FlushBlockBySizePolicy : public FlushBlockPolicy {
+class FlushBlockBySizePolicy : public RetargetableFlushBlockPolicy {
  public:
   // @params block_size:           Approximate size of user data packed per
   //                               block.
@@ -28,19 +28,19 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy {
   FlushBlockBySizePolicy(const uint64_t block_size,
                          const uint64_t block_size_deviation, const bool align,
                          const BlockBuilder& data_block_builder)
-      : block_size_(block_size),
+      : RetargetableFlushBlockPolicy(data_block_builder),
+        block_size_(block_size),
         block_size_deviation_limit_(
             ((block_size * (100 - block_size_deviation)) + 99) / 100),
-        align_(align),
-        data_block_builder_(data_block_builder) {}
+        align_(align) {}
 
   bool Update(const Slice& key, const Slice& value) override {
     // it makes no sense to flush when the data block is empty
-    if (data_block_builder_.empty()) {
+    if (data_block_builder_->empty()) {
       return false;
     }
 
-    auto curr_size = data_block_builder_.CurrentSizeEstimate();
+    auto curr_size = data_block_builder_->CurrentSizeEstimate();
 
     // Do flush if one of the below two conditions is true:
     // 1) if the current estimated size already exceeds the block size,
@@ -56,9 +56,9 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy {
       return false;
     }
 
-    const auto curr_size = data_block_builder_.CurrentSizeEstimate();
+    const auto curr_size = data_block_builder_->CurrentSizeEstimate();
     auto estimated_size_after =
-        data_block_builder_.EstimateSizeAfterKV(key, value);
+        data_block_builder_->EstimateSizeAfterKV(key, value);
 
     if (align_) {
       estimated_size_after += BlockBasedTable::kBlockTrailerSize;
@@ -72,7 +72,6 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy {
   const uint64_t block_size_;
   const uint64_t block_size_deviation_limit_;
   const bool align_;
-  const BlockBuilder& data_block_builder_;
 };
 
 FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
@@ -83,10 +82,18 @@ FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
       table_options.block_align, data_block_builder);
 }
 
+std::unique_ptr<RetargetableFlushBlockPolicy> NewFlushBlockBySizePolicy(
+    const uint64_t size, const int deviation,
+    const BlockBuilder& data_block_builder) {
+  return std::make_unique<FlushBlockBySizePolicy>(size, deviation, false,
+                                                  data_block_builder);
+}
+
 FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
     const uint64_t size, const int deviation,
     const BlockBuilder& data_block_builder) {
-  return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder);
+  return NewFlushBlockBySizePolicy(size, deviation, data_block_builder)
+      .release();
 }
 
 static int RegisterFlushBlockPolicyFactories(ObjectLibrary& library,
diff --git a/table/block_based/flush_block_policy_impl.h b/table/block_based/flush_block_policy_impl.h
index 4f79682bc25f..96132304d6e0 100644
--- a/table/block_based/flush_block_policy_impl.h
+++ b/table/block_based/flush_block_policy_impl.h
@@ -3,6 +3,7 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 
+#pragma once
 #include "rocksdb/flush_block_policy.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -37,4 +38,23 @@ class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory {
   }
 };
 
+// For internal use, policy that is stateless after creation, meaning it can
+// be safely re-targeted to another block builder.
+class RetargetableFlushBlockPolicy : public FlushBlockPolicy {
+ public:
+  explicit RetargetableFlushBlockPolicy(const BlockBuilder& data_block_builder)
+      : data_block_builder_(&data_block_builder) {}
+
+  void Retarget(const BlockBuilder& data_block_builder) {
+    data_block_builder_ = &data_block_builder;
+  }
+
+ protected:
+  const BlockBuilder* data_block_builder_;
+};
+
+std::unique_ptr<RetargetableFlushBlockPolicy> NewFlushBlockBySizePolicy(
+    const uint64_t size, const int deviation,
+    const BlockBuilder& data_block_builder);
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc
index af741787a32d..c7d069f3e524 100644
--- a/table/block_based/full_filter_block.cc
+++ b/table/block_based/full_filter_block.cc
@@ -30,6 +30,35 @@ size_t FullFilterBlockBuilder::EstimateEntriesAdded() {
   return filter_bits_builder_->EstimateEntriesAdded();
 }
 
+void FullFilterBlockBuilder::OnDataBlockFinalized(uint64_t num_data_blocks) {
+  UpdateFilterSizeEstimate(num_data_blocks);
+}
+
+size_t FullFilterBlockBuilder::CurrentFilterSizeEstimate() {
+  return estimated_filter_size_;
+}
+
+void FullFilterBlockBuilder::UpdateFilterSizeEstimate(
+    uint64_t num_data_blocks) {
+  size_t entries_added = filter_bits_builder_->EstimateEntriesAdded();
+
+  if (entries_added == 0) {
+    estimated_filter_size_ = 0;
+    return;
+  }
+
+  size_t filter_size = filter_bits_builder_->CalculateSpace(entries_added);
+
+  // Reserve filter space for next data block ~2x the average.
+  size_t buffer_size = 0;
+  if (num_data_blocks > 0) {
+    buffer_size = (filter_size / num_data_blocks) * 2;
+    estimated_filter_size_ = filter_size + buffer_size;
+  } else {
+    estimated_filter_size_ = filter_size;
+  }
+}
+
 void FullFilterBlockBuilder::AddWithPrevKey(
     const Slice& key_without_ts, const Slice& /*prev_key_without_ts*/) {
   FullFilterBlockBuilder::Add(key_without_ts);
diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h
index 784f0eb881c3..96e8300b2086 100644
--- a/table/block_based/full_filter_block.h
+++ b/table/block_based/full_filter_block.h
@@ -57,6 +57,8 @@ class FullFilterBlockBuilder : public FilterBlockBuilder {
     return filter_bits_builder_->EstimateEntriesAdded() == 0;
   }
   size_t EstimateEntriesAdded() override;
+  size_t CurrentFilterSizeEstimate() override;
+  void OnDataBlockFinalized(uint64_t num_data_blocks) override;
   Status Finish(const BlockHandle& last_partition_block_handle, Slice* filter,
                 std::unique_ptr<const char[]>* filter_owner = nullptr) override;
   using FilterBlockBuilder::Finish;
@@ -73,6 +75,8 @@ class FullFilterBlockBuilder : public FilterBlockBuilder {
 
   std::unique_ptr<FilterBitsBuilder> filter_bits_builder_;
 
+  void UpdateFilterSizeEstimate(uint64_t num_data_blocks_written) override;
+
  private:
   // important: all of these might point to invalid addresses
   // at the time of destruction of this filter block. destructor
@@ -80,6 +84,8 @@ class FullFilterBlockBuilder : public FilterBlockBuilder {
   const SliceTransform* const prefix_extractor_;
   const bool whole_key_filtering_;
   std::unique_ptr<const char[]> filter_data_;
+
+  size_t estimated_filter_size_ = 0;
 };
 
 // A FilterBlockReader is used to parse filter from SST table.
diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc
index f90492d8583b..1ce6844741eb 100644
--- a/table/block_based/full_filter_block_test.cc
+++ b/table/block_based/full_filter_block_test.cc
@@ -52,6 +52,13 @@ class TestFilterBitsBuilder : public FilterBitsBuilder {
 
   size_t ApproximateNumEntries(size_t bytes) override { return bytes / 4; }
 
+  size_t CalculateSpace(size_t num_entries) override { return num_entries * 4; }
+
+  double EstimatedFpRate(size_t /* num_entries */,
+                         size_t /* bytes */) override {
+    return 0.0;
+  }
+
  private:
   std::vector<uint32_t> hash_entries_;
 };
@@ -229,6 +236,14 @@ class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder {
     return b_->ApproximateNumEntries(bytes);
   }
 
+  size_t CalculateSpace(size_t num_entries) override {
+    return b_->CalculateSpace(num_entries);
+  }
+
+  double EstimatedFpRate(size_t num_entries, size_t bytes) override {
+    return b_->EstimatedFpRate(num_entries, bytes);
+  }
+
   size_t CountUnique() { return uniq_.size(); }
 };
 
diff --git a/table/block_based/hash_index_reader.cc b/table/block_based/hash_index_reader.cc
index 2cf67367b998..1a6c0aeb0f06 100644
--- a/table/block_based/hash_index_reader.cc
+++ b/table/block_based/hash_index_reader.cc
@@ -76,8 +76,8 @@ Status HashIndexReader::Create(const BlockBasedTable* table,
   BlockFetcher prefixes_block_fetcher(
       file, prefetch_buffer, footer, ro, prefixes_handle, &prefixes_contents,
       ioptions, true /*decompress*/, true /*maybe_compressed*/,
-      BlockType::kHashIndexPrefixes, UncompressionDict::GetEmptyDict(),
-      cache_options, memory_allocator);
+      BlockType::kHashIndexPrefixes, rep->decompressor.get(), cache_options,
+      memory_allocator);
   s = prefixes_block_fetcher.ReadBlockContents();
   if (!s.ok()) {
     return s;
@@ -87,7 +87,7 @@ Status HashIndexReader::Create(const BlockBasedTable* table,
       file, prefetch_buffer, footer, ro, prefixes_meta_handle,
       &prefixes_meta_contents, ioptions, true /*decompress*/,
       true /*maybe_compressed*/, BlockType::kHashIndexMetadata,
-      UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
+      rep->decompressor.get(), cache_options, memory_allocator);
   s = prefixes_meta_block_fetcher.ReadBlockContents();
   if (!s.ok()) {
     // TODO: log error
diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc
index a5a34d65b670..8de01f0b7a22 100644
--- a/table/block_based/index_builder.cc
+++ b/table/block_based/index_builder.cc
@@ -66,7 +66,7 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder(
       break;
     }
     default: {
-      assert(!"Do not recognize the index type ");
+      assert(false && "Do not recognize the index type ");
       break;
     }
   }
@@ -117,6 +117,20 @@ Slice ShortenedIndexBuilder::FindShortInternalKeySuccessor(
   }
 }
 
+void ShortenedIndexBuilder::UpdateIndexSizeEstimate() {
+  uint64_t current_size =
+      must_use_separator_with_seq_.LoadRelaxed()
+          ? index_block_builder_.CurrentSizeEstimate()
+          : index_block_builder_without_seq_.CurrentSizeEstimate();
+
+  uint64_t final_estimate = current_size;
+  if (num_index_entries_ > 0) {
+    // Add buffer to generously account (in most cases) for the next index entry
+    final_estimate += (2 * (current_size / num_index_entries_));
+  }
+  estimated_index_size_.StoreRelaxed(final_estimate);
+}
+
 PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
     const InternalKeyComparator* comparator,
     const bool use_value_delta_encoding,
@@ -152,32 +166,43 @@ PartitionedIndexBuilder::PartitionedIndexBuilder(
       // sub_index_builder. Otherwise, it could be set to true even one of the
       // sub_index_builders could not safely exclude seq from the keys, then it
       // wil be enforced on all sub_index_builders on ::Finish.
-      seperator_is_key_plus_seq_(false),
-      use_value_delta_encoding_(use_value_delta_encoding) {}
+      must_use_separator_with_seq_(false),
+      use_value_delta_encoding_(use_value_delta_encoding) {
+  MakeNewSubIndexBuilder();
+}
 
 void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
-  assert(sub_index_builder_ == nullptr);
-  sub_index_builder_ = std::make_unique<ShortenedIndexBuilder>(
+  auto new_builder = std::make_unique<ShortenedIndexBuilder>(
       comparator_, table_opt_.index_block_restart_interval,
       table_opt_.format_version, use_value_delta_encoding_,
       table_opt_.index_shortening, /* include_first_key */ false, ts_sz_,
       persist_user_defined_timestamps_);
+  sub_index_builder_ = new_builder.get();
+  // Start next partition entry, where we will modify the key
+  entries_.push_back({{}, std::move(new_builder)});
 
-  // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if
-  // seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by
+  BlockBuilder* builder_to_monitor;
+  // Set sub_index_builder_->must_use_separator_with_seq_ to true if
+  // must_use_separator_with_seq_ is true (internal-key mode) (set to false by
   // default on Creation) so that flush policy can point to
   // sub_index_builder_->index_block_builder_
-  if (seperator_is_key_plus_seq_) {
-    sub_index_builder_->seperator_is_key_plus_seq_ = true;
+  if (must_use_separator_with_seq_.LoadRelaxed()) {
+    sub_index_builder_->must_use_separator_with_seq_.StoreRelaxed(true);
+    builder_to_monitor = &sub_index_builder_->index_block_builder_;
+  } else {
+    builder_to_monitor = &sub_index_builder_->index_block_builder_without_seq_;
   }
 
-  flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
-      table_opt_.metadata_block_size, table_opt_.block_size_deviation,
-      // Note: this is sub-optimal since sub_index_builder_ could later reset
-      // seperator_is_key_plus_seq_ but the probability of that is low.
-      sub_index_builder_->seperator_is_key_plus_seq_
-          ? sub_index_builder_->index_block_builder_
-          : sub_index_builder_->index_block_builder_without_seq_));
+  if (flush_policy_ == nullptr) {
+    // Note: some partitions could be sub-optimal since sub_index_builder_
+    // could later reset must_use_separator_with_seq_ but the probability and
+    // impact of that are low.
+    flush_policy_ = NewFlushBlockBySizePolicy(table_opt_.metadata_block_size,
+                                              table_opt_.block_size_deviation,
+                                              *builder_to_monitor);
+  } else {
+    flush_policy_->Retarget(*builder_to_monitor);
+  }
   partition_cut_requested_ = false;
 }
 
@@ -185,101 +210,135 @@ void PartitionedIndexBuilder::RequestPartitionCut() {
   partition_cut_requested_ = true;
 }
 
+std::unique_ptr<IndexBuilder::PreparedIndexEntry>
+PartitionedIndexBuilder::CreatePreparedIndexEntry() {
+  // Fortunately, for ShortenedIndexBuilder, we can prepare an entry from one
+  // similarly configured builder and finish it at another.
+  return entries_.front().value->CreatePreparedIndexEntry();
+}
+void PartitionedIndexBuilder::PrepareIndexEntry(
+    const Slice& last_key_in_current_block,
+    const Slice* first_key_in_next_block, PreparedIndexEntry* out) {
+  // Fortunately, for ShortenedIndexBuilder, we can prepare an entry from one
+  // similarly configured builder and finish it at another. We just have to
+  // keep in mind that this first sub builder keeps track of the original
+  // must_use_separator_with_seq_ in the pipeline that is then propagated.
+  return entries_.front().value->PrepareIndexEntry(
+      last_key_in_current_block, first_key_in_next_block, out);
+}
+
+void PartitionedIndexBuilder::MaybeFlush(const Slice& index_key,
+                                         const BlockHandle& index_value) {
+  bool do_flush = !sub_index_builder_->index_block_builder_.empty() &&
+                  (partition_cut_requested_ ||
+                   flush_policy_->Update(
+                       index_key, EncodedBlockHandle(index_value).AsSlice()));
+  if (do_flush) {
+    assert(entries_.back().value.get() == sub_index_builder_);
+
+    // Update estimate of completed partitions when a partition is flushed
+    estimated_completed_partitions_size_.FetchAddRelaxed(
+        sub_index_builder_->CurrentIndexSizeEstimate());
+
+    cut_filter_block = true;
+    MakeNewSubIndexBuilder();
+  }
+}
+
+void PartitionedIndexBuilder::FinishIndexEntry(const BlockHandle& block_handle,
+                                               PreparedIndexEntry* base_entry,
+                                               bool skip_delta_encoding) {
+  using SPIE = ShortenedIndexBuilder::ShortenedPreparedIndexEntry;
+  SPIE* entry = static_cast<SPIE*>(base_entry);
+
+  MaybeFlush(entry->separator_with_seq, block_handle);
+
+  sub_index_builder_->FinishIndexEntry(block_handle, base_entry,
+                                       skip_delta_encoding);
+  std::swap(entries_.back().key, entry->separator_with_seq);
+
+  // Update cached size estimate when data blocks are finalized for more
+  // accurate tail size estimation. This is needed for parallel compression
+  // which uses FinishIndexEntry() instead of AddIndexEntry().
+  UpdateIndexSizeEstimate();
+
+  if (!must_use_separator_with_seq_.LoadRelaxed() &&
+      entry->must_use_separator_with_seq) {
+    // We need to apply !must_use_separator_with_seq to all sub-index builders
+    must_use_separator_with_seq_.StoreRelaxed(true);
+    flush_policy_->Retarget(sub_index_builder_->index_block_builder_);
+  }
+  // NOTE: not compatible with coupled partitioned filters so don't need to
+  // cut_filter_block
+}
+
 Slice PartitionedIndexBuilder::AddIndexEntry(
     const Slice& last_key_in_current_block,
     const Slice* first_key_in_next_block, const BlockHandle& block_handle,
-    std::string* separator_scratch) {
-  // Note: to avoid two consecuitive flush in the same method call, we do not
-  // check flush policy when adding the last key
-  if (UNLIKELY(first_key_in_next_block == nullptr)) {  // no more keys
-    if (sub_index_builder_ == nullptr) {
-      MakeNewSubIndexBuilder();
-      // Reserve next partition entry, where we will modify the key and
-      // eventually set the value
-      entries_.push_back({{}, {}});
-    }
-    auto sep = sub_index_builder_->AddIndexEntry(
-        last_key_in_current_block, first_key_in_next_block, block_handle,
-        separator_scratch);
-    if (!seperator_is_key_plus_seq_ &&
-        sub_index_builder_->seperator_is_key_plus_seq_) {
-      // We need to apply !seperator_is_key_plus_seq to all sub-index builders
-      seperator_is_key_plus_seq_ = true;
-      // Would associate flush_policy with the appropriate builder, but it won't
-      // be used again with no more keys
-      flush_policy_.reset();
-    }
-    entries_.back().key.assign(sep.data(), sep.size());
-    assert(entries_.back().value == nullptr);
-    std::swap(entries_.back().value, sub_index_builder_);
+    std::string* separator_scratch, bool skip_delta_encoding) {
+  // At least when running without parallel compression, maintain behavior of
+  // avoiding a last index partition with just one entry
+  if (first_key_in_next_block) {
+    MaybeFlush(last_key_in_current_block, block_handle);
+  }
+
+  auto sep = sub_index_builder_->AddIndexEntry(
+      last_key_in_current_block, first_key_in_next_block, block_handle,
+      separator_scratch, skip_delta_encoding);
+  entries_.back().key.assign(sep.data(), sep.size());
+
+  // Update cached size estimate when data blocks are finalized for more
+  // accurate tail size estimation. This ensures the estimate reflects current
+  // state after each data block is added.
+  UpdateIndexSizeEstimate();
+
+  if (!must_use_separator_with_seq_.LoadRelaxed() &&
+      sub_index_builder_->must_use_separator_with_seq_.LoadRelaxed()) {
+    // We need to apply !must_use_separator_with_seq to all sub-index builders
+    must_use_separator_with_seq_.StoreRelaxed(true);
+    flush_policy_->Retarget(sub_index_builder_->index_block_builder_);
+  }
+  if (UNLIKELY(first_key_in_next_block == nullptr)) {
+    // no more keys
     cut_filter_block = true;
-    return sep;
-  } else {
-    // apply flush policy only to non-empty sub_index_builder_
-    if (sub_index_builder_ != nullptr) {
-      std::string handle_encoding;
-      block_handle.EncodeTo(&handle_encoding);
-      bool do_flush =
-          partition_cut_requested_ ||
-          flush_policy_->Update(last_key_in_current_block, handle_encoding);
-      if (do_flush) {
-        assert(entries_.back().value == nullptr);
-        std::swap(entries_.back().value, sub_index_builder_);
-        cut_filter_block = true;
-      }
-    }
-    if (sub_index_builder_ == nullptr) {
-      MakeNewSubIndexBuilder();
-      // Reserve next partition entry, where we will modify the key and
-      // eventually set the value
-      entries_.push_back({{}, {}});
-    }
-    auto sep = sub_index_builder_->AddIndexEntry(
-        last_key_in_current_block, first_key_in_next_block, block_handle,
-        separator_scratch);
-    entries_.back().key.assign(sep.data(), sep.size());
-    if (!seperator_is_key_plus_seq_ &&
-        sub_index_builder_->seperator_is_key_plus_seq_) {
-      // We need to apply !seperator_is_key_plus_seq to all sub-index builders
-      seperator_is_key_plus_seq_ = true;
-      // And use a flush_policy with the appropriate builder
-      flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
-          table_opt_.metadata_block_size, table_opt_.block_size_deviation,
-          sub_index_builder_->index_block_builder_));
-    }
-    return sep;
   }
+  return sep;
 }
 
 Status PartitionedIndexBuilder::Finish(
     IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) {
   if (partition_cnt_ == 0) {
-    partition_cnt_ = entries_.size();
+    sub_index_builder_ = nullptr;
+    if (!entries_.empty()) {
+      // Remove the last entry if it is empty
+      if (entries_.back().value->index_block_builder_.empty()) {
+        assert(entries_.back().key.empty());
+        entries_.pop_back();
+      }
+      partition_cnt_ = entries_.size();
+    }
   }
-  // It must be set to null after last key is added
-  assert(sub_index_builder_ == nullptr);
-  if (finishing_indexes == true) {
+  if (finishing_indexes_ == true) {
     Entry& last_entry = entries_.front();
-    std::string handle_encoding;
-    last_partition_block_handle.EncodeTo(&handle_encoding);
+    EncodedBlockHandle handle_encoding(last_partition_block_handle);
     std::string handle_delta_encoding;
     PutVarsignedint64(
         &handle_delta_encoding,
         last_partition_block_handle.size() - last_encoded_handle_.size());
     last_encoded_handle_ = last_partition_block_handle;
     const Slice handle_delta_encoding_slice(handle_delta_encoding);
-    index_block_builder_.Add(last_entry.key, handle_encoding,
+    index_block_builder_.Add(last_entry.key, handle_encoding.AsSlice(),
                              &handle_delta_encoding_slice);
-    if (!seperator_is_key_plus_seq_) {
+    if (!must_use_separator_with_seq_.LoadRelaxed()) {
       index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
-                                           handle_encoding,
+                                           handle_encoding.AsSlice(),
                                            &handle_delta_encoding_slice);
     }
     entries_.pop_front();
   }
   // If there is no sub_index left, then return the 2nd level index.
   if (UNLIKELY(entries_.empty())) {
-    if (seperator_is_key_plus_seq_) {
+    if (must_use_separator_with_seq_.LoadRelaxed()) {
       index_blocks->index_block_contents = index_block_builder_.Finish();
     } else {
       index_blocks->index_block_contents =
@@ -293,13 +352,59 @@ Status PartitionedIndexBuilder::Finish(
     // expect more calls to Finish
     Entry& entry = entries_.front();
     // Apply the policy to all sub-indexes
-    entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_;
+    entry.value->must_use_separator_with_seq_.StoreRelaxed(
+        must_use_separator_with_seq_.LoadRelaxed());
     auto s = entry.value->Finish(index_blocks);
     index_size_ += index_blocks->index_block_contents.size();
-    finishing_indexes = true;
+    finishing_indexes_ = true;
     return s.ok() ? Status::Incomplete() : s;
   }
 }
 
 size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; }
+
+void PartitionedIndexBuilder::UpdateIndexSizeEstimate() {
+  uint64_t total_size = 0;
+
+  // Ignore last entry which is a placeholder for the partition being built
+  size_t completed_partitions = entries_.size() > 0 ? entries_.size() - 1 : 0;
+
+  // Use running estimate of completed partitions instead of IndexSize() which
+  // is only available after calling Finish().
+  uint64_t completed_partitions_size =
+      estimated_completed_partitions_size_.LoadRelaxed();
+  total_size += completed_partitions_size;
+
+  // Add current active partition size if it exists
+  uint64_t current_sub_index_size = 0;
+  if (sub_index_builder_ != nullptr) {
+    current_sub_index_size = sub_index_builder_->CurrentIndexSizeEstimate();
+    total_size += current_sub_index_size;
+  }
+
+  // Add buffer for top-level index and next partition
+  uint64_t buffer_size = 0;
+  if (completed_partitions > 0) {
+    // Calculate top-level index size. Each top-level entry consists of:
+    // separator key (~20-50 bytes) + BlockHandle (~20 bytes) + overhead
+    // Estimate ~70 bytes per top-level entry as a reasonable average
+    auto estimated_top_level_size = completed_partitions * 70;
+    total_size += completed_partitions * 70;
+
+    // Buffer for next partition + next top-level entry
+    uint64_t avg_partition_size =
+        completed_partitions_size / completed_partitions;
+    uint64_t avg_top_level_entry_size =
+        estimated_top_level_size / completed_partitions;
+
+    buffer_size = 2 * (avg_partition_size + avg_top_level_entry_size);
+    total_size += buffer_size;
+  } else if (sub_index_builder_ != nullptr) {
+    // For the first partition, estimate using the current partition's state
+    buffer_size = 2 * current_sub_index_size;
+    total_size += buffer_size;
+  }
+  estimated_index_size_.StoreRelaxed(total_size);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h
index 99b348b2ff1d..a33935c051d3 100644
--- a/table/block_based/index_builder.h
+++ b/table/block_based/index_builder.h
@@ -18,7 +18,9 @@
 #include "rocksdb/comparator.h"
 #include "table/block_based/block_based_table_factory.h"
 #include "table/block_based/block_builder.h"
+#include "table/block_based/flush_block_policy_impl.h"
 #include "table/format.h"
+#include "util/atomic.h"
 
 namespace ROCKSDB_NAMESPACE {
 // The interface for building index.
@@ -46,7 +48,7 @@ class IndexBuilder {
   //     primary index.
   struct IndexBlocks {
     Slice index_block_contents;
-    std::unordered_map<std::string, Slice> meta_blocks;
+    std::unordered_map<std::string, std::pair<BlockType, Slice>> meta_blocks;
   };
   IndexBuilder(const InternalKeyComparator* comparator, size_t ts_sz,
                bool persist_user_defined_timestamps)
@@ -67,6 +69,9 @@ class IndexBuilder {
   //                           the last one in the table
   // @separator_scratch: a scratch buffer to back a computed separator between
   //                     those, as needed. May be modified on each call.
+  // @skip_delta_encoding: whether to skip delta encoding for this index entry
+  //                       for cases of violating the assumption that this
+  //                       block_handle starts where the last one ended.
   // @return: the key or separator stored in the index, which could be
   //          last_key_in_current_block or a computed separator backed by
   //          separator_scratch or last_key_in_current_block.
@@ -74,11 +79,57 @@ class IndexBuilder {
   virtual Slice AddIndexEntry(const Slice& last_key_in_current_block,
                               const Slice* first_key_in_next_block,
                               const BlockHandle& block_handle,
-                              std::string* separator_scratch) = 0;
+                              std::string* separator_scratch,
+                              bool skip_delta_encoding) = 0;
+
+  // An abstract (extensible) holder for passing data from PrepareIndexEntry to
+  // FinishIndexEntry (see below).
+  struct PreparedIndexEntry {
+    virtual ~PreparedIndexEntry() = default;
+  };
+
+  // Parallel compression/construction alternative to AddIndexEntry, 1/3
+  //
+  // This function creates a holder for data that needs to be passed from
+  // PrepareIndexEntry to FinishIndexEntry, depending on the implementation
+  // of those. Few of these are created and reused, so construction/destruction
+  // performance is not critical.
+  virtual std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() = 0;
+
+  // Parallel compression/construction alternative to AddIndexEntry, 2/3
+  //
+  // One thread calls this function for successive index entries to compute and
+  // record in `out` what is needed to build the index entry EXCEPT for the
+  // BlockHandle, which will only be known later. That thread is generally the
+  // same thread as calls every other function such as OnKeyAdded EXCEPT
+  // FinishIndexEntry (see below). This function should be considered "mostly
+  // stateless" but might modify state distinct from what is modified by
+  // FinishIndexEntry. Ideally synchronization within the IndexBuilder can be
+  // avoided.
+  //
+  // The passed-in PreparedIndexEntry object is likely reused so might be
+  // passed-in in any state.
+  virtual void PrepareIndexEntry(const Slice& last_key_in_current_block,
+                                 const Slice* first_key_in_next_block,
+                                 PreparedIndexEntry* out) = 0;
+
+  // Parallel compression/construction alternative to AddIndexEntry, 3/3
+  //
+  // This function is called by a different thread than PrepareIndexEntry, but
+  // is called on entries in the same order as PrepareIndexEntry, passed in the
+  // PreparedIndexEntry objects populated by PrepareIndexEntry. This function
+  // finishes the same effect of AddIndexEntry but split across a few functions.
+  //
+  // External synchronization ensures Finish is only called after all the
+  // FinishIndexEntry calls have completed.
+  virtual void FinishIndexEntry(const BlockHandle& block_handle,
+                                PreparedIndexEntry* entry,
+                                bool skip_delta_encoding) = 0;
 
   // This method will be called whenever a key is added. The subclasses may
   // override OnKeyAdded() if they need to collect additional information.
-  virtual void OnKeyAdded(const Slice& /*key*/) {}
+  virtual void OnKeyAdded(const Slice& /*key*/,
+                          const std::optional<Slice>& /*value*/) {}
 
   // Inform the index builder that all entries has been written. Block builder
   // may therefore perform any operation required for block finalization.
@@ -108,7 +159,17 @@ class IndexBuilder {
   // Get the size for index block. Must be called after ::Finish.
   virtual size_t IndexSize() const = 0;
 
-  virtual bool seperator_is_key_plus_seq() { return true; }
+  // Returns an estimate of the current index size based on the builder's state.
+  // Implementations should cache the estimate and update it via
+  // UpdateIndexSizeEstimate() to avoid recalculating on every key add,
+  // which is critical for performance in the compaction hot path.
+  //
+  // This function is only called by the SST "emit thread" but must be
+  // thread safe with concurrent calls to UpdateIndexSizeEstimate() from another
+  // thread (such as during parallel compression).
+  virtual uint64_t CurrentIndexSizeEstimate() const = 0;
+
+  virtual bool separator_is_key_plus_seq() { return true; }
 
  protected:
   // Given the last key in current block and the first key in the next block,
@@ -116,7 +177,7 @@ class IndexBuilder {
   // can be used as separator.
   inline bool ShouldUseKeyPlusSeqAsSeparator(
       const Slice& last_key_in_current_block,
-      const Slice& first_key_in_next_block) {
+      const Slice& first_key_in_next_block) const {
     Slice l_user_key = ExtractUserKey(last_key_in_current_block);
     Slice r_user_key = ExtractUserKey(first_key_in_next_block);
     // If user defined timestamps are not persisted. All the user keys will
@@ -130,6 +191,13 @@ class IndexBuilder {
                      l_user_key, r_user_key) == 0;
   }
 
+  // Updates the cached index size estimate used by CurrentIndexSizeEstimate().
+  //
+  // This function can be called from the SST "write thread" (via
+  // FinishIndexEntry()), and needs to be thread safe with
+  // CurrentIndexSizeEstimate() called from the SST "emit thread".
+  virtual void UpdateIndexSizeEstimate() {}
+
   const InternalKeyComparator* comparator_;
   // Size of user-defined timestamp in bytes.
   size_t ts_sz_;
@@ -177,63 +245,78 @@ class ShortenedIndexBuilder : public IndexBuilder {
         include_first_key_(include_first_key),
         shortening_mode_(shortening_mode) {
     // Making the default true will disable the feature for old versions
-    seperator_is_key_plus_seq_ = (format_version <= 2);
+    must_use_separator_with_seq_.StoreRelaxed(format_version <= 2);
   }
 
-  void OnKeyAdded(const Slice& key) override {
+  void OnKeyAdded(const Slice& key,
+                  const std::optional<Slice>& /*value*/) override {
     if (include_first_key_ && current_block_first_internal_key_.empty()) {
       current_block_first_internal_key_.assign(key.data(), key.size());
     }
   }
 
-  Slice AddIndexEntry(const Slice& last_key_in_current_block,
-                      const Slice* first_key_in_next_block,
-                      const BlockHandle& block_handle,
-                      std::string* separator_scratch) override {
-    Slice separator;
+  Slice GetSeparatorWithSeq(const Slice& last_key_in_current_block,
+                            const Slice* first_key_in_next_block,
+                            std::string* separator_scratch) {
+    Slice separator_with_seq;
     if (first_key_in_next_block != nullptr) {
       if (shortening_mode_ !=
           BlockBasedTableOptions::IndexShorteningMode::kNoShortening) {
-        separator = FindShortestInternalKeySeparator(
+        separator_with_seq = FindShortestInternalKeySeparator(
             *comparator_->user_comparator(), last_key_in_current_block,
             *first_key_in_next_block, separator_scratch);
       } else {
-        separator = last_key_in_current_block;
+        separator_with_seq = last_key_in_current_block;
       }
-      if (!seperator_is_key_plus_seq_ &&
+      if (!must_use_separator_with_seq_.LoadRelaxed() &&
           ShouldUseKeyPlusSeqAsSeparator(last_key_in_current_block,
                                          *first_key_in_next_block)) {
-        seperator_is_key_plus_seq_ = true;
+        must_use_separator_with_seq_.StoreRelaxed(true);
       }
     } else {
       if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode::
                                   kShortenSeparatorsAndSuccessor) {
-        separator = FindShortInternalKeySuccessor(
+        separator_with_seq = FindShortInternalKeySuccessor(
             *comparator_->user_comparator(), last_key_in_current_block,
             separator_scratch);
       } else {
-        separator = last_key_in_current_block;
+        separator_with_seq = last_key_in_current_block;
       }
     }
+    return separator_with_seq;
+  }
 
-    assert(!include_first_key_ || !current_block_first_internal_key_.empty());
+  Slice GetFirstInternalKey(std::string* first_internal_key_buf) const {
+    if (!include_first_key_) {
+      return Slice();
+    }
+    assert(!current_block_first_internal_key_.empty());
     // When UDT should not be persisted, the index block builders take care of
     // stripping UDT from the key, for the first internal key contained in the
     // IndexValue, we need to explicitly do the stripping here before passing
     // it to the block builders.
-    std::string first_internal_key_buf;
     Slice first_internal_key = current_block_first_internal_key_;
     if (!current_block_first_internal_key_.empty() && ts_sz_ > 0 &&
         !persist_user_defined_timestamps_) {
-      StripTimestampFromInternalKey(&first_internal_key_buf,
+      first_internal_key_buf->clear();
+      StripTimestampFromInternalKey(first_internal_key_buf,
                                     current_block_first_internal_key_, ts_sz_);
-      first_internal_key = first_internal_key_buf;
+      first_internal_key = *first_internal_key_buf;
     }
+    return first_internal_key;
+  }
+
+  void AddIndexEntryImpl(const Slice& separator_with_seq,
+                         const Slice& first_internal_key,
+                         const BlockHandle& block_handle,
+                         bool must_use_separator_with_seq,
+                         bool skip_delta_encoding) {
     IndexValue entry(block_handle, first_internal_key);
     std::string encoded_entry;
     std::string delta_encoded_entry;
     entry.EncodeTo(&encoded_entry, include_first_key_, nullptr);
-    if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) {
+    if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull() &&
+        !skip_delta_encoding) {
       entry.EncodeTo(&delta_encoded_entry, include_first_key_,
                      &last_encoded_handle_);
     } else {
@@ -252,21 +335,98 @@ class ShortenedIndexBuilder : public IndexBuilder {
     // away the UDT from key in index block as data block does the same thing.
     // What are the implications if a "FindShortInternalKeySuccessor"
     // optimization is provided.
-    index_block_builder_.Add(separator, encoded_entry,
-                             &delta_encoded_entry_slice);
-    if (!seperator_is_key_plus_seq_) {
+    index_block_builder_.Add(separator_with_seq, encoded_entry,
+                             &delta_encoded_entry_slice, skip_delta_encoding);
+    if (!must_use_separator_with_seq) {
       index_block_builder_without_seq_.Add(
-          ExtractUserKey(separator), encoded_entry, &delta_encoded_entry_slice);
+          ExtractUserKey(separator_with_seq), encoded_entry,
+          &delta_encoded_entry_slice, skip_delta_encoding);
+    }
+
+    ++num_index_entries_;
+    UpdateIndexSizeEstimate();
+  }
+
+  Slice AddIndexEntry(const Slice& last_key_in_current_block,
+                      const Slice* first_key_in_next_block,
+                      const BlockHandle& block_handle,
+                      std::string* separator_scratch,
+                      bool skip_delta_encoding) override {
+    Slice separator_with_seq = GetSeparatorWithSeq(
+        last_key_in_current_block, first_key_in_next_block, separator_scratch);
+
+    std::string first_internal_key_buf;
+    Slice first_internal_key = GetFirstInternalKey(&first_internal_key_buf);
+
+    AddIndexEntryImpl(separator_with_seq, first_internal_key, block_handle,
+                      must_use_separator_with_seq_.LoadRelaxed(),
+                      skip_delta_encoding);
+    current_block_first_internal_key_.clear();
+    return separator_with_seq;
+  }
+
+  struct ShortenedPreparedIndexEntry : public PreparedIndexEntry {
+    std::string separator_with_seq;
+    std::string first_internal_key;
+    bool must_use_separator_with_seq = false;
+    void SaveFrom(const Slice& from_separator,
+                  const Slice& from_first_internal_key,
+                  bool from_must_use_separator_with_seq) {
+      assert(from_separator.size() >= kNumInternalBytes);
+      if (from_separator.data() == separator_with_seq.data()) {
+        // No need to copy
+        assert(from_separator.size() == separator_with_seq.size());
+      } else {
+        // Copy the separator
+        separator_with_seq.assign(from_separator.data(), from_separator.size());
+      }
+      // first_internal_key is optional, so it may be empty.
+      assert(from_first_internal_key.empty() ||
+             from_first_internal_key.size() >= kNumInternalBytes);
+      if (from_first_internal_key.data() == first_internal_key.data()) {
+        // No need to copy
+        assert(from_first_internal_key.size() == first_internal_key.size());
+      } else {
+        // Copy the first internal key
+        first_internal_key.assign(from_first_internal_key.data(),
+                                  from_first_internal_key.size());
+      }
+      must_use_separator_with_seq = from_must_use_separator_with_seq;
     }
+  };
 
+  std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() override {
+    return std::make_unique<ShortenedPreparedIndexEntry>();
+  }
+
+  void PrepareIndexEntry(const Slice& last_key_in_current_block,
+                         const Slice* first_key_in_next_block,
+                         PreparedIndexEntry* out) override {
+    ShortenedPreparedIndexEntry* entry =
+        static_cast<ShortenedPreparedIndexEntry*>(out);
+    Slice separator =
+        GetSeparatorWithSeq(last_key_in_current_block, first_key_in_next_block,
+                            &entry->separator_with_seq);
+    Slice first_internal_key = GetFirstInternalKey(&entry->first_internal_key);
+    entry->SaveFrom(separator, first_internal_key,
+                    must_use_separator_with_seq_.LoadRelaxed());
     current_block_first_internal_key_.clear();
-    return separator;
+  }
+
+  void FinishIndexEntry(const BlockHandle& block_handle,
+                        PreparedIndexEntry* base_entry,
+                        bool skip_delta_encoding) override {
+    ShortenedPreparedIndexEntry* entry =
+        static_cast<ShortenedPreparedIndexEntry*>(base_entry);
+    AddIndexEntryImpl(entry->separator_with_seq, entry->first_internal_key,
+                      block_handle, entry->must_use_separator_with_seq,
+                      skip_delta_encoding);
   }
 
   using IndexBuilder::Finish;
   Status Finish(IndexBlocks* index_blocks,
                 const BlockHandle& /*last_partition_block_handle*/) override {
-    if (seperator_is_key_plus_seq_) {
+    if (must_use_separator_with_seq_.LoadRelaxed()) {
       index_blocks->index_block_contents = index_block_builder_.Finish();
     } else {
       index_blocks->index_block_contents =
@@ -278,8 +438,15 @@ class ShortenedIndexBuilder : public IndexBuilder {
 
   size_t IndexSize() const override { return index_size_; }
 
-  bool seperator_is_key_plus_seq() override {
-    return seperator_is_key_plus_seq_;
+  uint64_t CurrentIndexSizeEstimate() const override {
+    return estimated_index_size_.LoadRelaxed();
+  }
+
+  // Updates the cached size estimate to minimize CPU usage in hot path
+  void UpdateIndexSizeEstimate() override;
+
+  bool separator_is_key_plus_seq() override {
+    return must_use_separator_with_seq_.LoadRelaxed();
   }
 
   // Changes *key to a short string >= *key.
@@ -297,13 +464,20 @@ class ShortenedIndexBuilder : public IndexBuilder {
 
  private:
   BlockBuilder index_block_builder_;
+  // TODO: consider optimizing to only one builder. When discovering that
+  // sequence numbers are needed, read existing entries without seq and rewrite
+  // them with seq (which should be trivial to populate since seq wasn't needed
+  // before).
   BlockBuilder index_block_builder_without_seq_;
   const bool use_value_delta_encoding_;
-  bool seperator_is_key_plus_seq_;
+  RelaxedAtomic<bool> must_use_separator_with_seq_;
   const bool include_first_key_;
   BlockBasedTableOptions::IndexShorteningMode shortening_mode_;
   BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle();
   std::string current_block_first_internal_key_;
+  uint64_t num_index_entries_ = 0;
+  // Cache for index size estimate to avoid recalculating in hot path
+  RelaxedAtomic<uint64_t> estimated_index_size_{0};
 };
 
 // HashIndexBuilder contains a binary-searchable primary index and the
@@ -351,14 +525,35 @@ class HashIndexBuilder : public IndexBuilder {
   Slice AddIndexEntry(const Slice& last_key_in_current_block,
                       const Slice* first_key_in_next_block,
                       const BlockHandle& block_handle,
-                      std::string* separator_scratch) override {
+                      std::string* separator_scratch,
+                      bool skip_delta_encoding) override {
     ++current_restart_index_;
     return primary_index_builder_.AddIndexEntry(
         last_key_in_current_block, first_key_in_next_block, block_handle,
-        separator_scratch);
+        separator_scratch, skip_delta_encoding);
+  }
+
+  std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() override {
+    return primary_index_builder_.CreatePreparedIndexEntry();
+  }
+
+  void PrepareIndexEntry(const Slice& last_key_in_current_block,
+                         const Slice* first_key_in_next_block,
+                         PreparedIndexEntry* out) override {
+    ++current_restart_index_;
+    primary_index_builder_.PrepareIndexEntry(last_key_in_current_block,
+                                             first_key_in_next_block, out);
+  }
+
+  void FinishIndexEntry(const BlockHandle& block_handle,
+                        PreparedIndexEntry* entry,
+                        bool skip_delta_encoding) override {
+    primary_index_builder_.FinishIndexEntry(block_handle, entry,
+                                            skip_delta_encoding);
   }
 
-  void OnKeyAdded(const Slice& key) override {
+  void OnKeyAdded(const Slice& key,
+                  const std::optional<Slice>& /*value*/) override {
     auto key_prefix = hash_key_extractor_->Transform(key);
     bool is_first_entry = pending_block_num_ == 0;
 
@@ -393,9 +588,9 @@ class HashIndexBuilder : public IndexBuilder {
     Status s = primary_index_builder_.Finish(index_blocks,
                                              last_partition_block_handle);
     index_blocks->meta_blocks.insert(
-        {kHashIndexPrefixesBlock.c_str(), prefix_block_});
-    index_blocks->meta_blocks.insert(
-        {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
+        {kHashIndexPrefixesBlock.c_str(), {BlockType::kIndex, prefix_block_}});
+    index_blocks->meta_blocks.insert({kHashIndexPrefixesMetadataBlock.c_str(),
+                                      {BlockType::kIndex, prefix_meta_block_}});
     return s;
   }
 
@@ -404,8 +599,10 @@ class HashIndexBuilder : public IndexBuilder {
            prefix_meta_block_.size();
   }
 
-  bool seperator_is_key_plus_seq() override {
-    return primary_index_builder_.seperator_is_key_plus_seq();
+  uint64_t CurrentIndexSizeEstimate() const override { return 0; }
+
+  bool separator_is_key_plus_seq() override {
+    return primary_index_builder_.separator_is_key_plus_seq();
   }
 
  private:
@@ -461,7 +658,17 @@ class PartitionedIndexBuilder : public IndexBuilder {
   Slice AddIndexEntry(const Slice& last_key_in_current_block,
                       const Slice* first_key_in_next_block,
                       const BlockHandle& block_handle,
-                      std::string* separator_scratch) override;
+                      std::string* separator_scratch,
+                      bool skip_delta_encoding) override;
+
+  std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() override;
+  void PrepareIndexEntry(const Slice& last_key_in_current_block,
+                         const Slice* first_key_in_next_block,
+                         PreparedIndexEntry* out) override;
+  void FinishIndexEntry(const BlockHandle& block_handle,
+                        PreparedIndexEntry* entry,
+                        bool skip_delta_encoding) override;
+  void MaybeFlush(const Slice& index_key, const BlockHandle& index_value);
 
   Status Finish(IndexBlocks* index_blocks,
                 const BlockHandle& last_partition_block_handle) override;
@@ -470,6 +677,12 @@ class PartitionedIndexBuilder : public IndexBuilder {
   size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; }
   size_t NumPartitions() const;
 
+  // Returns a cached estimate of the current index size. This
+  // estimate is updated when data blocks are added.
+  uint64_t CurrentIndexSizeEstimate() const override {
+    return estimated_index_size_.LoadRelaxed();
+  }
+
   inline bool ShouldCutFilterBlock() {
     // Current policy is to align the partitions of index and filters
     if (cut_filter_block) {
@@ -488,8 +701,10 @@ class PartitionedIndexBuilder : public IndexBuilder {
   // cutting the next partition
   void RequestPartitionCut();
 
-  bool seperator_is_key_plus_seq() override {
-    return seperator_is_key_plus_seq_;
+  // This function must be thread safe because multiple worker threads might
+  // update the index builder state during parallel compression.
+  bool separator_is_key_plus_seq() override {
+    return must_use_separator_with_seq_.LoadRelaxed();
   }
 
   bool get_use_value_delta_encoding() const {
@@ -503,6 +718,7 @@ class PartitionedIndexBuilder : public IndexBuilder {
   size_t partition_cnt_ = 0;
 
   void MakeNewSubIndexBuilder();
+  void UpdateIndexSizeEstimate() override;
 
   struct Entry {
     std::string key;
@@ -515,14 +731,14 @@ class PartitionedIndexBuilder : public IndexBuilder {
   std::list<Entry> entries_;
   BlockBuilder index_block_builder_;              // top-level index builder
   BlockBuilder index_block_builder_without_seq_;  // same for user keys
-  // the active partition index builder
-  std::unique_ptr<ShortenedIndexBuilder> sub_index_builder_;
+  // the active partition index builder (owned by an Entry in entries_)
+  ShortenedIndexBuilder* sub_index_builder_;
   // the last key in the active partition index builder
-  std::unique_ptr<FlushBlockPolicy> flush_policy_;
+  std::unique_ptr<RetargetableFlushBlockPolicy> flush_policy_;
   // true if Finish is called once but not complete yet.
-  bool finishing_indexes = false;
+  bool finishing_indexes_ = false;
   const BlockBasedTableOptions& table_opt_;
-  bool seperator_is_key_plus_seq_;
+  RelaxedAtomic<bool> must_use_separator_with_seq_;
   bool use_value_delta_encoding_;
   // true if an external entity (such as filter partition builder) request
   // cutting the next partition
@@ -530,5 +746,9 @@ class PartitionedIndexBuilder : public IndexBuilder {
   // true if it should cut the next filter partition block
   bool cut_filter_block = false;
   BlockHandle last_encoded_handle_;
+  // Cached estimate of current index size, updated when data blocks are added
+  RelaxedAtomic<uint64_t> estimated_index_size_{0};
+  // Running estimate of completed partitions total size
+  RelaxedAtomic<uint64_t> estimated_completed_partitions_size_{0};
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/index_reader_common.cc b/table/block_based/index_reader_common.cc
index 2c0b480e2f3f..6b0a6ab71dce 100644
--- a/table/block_based/index_reader_common.cc
+++ b/table/block_based/index_reader_common.cc
@@ -26,9 +26,9 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
   assert(rep != nullptr);
 
   const Status s = table->RetrieveBlock(
-      prefetch_buffer, read_options, rep->index_handle,
-      UncompressionDict::GetEmptyDict(), &index_block->As<Block_kIndex>(),
-      get_context, lookup_context, /* for_compaction */ false, use_cache,
+      prefetch_buffer, read_options, rep->index_handle, rep->decompressor.get(),
+      &index_block->As<Block_kIndex>(), get_context, lookup_context,
+      /* for_compaction */ false, use_cache,
       /* async_read */ false, /* use_block_cache_for_lookup */ true);
 
   return s;
diff --git a/table/block_based/mock_block_based_table.h b/table/block_based/mock_block_based_table.h
index 13f3dfaee14b..481589076f4a 100644
--- a/table/block_based/mock_block_based_table.h
+++ b/table/block_based/mock_block_based_table.h
@@ -32,7 +32,7 @@ class MockBlockBasedTableTester {
 
   explicit MockBlockBasedTableTester(const FilterPolicy* filter_policy)
       : MockBlockBasedTableTester(
-            std::shared_ptr<const FilterPolicy>(filter_policy)){};
+            std::shared_ptr<const FilterPolicy>(filter_policy)) {};
 
   explicit MockBlockBasedTableTester(
       std::shared_ptr<const FilterPolicy> filter_policy)
diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index ce0b691a47f3..95c1cf32a2e8 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -143,6 +143,7 @@ void PartitionedFilterBlockBuilder::CutAFilterBlock(const Slice* next_key,
     ikey = p_index_builder_->GetPartitionKey();
   }
   filters_.push_back({std::move(ikey), std::move(filter_data), filter});
+  completed_partitions_size_.FetchAddRelaxed(filter.size());
   partitioned_filters_construction_status_.UpdateIfOk(
       filter_construction_status);
 
@@ -209,6 +210,56 @@ size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() {
   return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded();
 }
 
+size_t PartitionedFilterBlockBuilder::CurrentFilterSizeEstimate() {
+  size_t active_partition_size =
+      filter_bits_builder_->EstimateEntriesAdded() * 2;  // 2 bytes per key
+
+  return estimated_filter_size_.LoadRelaxed() + active_partition_size;
+}
+
+void PartitionedFilterBlockBuilder::OnDataBlockFinalized(
+    uint64_t num_data_blocks) {
+  UpdateFilterSizeEstimate(num_data_blocks);
+}
+
+void PartitionedFilterBlockBuilder::UpdateFilterSizeEstimate(
+    uint64_t num_data_blocks) {
+  size_t partitions_size = completed_partitions_size_.LoadRelaxed();
+
+  // Reserve space if no partitions have been cut
+  size_t active_filter_estimate = 0;
+  if (partitions_size == 0) {
+    size_t avg_bytes_per_entry =
+        2;  // 2 bytes per entry, approx 15 bits per key
+
+    // Estimate using keys_per_partition_ since we expect to cut the first
+    // partition once it reaches approx. this many entries.
+    active_filter_estimate = keys_per_partition_ * avg_bytes_per_entry;
+
+    // Add a 2x buffer (for top-level index, etc.)
+    active_filter_estimate = active_filter_estimate * 2;
+  }
+  size_t filter_estimate = std::max(partitions_size, active_filter_estimate);
+
+  // Estimate top-level partition index size
+  if (p_index_builder_->separator_is_key_plus_seq()) {
+    filter_estimate += index_on_filter_block_builder_.CurrentSizeEstimate();
+  } else {
+    filter_estimate +=
+        index_on_filter_block_builder_without_seq_.CurrentSizeEstimate();
+  }
+
+  // Reserve filter space for the next data block
+  size_t reserved = 0;
+  if (num_data_blocks > 0) {
+    reserved = (filter_estimate / num_data_blocks) *
+               2;  // 2x average size per data block
+    estimated_filter_size_.StoreRelaxed(filter_estimate + reserved);
+  } else {
+    estimated_filter_size_.StoreRelaxed(filter_estimate);
+  }
+}
+
 void PartitionedFilterBlockBuilder::PrevKeyBeforeFinish(
     const Slice& prev_key_without_ts) {
   assert(prev_key_without_ts.compare(DEBUG_add_with_prev_key_called_
@@ -240,7 +291,7 @@ Status PartitionedFilterBlockBuilder::Finish(
 
     index_on_filter_block_builder_.Add(e.ikey, handle_encoding,
                                        &handle_delta_encoding_slice);
-    if (!p_index_builder_->seperator_is_key_plus_seq()) {
+    if (!p_index_builder_->separator_is_key_plus_seq()) {
       index_on_filter_block_builder_without_seq_.Add(
           ExtractUserKey(e.ikey), handle_encoding,
           &handle_delta_encoding_slice);
@@ -267,7 +318,7 @@ Status PartitionedFilterBlockBuilder::Finish(
     if (UNLIKELY(filters_.empty())) {
       if (!index_on_filter_block_builder_.empty()) {
         // Simplest to just add them all at the end
-        if (p_index_builder_->seperator_is_key_plus_seq()) {
+        if (p_index_builder_->separator_is_key_plus_seq()) {
           *filter = index_on_filter_block_builder_.Finish();
         } else {
           *filter = index_on_filter_block_builder_without_seq_.Finish();
@@ -413,8 +464,7 @@ Status PartitionedFilterBlockReader::GetFilterPartitionBlock(
 
   const Status s = table()->RetrieveBlock(
       prefetch_buffer, read_options, fltr_blk_handle,
-      UncompressionDict::GetEmptyDict(), filter_block, get_context,
-      lookup_context,
+      /* decomp */ nullptr, filter_block, get_context, lookup_context,
       /* for_compaction */ false, /* use_cache */ true,
       /* async_read */ false, /* use_block_cache_for_lookup */ true);
 
@@ -592,7 +642,8 @@ Status PartitionedFilterBlockReader::CacheDependencies(
                                   /*usage=*/FilePrefetchBufferUsage::kUnknown);
 
     IOOptions opts;
-    s = rep->file->PrepareIOOptions(ro, opts);
+    IODebugContext dbg;
+    s = rep->file->PrepareIOOptions(ro, opts, &dbg);
     if (s.ok()) {
       s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
                                     static_cast<size_t>(prefetch_len));
@@ -610,7 +661,7 @@ Status PartitionedFilterBlockReader::CacheDependencies(
     // filter blocks
     s = table()->MaybeReadBlockAndLoadToCache(
         prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro,
-        handle, UncompressionDict::GetEmptyDict(),
+        handle, /* dict */ nullptr,
         /* for_compaction */ false, &block, nullptr /* get_context */,
         &lookup_context, nullptr /* contents */, false,
         /* use_block_cache_for_lookup */ true);
diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h
index 8faed24a92db..96f39dd4f01a 100644
--- a/table/block_based/partitioned_filter_block.h
+++ b/table/block_based/partitioned_filter_block.h
@@ -18,6 +18,7 @@
 #include "table/block_based/filter_block_reader_common.h"
 #include "table/block_based/full_filter_block.h"
 #include "table/block_based/index_builder.h"
+#include "util/atomic.h"
 #include "util/autovector.h"
 #include "util/hash_containers.h"
 
@@ -46,6 +47,8 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
   }
 
   size_t EstimateEntriesAdded() override;
+  size_t CurrentFilterSizeEstimate() override;
+  void OnDataBlockFinalized(uint64_t num_data_blocks) override;
 
   void PrevKeyBeforeFinish(const Slice& prev_key_without_ts) override;
   Status Finish(const BlockHandle& last_partition_block_handle, Slice* filter,
@@ -67,6 +70,11 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
     return Status::OK();
   }
 
+ protected:
+  // Needs to be thread-safe to be invoked from background worker
+  // thread when parallel compression is enabled.
+  void UpdateFilterSizeEstimate(uint64_t num_data_blocks) override;
+
  private:  // fns
   // Whether to cut a filter block before the next key
   bool DecideCutAFilterBlock();
@@ -92,6 +100,11 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
   };
   std::deque<FilterEntry> filters_;  // list of partitioned filters and keys
                                      // used in building the index
+  // Running total of completed filter partition sizes to avoid
+  // iterating over filters_ deque, which can be concurrently modified by
+  // the main thread when parallel compression is enabled.
+  RelaxedAtomic<size_t> completed_partitions_size_{0};
+
   // The desired number of keys per partition
   uint32_t keys_per_partition_;
   // According to the bits builders, how many keys/prefixes added
@@ -107,6 +120,12 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
   // For Add without prev key
   std::string prev_key_without_ts_;
 
+  // Cached filter size estimate for hot path performance - updated only when
+  // data blocks are written for meaningful estimate updates.
+  // Must be atomic since UpdateFilterSizeEstimate() can be called from
+  // background worker threads when parallel compression is enabled.
+  RelaxedAtomic<size_t> estimated_filter_size_{0};
+
 #ifndef NDEBUG
   // For verifying accurate previous keys are provided by the caller, so that
   // release code can be fast
diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc
index 80cb131a990b..02869a879c61 100644
--- a/table/block_based/partitioned_filter_block_test.cc
+++ b/table/block_based/partitioned_filter_block_test.cc
@@ -27,7 +27,7 @@ class MockedBlockBasedTable : public BlockBasedTable {
   MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib)
       : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) {
     // Initialize what Open normally does as much as necessary for the test
-    rep->index_key_includes_seq = pib->seperator_is_key_plus_seq();
+    rep->index_key_includes_seq = pib->separator_is_key_plus_seq();
     rep->index_value_is_full = !pib->get_use_value_delta_encoding();
   }
 };
@@ -315,7 +315,8 @@ class PartitionedFilterBlockTest
         std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep());
     BlockHandle dont_care_block_handle(1, 1);
     std::string scratch;
-    builder->AddIndexEntry(key, nullptr, dont_care_block_handle, &scratch);
+    builder->AddIndexEntry(key, nullptr, dont_care_block_handle, &scratch,
+                           false);
   }
 
   void CutABlock(PartitionedIndexBuilder* builder, const std::string& user_key,
@@ -327,7 +328,8 @@ class PartitionedFilterBlockTest
     BlockHandle dont_care_block_handle(1, 1);
     Slice slice = Slice(next_key.data(), next_key.size());
     std::string scratch;
-    builder->AddIndexEntry(key, &slice, dont_care_block_handle, &scratch);
+    builder->AddIndexEntry(key, &slice, dont_care_block_handle, &scratch,
+                           false);
   }
 
   int CountNumOfIndexPartitions(PartitionedIndexBuilder* builder) {
@@ -348,7 +350,7 @@ INSTANTIATE_TEST_CASE_P(
     FormatVersions, PartitionedFilterBlockTest,
     testing::Combine(
         testing::ValuesIn(std::set<uint32_t>{
-            2, 3, 4, 5, test::kDefaultFormatVersion, kLatestFormatVersion}),
+            2, 3, 4, 5, test::kDefaultFormatVersion, kLatestBbtFormatVersion}),
         testing::ValuesIn(test::GetUDTTestModes()), testing::Bool()));
 
 TEST_P(PartitionedFilterBlockTest, EmptyBuilder) {
diff --git a/table/block_based/partitioned_index_iterator.h b/table/block_based/partitioned_index_iterator.h
index 6412fe2399b5..31ccded9a025 100644
--- a/table/block_based/partitioned_index_iterator.h
+++ b/table/block_based/partitioned_index_iterator.h
@@ -81,8 +81,6 @@ class PartitionedIndexIterator : public InternalIteratorBase<IndexValue> {
     }
   }
   inline IterBoundCheck UpperBoundCheckResult() override {
-    // Shouldn't be called.
-    assert(false);
     return IterBoundCheck::kUnknown;
   }
   void SetPinnedItersMgr(PinnedIteratorsManager*) override {
diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc
index 04c73ba0bbec..da3f3658da59 100644
--- a/table/block_based/partitioned_index_reader.cc
+++ b/table/block_based/partitioned_index_reader.cc
@@ -190,7 +190,7 @@ Status PartitionIndexReader::CacheDependencies(
     // filter blocks
     Status s = table()->MaybeReadBlockAndLoadToCache(
         prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro,
-        handle, UncompressionDict::GetEmptyDict(),
+        handle, rep->decompressor.get(),
         /*for_compaction=*/false, &block.As<Block_kIndex>(),
         /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr,
         /*async_read=*/false, /*use_block_cache_for_lookup=*/true);
diff --git a/table/block_based/reader_common.cc b/table/block_based/reader_common.cc
index 8f8c82ff43ac..fbafe414dd9a 100644
--- a/table/block_based/reader_common.cc
+++ b/table/block_based/reader_common.cc
@@ -25,7 +25,7 @@ void ForceReleaseCachedEntry(void* arg, void* h) {
 // WART: this is specific to block-based table
 Status VerifyBlockChecksum(const Footer& footer, const char* data,
                            size_t block_size, const std::string& file_name,
-                           uint64_t offset) {
+                           uint64_t offset, BlockType block_type) {
   PERF_TIMER_GUARD(block_checksum_time);
 
   assert(footer.GetBlockTrailerSize() == 5);
@@ -58,7 +58,8 @@ Status VerifyBlockChecksum(const Footer& footer, const char* data,
         std::string(modifier ? "(context removed)" : "") + " = " +
         std::to_string(stored) + ", computed = " + std::to_string(computed) +
         ", type = " + std::to_string(type) + "  in " + file_name + " offset " +
-        std::to_string(offset) + " size " + std::to_string(block_size));
+        std::to_string(offset) + " size " + std::to_string(block_size) +
+        ", block_type = " + BlockTypeToString(block_type));
   }
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/reader_common.h b/table/block_based/reader_common.h
index 89518fd8c2a4..6d16f4069413 100644
--- a/table/block_based/reader_common.h
+++ b/table/block_based/reader_common.h
@@ -10,6 +10,7 @@
 
 #include "rocksdb/advanced_cache.h"
 #include "rocksdb/table.h"
+#include "table/block_based/block_type.h"
 
 namespace ROCKSDB_NAMESPACE {
 class Footer;
@@ -27,10 +28,12 @@ inline MemoryAllocator* GetMemoryAllocator(
 // Assumes block has a trailer past `data + block_size` as in format.h.
 // `file_name` provided for generating diagnostic message in returned status.
 // `offset` might be required for proper verification (also used for message).
+// `block_type` is included in the error message to provide context about
+// which type of block failed checksum verification.
 //
 // Returns Status::OK() on checksum match, or Status::Corruption() on checksum
 // mismatch.
 Status VerifyBlockChecksum(const Footer& footer, const char* data,
                            size_t block_size, const std::string& file_name,
-                           uint64_t offset);
+                           uint64_t offset, BlockType block_type);
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc
index b7c9e02f01ba..2a6b25aaa5ee 100644
--- a/table/block_based/uncompression_dict_reader.cc
+++ b/table/block_based/uncompression_dict_reader.cc
@@ -23,7 +23,7 @@ Status UncompressionDictReader::Create(
   assert(!pin || prefetch);
   assert(uncompression_dict_reader);
 
-  CachableEntry<UncompressionDict> uncompression_dict;
+  CachableEntry<DecompressorDict> uncompression_dict;
   if (prefetch || !use_cache) {
     const Status s = ReadUncompressionDictionary(
         table, prefetch_buffer, ro, use_cache, nullptr /* get_context */,
@@ -47,7 +47,7 @@ Status UncompressionDictReader::ReadUncompressionDictionary(
     const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
     const ReadOptions& read_options, bool use_cache, GetContext* get_context,
     BlockCacheLookupContext* lookup_context,
-    CachableEntry<UncompressionDict>* uncompression_dict) {
+    CachableEntry<DecompressorDict>* uncompression_dict) {
   // TODO: add perf counter for compression dictionary read time
 
   assert(table);
@@ -60,8 +60,7 @@ Status UncompressionDictReader::ReadUncompressionDictionary(
 
   const Status s = table->RetrieveBlock(
       prefetch_buffer, read_options, rep->compression_dict_handle,
-      UncompressionDict::GetEmptyDict(), uncompression_dict, get_context,
-      lookup_context,
+      /* decomp */ nullptr, uncompression_dict, get_context, lookup_context,
       /* for_compaction */ false, use_cache,
       /* async_read */ false, /* use_block_cache_for_lookup */ true);
 
@@ -79,7 +78,7 @@ Status UncompressionDictReader::ReadUncompressionDictionary(
 Status UncompressionDictReader::GetOrReadUncompressionDictionary(
     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
     GetContext* get_context, BlockCacheLookupContext* lookup_context,
-    CachableEntry<UncompressionDict>* uncompression_dict) const {
+    CachableEntry<DecompressorDict>* uncompression_dict) const {
   assert(uncompression_dict);
 
   if (!uncompression_dict_.IsEmpty()) {
diff --git a/table/block_based/uncompression_dict_reader.h b/table/block_based/uncompression_dict_reader.h
index b5d64dbf1458..d0579a66055c 100644
--- a/table/block_based/uncompression_dict_reader.h
+++ b/table/block_based/uncompression_dict_reader.h
@@ -18,7 +18,6 @@ struct BlockCacheLookupContext;
 class FilePrefetchBuffer;
 class GetContext;
 struct ReadOptions;
-struct UncompressionDict;
 
 // Provides access to the uncompression dictionary regardless of whether
 // it is owned by the reader or stored in the cache, or whether it is pinned
@@ -34,13 +33,13 @@ class UncompressionDictReader {
   Status GetOrReadUncompressionDictionary(
       FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
       GetContext* get_context, BlockCacheLookupContext* lookup_context,
-      CachableEntry<UncompressionDict>* uncompression_dict) const;
+      CachableEntry<DecompressorDict>* uncompression_dict) const;
 
   size_t ApproximateMemoryUsage() const;
 
  private:
   UncompressionDictReader(const BlockBasedTable* t,
-                          CachableEntry<UncompressionDict>&& uncompression_dict)
+                          CachableEntry<DecompressorDict>&& uncompression_dict)
       : table_(t), uncompression_dict_(std::move(uncompression_dict)) {
     assert(table_);
   }
@@ -51,10 +50,10 @@ class UncompressionDictReader {
       const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
       const ReadOptions& read_options, bool use_cache, GetContext* get_context,
       BlockCacheLookupContext* lookup_context,
-      CachableEntry<UncompressionDict>* uncompression_dict);
+      CachableEntry<DecompressorDict>* uncompression_dict);
 
   const BlockBasedTable* table_;
-  CachableEntry<UncompressionDict> uncompression_dict_;
+  CachableEntry<DecompressorDict> uncompression_dict_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
new file mode 100644
index 000000000000..b65ba147e2fc
--- /dev/null
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -0,0 +1,326 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/user_defined_index.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/index_builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// UserDefinedIndexWrapper wraps around the existing index types in block based
+// table, and supports plugging in an additional user defined index. The wrapper
+// class forwards calls to both the wrapped internal index, and a user defined
+// index builder.
+class UserDefinedIndexBuilderWrapper : public IndexBuilder {
+ public:
+  UserDefinedIndexBuilderWrapper(
+      const std::string& name,
+      std::unique_ptr<IndexBuilder> internal_index_builder,
+      std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder,
+      const InternalKeyComparator* comparator, size_t ts_sz,
+      bool persist_user_defined_timestamps)
+      : IndexBuilder(comparator, ts_sz, persist_user_defined_timestamps),
+        name_(name),
+        internal_index_builder_(std::move(internal_index_builder)),
+        user_defined_index_builder_(std::move(user_defined_index_builder)) {}
+
+  ~UserDefinedIndexBuilderWrapper() override = default;
+
+  Slice AddIndexEntry(const Slice& last_key_in_current_block,
+                      const Slice* first_key_in_next_block,
+                      const BlockHandle& block_handle,
+                      std::string* separator_scratch,
+                      bool skip_delta_encoding) override {
+    UserDefinedIndexBuilder::BlockHandle handle;
+    handle.offset = block_handle.offset();
+    handle.size = block_handle.size();
+    // Forward the call to both index builders
+    ParsedInternalKey pkey_last;
+    ParsedInternalKey pkey_first;
+    // There's no way to return an error here, so we remember the statsu and
+    // return it in Finish()
+    if (status_.ok()) {
+      status_ = ParseInternalKey(last_key_in_current_block, &pkey_last,
+                                 /*lof_err_key*/ false);
+    }
+    if (status_.ok() && first_key_in_next_block) {
+      status_ = ParseInternalKey(*first_key_in_next_block, &pkey_first,
+                                 /*lof_err_key*/ false);
+    }
+    if (status_.ok()) {
+      user_defined_index_builder_->AddIndexEntry(
+          pkey_last.user_key,
+          first_key_in_next_block ? &pkey_first.user_key : nullptr, handle,
+          separator_scratch);
+    }
+    return internal_index_builder_->AddIndexEntry(
+        last_key_in_current_block, first_key_in_next_block, block_handle,
+        separator_scratch, skip_delta_encoding);
+  }
+
+  // Not supported with parallel compression
+  std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() override {
+    return nullptr;
+  }
+  void PrepareIndexEntry(const Slice& last_key_in_current_block,
+                         const Slice* first_key_in_next_block,
+                         PreparedIndexEntry* out) override {
+    (void)last_key_in_current_block;
+    (void)first_key_in_next_block;
+    (void)out;
+    assert(false);
+  }
+  void FinishIndexEntry(const BlockHandle& block_handle,
+                        PreparedIndexEntry* entry,
+                        bool skip_delta_encoding) override {
+    (void)block_handle;
+    (void)entry;
+    (void)skip_delta_encoding;
+    assert(false);
+  }
+
+  void OnKeyAdded(const Slice& key,
+                  const std::optional<Slice>& value) override {
+    ParsedInternalKey pkey;
+    if (status_.ok()) {
+      if (!value.has_value()) {
+        status_ = Status::InvalidArgument(
+            "user_defined_index_factory not supported with parallel "
+            "compression");
+      } else {
+        status_ = ParseInternalKey(key, &pkey, /*lof_err_key*/ false);
+        if (status_.ok() && pkey.type != ValueType::kTypeValue) {
+          status_ = Status::InvalidArgument(
+              "user_defined_index_factory only supported with Puts");
+        }
+      }
+    }
+    if (!status_.ok()) {
+      return;
+    }
+
+    // Forward the call to both index builders
+    internal_index_builder_->OnKeyAdded(key, value);
+
+    // Pass the user key to the UDI. We don't expect multiple entries with
+    // different sequence numbers for the same key in the file. RocksDB may
+    // enforce it in the future by allowing UDIs only for read only
+    // bulkloaded use cases, and only allow ingestion of files with
+    // sequence number 0.
+    user_defined_index_builder_->OnKeyAdded(
+        pkey.user_key, UserDefinedIndexBuilder::ValueType::kValue,
+        value.value());
+  }
+
+  Status Finish(IndexBlocks* index_blocks,
+                const BlockHandle& last_partition_block_handle) override {
+    if (!status_.ok() && !status_.IsIncomplete()) {
+      return status_;
+    }
+
+    if (!udi_finished_) {
+      // Finish the user defined index builder
+      Slice user_index_contents;
+      status_ = user_defined_index_builder_->Finish(&user_index_contents);
+      if (!status_.ok()) {
+        return status_;
+      }
+
+      // Add the user defined index to the meta blocks
+      std::string block_name = kUserDefinedIndexPrefix + name_;
+      index_blocks->meta_blocks.insert(
+          {block_name, {BlockType::kUserDefinedIndex, user_index_contents}});
+      udi_finished_ = true;
+    }
+
+    // Finish the internal index builder
+    status_ = internal_index_builder_->Finish(index_blocks,
+                                              last_partition_block_handle);
+    if (!status_.ok()) {
+      return status_;
+    }
+
+    index_size_ = internal_index_builder_->IndexSize();
+    return status_;
+  }
+
+  size_t IndexSize() const override { return index_size_; }
+
+  uint64_t CurrentIndexSizeEstimate() const override { return 0; }
+
+  bool separator_is_key_plus_seq() override {
+    return internal_index_builder_->separator_is_key_plus_seq();
+  }
+
+ private:
+  const std::string name_;
+  std::unique_ptr<IndexBuilder> internal_index_builder_;
+  std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder_;
+  Status status_;
+  bool udi_finished_ = false;
+};
+
+class UserDefinedIndexIteratorWrapper
+    : public InternalIteratorBase<IndexValue> {
+ public:
+  explicit UserDefinedIndexIteratorWrapper(
+      std::unique_ptr<UserDefinedIndexIterator>&& udi_iter)
+      : udi_iter_(std::move(udi_iter)), valid_(false) {}
+
+  bool Valid() const override { return valid_; }
+
+  void SeekToFirst() override {
+    status_ = Status::NotSupported("SeekToFirst not supported");
+  }
+
+  void SeekToLast() override {
+    status_ = Status::NotSupported("SeekToLast not supported");
+  }
+
+  void Seek(const Slice& target) override {
+    ParsedInternalKey pkey;
+    status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false);
+    if (status_.ok()) {
+      status_ = udi_iter_->SeekAndGetResult(pkey.user_key, &result_);
+    }
+    if (status_.ok()) {
+      valid_ = result_.bound_check_result == IterBoundCheck::kInbound;
+      if (valid_) {
+        ikey_.Set(result_.key, 0, ValueType::kTypeValue);
+      }
+    } else {
+      valid_ = false;
+    }
+  }
+
+  void Next() override {
+    status_ = udi_iter_->NextAndGetResult(&result_);
+    if (status_.ok()) {
+      valid_ = result_.bound_check_result == IterBoundCheck::kInbound;
+      if (valid_) {
+        ikey_.Set(result_.key, 0, ValueType::kTypeValue);
+      }
+    } else {
+      valid_ = false;
+    }
+  }
+
+  bool NextAndGetResult(IterateResult* result) override {
+    status_ = udi_iter_->NextAndGetResult(&result_);
+    if (status_.ok()) {
+      valid_ = result_.bound_check_result == IterBoundCheck::kInbound;
+      if (valid_) {
+        ikey_.Set(result_.key, 0, ValueType::kTypeValue);
+      }
+      if (status_.ok()) {
+        *result = result_;
+      }
+    } else {
+      valid_ = false;
+    }
+    return valid_;
+  }
+
+  void SeekForPrev(const Slice& /*target*/) override {
+    status_ = Status::NotSupported("SeekForPrev not supported");
+  }
+
+  void Prev() override { status_ = Status::NotSupported("Prev not supported"); }
+
+  Slice key() const override { return Slice(*ikey_.const_rep()); }
+
+  IndexValue value() const override {
+    auto handle = udi_iter_->value();
+    IndexValue val(BlockHandle(handle.offset, handle.size), Slice());
+    return val;
+  }
+
+  Status status() const override { return status_; }
+
+  void Prepare(const MultiScanArgs* scan_opts) override {
+    if (scan_opts) {
+      udi_iter_->Prepare(scan_opts->GetScanRanges().data(),
+                         scan_opts->GetScanRanges().size());
+    }
+  }
+
+  IterBoundCheck UpperBoundCheckResult() override {
+    return result_.bound_check_result;
+  }
+
+ private:
+  std::unique_ptr<UserDefinedIndexIterator> udi_iter_;
+  IterateResult result_;
+  InternalKey ikey_;
+  Status status_;
+  bool valid_;
+};
+
+class UserDefinedIndexReaderWrapper : public BlockBasedTable::IndexReader {
+ public:
+  UserDefinedIndexReaderWrapper(
+      const std::string& name,
+      std::unique_ptr<BlockBasedTable::IndexReader>&& reader,
+      std::unique_ptr<UserDefinedIndexReader>&& udi_reader)
+      : name_(name),
+        reader_(std::move(reader)),
+        udi_reader_(std::move(udi_reader)) {}
+
+  virtual InternalIteratorBase<IndexValue>* NewIterator(
+      const ReadOptions& read_options, bool disable_prefix_seek,
+      IndexBlockIter* iter, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context) override {
+    if (!read_options.table_index_factory) {
+      return reader_->NewIterator(read_options, disable_prefix_seek, iter,
+                                  get_context, lookup_context);
+    }
+    if (name_ != read_options.table_index_factory->Name()) {
+      return NewErrorInternalIterator<IndexValue>(Status::InvalidArgument(
+          "Bad index name" +
+          std::string(read_options.table_index_factory->Name()) +
+          ". Only supported UDI is " + name_));
+    }
+    std::unique_ptr<UserDefinedIndexIterator> udi_iter =
+        udi_reader_->NewIterator(read_options);
+    if (udi_iter) {
+      InternalIteratorBase<IndexValue>* wrap_iter =
+          new UserDefinedIndexIteratorWrapper(std::move(udi_iter));
+      return wrap_iter;
+    }
+    return NewErrorInternalIterator<IndexValue>(
+        Status::NotFound("COuld not create UDI iterator"));
+  }
+
+  virtual Status CacheDependencies(
+      const ReadOptions& ro, bool pin,
+      FilePrefetchBuffer* tail_prefetch_buffer) override {
+    return reader_->CacheDependencies(ro, pin, tail_prefetch_buffer);
+  }
+
+  size_t ApproximateMemoryUsage() const override {
+    return reader_->ApproximateMemoryUsage();
+  }
+
+  virtual void EraseFromCacheBeforeDestruction(
+      uint32_t uncache_aggressiveness) override {
+    reader_->EraseFromCacheBeforeDestruction(uncache_aggressiveness);
+  }
+
+ private:
+  std::string name_;
+  std::unique_ptr<BlockBasedTable::IndexReader> reader_;
+  std::unique_ptr<UserDefinedIndexReader> udi_reader_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc
index 0637440bdcf9..2f4ee64b19fc 100644
--- a/table/block_fetcher.cc
+++ b/table/block_fetcher.cc
@@ -33,20 +33,20 @@ inline void BlockFetcher::ProcessTrailerIfPresent() {
   if (footer_.GetBlockTrailerSize() > 0) {
     assert(footer_.GetBlockTrailerSize() == BlockBasedTable::kBlockTrailerSize);
     if (read_options_.verify_checksums) {
-      io_status_ = status_to_io_status(
-          VerifyBlockChecksum(footer_, slice_.data(), block_size_,
-                              file_->file_name(), handle_.offset()));
+      io_status_ = status_to_io_status(VerifyBlockChecksum(
+          footer_, slice_.data(), block_size_, file_->file_name(),
+          handle_.offset(), block_type_));
       RecordTick(ioptions_.stats, BLOCK_CHECKSUM_COMPUTE_COUNT);
       if (!io_status_.ok()) {
         assert(io_status_.IsCorruption());
         RecordTick(ioptions_.stats, BLOCK_CHECKSUM_MISMATCH_COUNT);
       }
     }
-    compression_type_ =
+    compression_type() =
         BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_);
   } else {
     // E.g. plain table or cuckoo table
-    compression_type_ = kNoCompression;
+    compression_type() = kNoCompression;
   }
 }
 
@@ -74,7 +74,8 @@ inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() {
 inline bool BlockFetcher::TryGetFromPrefetchBuffer() {
   if (prefetch_buffer_ != nullptr) {
     IOOptions opts;
-    IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
+    IODebugContext dbg;
+    IOStatus io_s = file_->PrepareIOOptions(read_options_, opts, &dbg);
     if (io_s.ok()) {
       bool read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCache(
           opts, file_, handle_.offset(), block_size_with_trailer_, &slice_,
@@ -195,7 +196,7 @@ inline void BlockFetcher::CopyBufferToCompressedBuf() {
 }
 
 // Before - Entering this method means the block is uncompressed or do not need
-// to be uncompressed.
+// to be decompressed.
 //
 // The block can be in one of the following buffers:
 // 1. prefetch buffer if prefetch is enabled and the block is prefetched before
@@ -219,14 +220,14 @@ inline void BlockFetcher::GetBlockContents() {
     if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) {
       CopyBufferToHeapBuf();
     } else if (used_buf_ == compressed_buf_.get()) {
-      if (compression_type_ == kNoCompression &&
+      if (compression_type() == kNoCompression &&
           memory_allocator_ != memory_allocator_compressed_) {
         CopyBufferToHeapBuf();
       } else {
         heap_buf_ = std::move(compressed_buf_);
       }
     } else if (direct_io_buf_.get() != nullptr || use_fs_scratch_) {
-      if (compression_type_ == kNoCompression) {
+      if (compression_type() == kNoCompression) {
         CopyBufferToHeapBuf();
       } else {
         CopyBufferToCompressedBuf();
@@ -241,12 +242,13 @@ inline void BlockFetcher::GetBlockContents() {
 }
 
 // Read a block from the file and verify its checksum. Upon return, io_status_
-// will be updated with the status of the read, and slice_ will be updated
-// with a pointer to the data.
+// will be updated with the status of the read, and slice_ will be
+// updated with a pointer to the data.
 void BlockFetcher::ReadBlock(bool retry) {
   FSReadRequest read_req;
   IOOptions opts;
-  io_status_ = file_->PrepareIOOptions(read_options_, opts);
+  IODebugContext dbg;
+  io_status_ = file_->PrepareIOOptions(read_options_, opts, &dbg);
   opts.verify_and_reconstruct_read = retry;
   read_req.status.PermitUncheckedError();
   // Actual file read
@@ -256,8 +258,9 @@ void BlockFetcher::ReadBlock(bool retry) {
       PERF_CPU_TIMER_GUARD(
           block_read_cpu_time,
           ioptions_.env ? ioptions_.env->GetSystemClock().get() : nullptr);
-      io_status_ = file_->Read(opts, handle_.offset(), block_size_with_trailer_,
-                               &slice_, /*scratch=*/nullptr, &direct_io_buf_);
+      io_status_ =
+          file_->Read(opts, handle_.offset(), block_size_with_trailer_, &slice_,
+                      /*scratch=*/nullptr, &direct_io_buf_, &dbg);
       PERF_COUNTER_ADD(block_read_count, 1);
       used_buf_ = const_cast<char*>(slice_.data());
     } else if (use_fs_scratch_) {
@@ -269,7 +272,7 @@ void BlockFetcher::ReadBlock(bool retry) {
       read_req.len = block_size_with_trailer_;
       read_req.scratch = nullptr;
       io_status_ = file_->MultiRead(opts, &read_req, /*num_reqs=*/1,
-                                    /*AlignedBuf* =*/nullptr);
+                                    /*AlignedBuf* =*/nullptr, &dbg);
       PERF_COUNTER_ADD(block_read_count, 1);
 
       slice_ = Slice(read_req.result.data(), read_req.result.size());
@@ -283,9 +286,10 @@ void BlockFetcher::ReadBlock(bool retry) {
           block_read_cpu_time,
           ioptions_.env ? ioptions_.env->GetSystemClock().get() : nullptr);
 
-      io_status_ = file_->Read(
-          opts, handle_.offset(), /*size*/ block_size_with_trailer_,
-          /*result*/ &slice_, /*scratch*/ used_buf_, /*aligned_buf=*/nullptr);
+      io_status_ =
+          file_->Read(opts, handle_.offset(), /*size*/ block_size_with_trailer_,
+                      /*result*/ &slice_, /*scratch*/ used_buf_,
+                      /*aligned_buf=*/nullptr, &dbg);
       PERF_COUNTER_ADD(block_read_count, 1);
 #ifndef NDEBUG
       if (slice_.data() == &stack_buf_[0]) {
@@ -320,6 +324,7 @@ void BlockFetcher::ReadBlock(bool retry) {
   }
 
   PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_);
+  IGNORE_STATUS_IF_ERROR(io_status_);
   if (io_status_.ok()) {
     if (use_fs_scratch_ && !read_req.status.ok()) {
       io_status_ = read_req.status;
@@ -356,7 +361,7 @@ void BlockFetcher::ReadBlock(bool retry) {
 
 IOStatus BlockFetcher::ReadBlockContents() {
   if (TryGetUncompressBlockFromPersistentCache()) {
-    compression_type_ = kNoCompression;
+    compression_type() = kNoCompression;
 #ifndef NDEBUG
     contents_->has_trailer = footer_.GetBlockTrailerSize() > 0;
 #endif  // NDEBUG
@@ -384,19 +389,16 @@ IOStatus BlockFetcher::ReadBlockContents() {
     }
   }
 
-  if (do_uncompress_ && compression_type_ != kNoCompression) {
+  if (do_uncompress_ && compression_type() != kNoCompression) {
     PERF_TIMER_GUARD(block_decompress_time);
-    // compressed page, uncompress, update cache
-    UncompressionContext context(compression_type_);
-    UncompressionInfo info(context, uncompression_dict_, compression_type_);
-    io_status_ = status_to_io_status(UncompressSerializedBlock(
-        info, slice_.data(), block_size_, contents_, footer_.format_version(),
-        ioptions_, memory_allocator_));
+    // Process the compressed block without trailer
+    slice_.size_ = block_size_;
+    decomp_args_.compressed_data = slice_;
+    io_status_ = status_to_io_status(DecompressSerializedBlock(
+        decomp_args_, *decompressor_, contents_, ioptions_, memory_allocator_));
 #ifndef NDEBUG
     num_heap_buf_memcpy_++;
 #endif
-    // Save the compressed block without trailer
-    slice_ = Slice(slice_.data(), block_size_);
   } else {
     GetBlockContents();
     slice_ = Slice();
@@ -409,7 +411,7 @@ IOStatus BlockFetcher::ReadBlockContents() {
 
 IOStatus BlockFetcher::ReadAsyncBlockContents() {
   if (TryGetUncompressBlockFromPersistentCache()) {
-    compression_type_ = kNoCompression;
+    compression_type() = kNoCompression;
 #ifndef NDEBUG
     contents_->has_trailer = footer_.GetBlockTrailerSize() > 0;
 #endif  // NDEBUG
@@ -418,7 +420,8 @@ IOStatus BlockFetcher::ReadAsyncBlockContents() {
     assert(prefetch_buffer_ != nullptr);
     if (!for_compaction_) {
       IOOptions opts;
-      IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
+      IODebugContext dbg;
+      IOStatus io_s = file_->PrepareIOOptions(read_options_, opts, &dbg);
       if (!io_s.ok()) {
         return io_s;
       }
@@ -441,15 +444,14 @@ IOStatus BlockFetcher::ReadAsyncBlockContents() {
         }
         used_buf_ = const_cast<char*>(slice_.data());
 
-        if (do_uncompress_ && compression_type_ != kNoCompression) {
+        if (do_uncompress_ && compression_type() != kNoCompression) {
           PERF_TIMER_GUARD(block_decompress_time);
-          // compressed page, uncompress, update cache
-          UncompressionContext context(compression_type_);
-          UncompressionInfo info(context, uncompression_dict_,
-                                 compression_type_);
-          io_status_ = status_to_io_status(UncompressSerializedBlock(
-              info, slice_.data(), block_size_, contents_,
-              footer_.format_version(), ioptions_, memory_allocator_));
+          // Process the compressed block without trailer
+          slice_.size_ = block_size_;
+          decomp_args_.compressed_data = slice_;
+          io_status_ = status_to_io_status(
+              DecompressSerializedBlock(decomp_args_, *decompressor_, contents_,
+                                        ioptions_, memory_allocator_));
 #ifndef NDEBUG
           num_heap_buf_memcpy_++;
 #endif
diff --git a/table/block_fetcher.h b/table/block_fetcher.h
index 9441e0a73cae..76e59369f093 100644
--- a/table/block_fetcher.h
+++ b/table/block_fetcher.h
@@ -14,6 +14,7 @@
 #include "table/block_based/block_type.h"
 #include "table/format.h"
 #include "table/persistent_cache_options.h"
+#include "util/cast_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -46,7 +47,7 @@ class BlockFetcher {
                BlockContents* contents,
                const ImmutableOptions& ioptions /* ref retained */,
                bool do_uncompress, bool maybe_compressed, BlockType block_type,
-               const UncompressionDict& uncompression_dict /* ref retained */,
+               UnownedPtr<Decompressor> decompressor,
                const PersistentCacheOptions& cache_options /* ref retained */,
                MemoryAllocator* memory_allocator = nullptr,
                MemoryAllocator* memory_allocator_compressed = nullptr,
@@ -63,7 +64,7 @@ class BlockFetcher {
         block_type_(block_type),
         block_size_(static_cast<size_t>(handle_.size())),
         block_size_with_trailer_(block_size_ + footer.GetBlockTrailerSize()),
-        uncompression_dict_(uncompression_dict),
+        decompressor_(decompressor),
         cache_options_(cache_options),
         memory_allocator_(memory_allocator),
         memory_allocator_compressed_(memory_allocator_compressed),
@@ -81,14 +82,17 @@ class BlockFetcher {
   IOStatus ReadBlockContents();
   IOStatus ReadAsyncBlockContents();
 
-  inline CompressionType get_compression_type() const {
-    return compression_type_;
+  inline CompressionType compression_type() const {
+    return decomp_args_.compression_type;
+  }
+  inline CompressionType& compression_type() {
+    return decomp_args_.compression_type;
   }
   inline size_t GetBlockSizeWithTrailer() const {
     return block_size_with_trailer_;
   }
   inline Slice& GetCompressedBlock() {
-    assert(compression_type_ != kNoCompression);
+    assert(compression_type() != kNoCompression);
     return slice_;
   }
 
@@ -121,7 +125,7 @@ class BlockFetcher {
   const BlockType block_type_;
   const size_t block_size_;
   const size_t block_size_with_trailer_;
-  const UncompressionDict& uncompression_dict_;
+  UnownedPtr<Decompressor> decompressor_;
   const PersistentCacheOptions& cache_options_;
   MemoryAllocator* memory_allocator_;
   MemoryAllocator* memory_allocator_compressed_;
@@ -133,11 +137,11 @@ class BlockFetcher {
   CacheAllocationPtr compressed_buf_;
   char stack_buf_[kDefaultStackBufferSize];
   bool got_from_prefetch_buffer_ = false;
-  CompressionType compression_type_;
   bool for_compaction_ = false;
   bool use_fs_scratch_ = false;
   bool retry_corrupt_read_ = false;
   FSAllocationPtr fs_buf_;
+  Decompressor::Args decomp_args_;
 
   // return true if found
   bool TryGetUncompressBlockFromPersistentCache();
diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc
index 17310edec6ae..e3d5dff735fd 100644
--- a/table/block_fetcher_test.cc
+++ b/table/block_fetcher_test.cc
@@ -319,10 +319,11 @@ class BlockFetcherTest : public testing::Test {
     PersistentCacheOptions persistent_cache_options;
     Footer footer;
     ReadFooter(file, &footer);
+    auto mgr = GetBuiltinV2CompressionManager();
     std::unique_ptr<BlockFetcher> fetcher(new BlockFetcher(
         file, nullptr /* prefetch_buffer */, footer, roptions, block, contents,
         ioptions, do_uncompress, compressed, block_type,
-        UncompressionDict::GetEmptyDict(), persistent_cache_options,
+        mgr->GetDecompressor().get(), persistent_cache_options,
         heap_buf_allocator, compressed_buf_allocator));
 
     ASSERT_OK(fetcher->ReadBlockContents());
@@ -335,7 +336,7 @@ class BlockFetcherTest : public testing::Test {
     if (do_uncompress) {
       *compression_type = kNoCompression;
     } else {
-      *compression_type = fetcher->get_compression_type();
+      *compression_type = fetcher->compression_type();
     }
   }
 
diff --git a/table/cleanable_test.cc b/table/cleanable_test.cc
index b58eb7dc61e2..c53571bf0077 100644
--- a/table/cleanable_test.cc
+++ b/table/cleanable_test.cc
@@ -31,7 +31,9 @@ void Multiplier(void* arg1, void* arg2) {
 TEST_F(CleanableTest, Register) {
   int n2 = 2, n3 = 3;
   int res = 1;
-  { Cleanable c1; }
+  {
+    Cleanable c1;
+  }
   // ~Cleanable
   ASSERT_EQ(1, res);
 
diff --git a/table/external_table.cc b/table/external_table.cc
new file mode 100644
index 000000000000..5fc20f406929
--- /dev/null
+++ b/table/external_table.cc
@@ -0,0 +1,487 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/external_table.h"
+
+#include "logging/logging.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class ExternalTableIteratorAdapter : public InternalIterator {
+ public:
+  explicit ExternalTableIteratorAdapter(ExternalTableIterator* iterator)
+      : iterator_(iterator), valid_(false) {}
+
+  // No copying allowed
+  ExternalTableIteratorAdapter(const ExternalTableIteratorAdapter&) = delete;
+  ExternalTableIteratorAdapter& operator=(const ExternalTableIteratorAdapter&) =
+      delete;
+
+  ~ExternalTableIteratorAdapter() override {}
+
+  bool Valid() const override { return valid_; }
+
+  void SeekToFirst() override {
+    status_ = Status::OK();
+    if (iterator_) {
+      iterator_->SeekToFirst();
+      UpdateKey(OptSlice());
+    }
+  }
+
+  void SeekToLast() override {
+    status_ = Status::OK();
+    if (iterator_) {
+      iterator_->SeekToLast();
+      UpdateKey(OptSlice());
+    }
+  }
+
+  void Seek(const Slice& target) override {
+    status_ = Status::OK();
+    if (iterator_) {
+      ParsedInternalKey pkey;
+      status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false);
+      if (status_.ok()) {
+        iterator_->Seek(pkey.user_key);
+        UpdateKey(OptSlice());
+      }
+    }
+  }
+
+  void SeekForPrev(const Slice& target) override {
+    status_ = Status::OK();
+    if (iterator_) {
+      ParsedInternalKey pkey;
+      status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false);
+      if (status_.ok()) {
+        iterator_->SeekForPrev(pkey.user_key);
+        UpdateKey(OptSlice());
+      }
+    }
+  }
+
+  void Next() override {
+    if (iterator_) {
+      iterator_->Next();
+      UpdateKey(OptSlice());
+    }
+  }
+
+  bool NextAndGetResult(IterateResult* result) override {
+    if (iterator_) {
+      valid_ = iterator_->NextAndGetResult(&result_);
+      result->value_prepared = result_.value_prepared;
+      result->bound_check_result = result_.bound_check_result;
+      if (valid_) {
+        UpdateKey(result_.key);
+        result->key = key();
+      }
+    } else {
+      valid_ = false;
+    }
+    return valid_;
+  }
+
+  bool PrepareValue() override {
+    if (iterator_ && !result_.value_prepared) {
+      valid_ = iterator_->PrepareValue();
+      result_.value_prepared = true;
+    }
+    return valid_;
+  }
+
+  IterBoundCheck UpperBoundCheckResult() override {
+    if (iterator_) {
+      result_.bound_check_result = iterator_->UpperBoundCheckResult();
+    }
+    return result_.bound_check_result;
+  }
+
+  void Prev() override {
+    if (iterator_) {
+      iterator_->Prev();
+      UpdateKey(OptSlice());
+    }
+  }
+
+  Slice key() const override {
+    if (iterator_) {
+      return Slice(*key_.const_rep());
+    }
+    return Slice();
+  }
+
+  Slice value() const override {
+    if (iterator_) {
+      return iterator_->value();
+    }
+    return Slice();
+  }
+
+  Status status() const override { return status_; }
+
+  void Prepare(const MultiScanArgs* scan_opts) override {
+    if (iterator_ && scan_opts) {
+      iterator_->Prepare(scan_opts->GetScanRanges().data(), scan_opts->size());
+    } else if (iterator_) {
+      iterator_->Prepare(nullptr, 0);
+    }
+  }
+
+ private:
+  std::unique_ptr<ExternalTableIterator> iterator_;
+  InternalKey key_;
+  bool valid_;
+  Status status_;
+  IterateResult result_;
+
+  void UpdateKey(OptSlice res) {
+    if (iterator_) {
+      valid_ = iterator_->Valid();
+      status_ = iterator_->status();
+      if (valid_ && status_.ok()) {
+        key_.Set(res.has_value() ? res.value() : iterator_->key(), 0,
+                 ValueType::kTypeValue);
+      }
+    }
+  }
+};
+
+class ExternalTableReaderAdapter : public TableReader {
+ public:
+  explicit ExternalTableReaderAdapter(
+      const ImmutableOptions& ioptions,
+      std::unique_ptr<ExternalTableReader>&& reader)
+      : ioptions_(ioptions), reader_(std::move(reader)) {}
+
+  ~ExternalTableReaderAdapter() override {}
+
+  // No copying allowed
+  ExternalTableReaderAdapter(const ExternalTableReaderAdapter&) = delete;
+  ExternalTableReaderAdapter& operator=(const ExternalTableReaderAdapter&) =
+      delete;
+
+  InternalIterator* NewIterator(
+      const ReadOptions& read_options, const SliceTransform* prefix_extractor,
+      Arena* arena, bool /* skip_filters */, TableReaderCaller /* caller */,
+      size_t /* compaction_readahead_size */ = 0,
+      bool /* allow_unprepared_value */ = false) override {
+    auto iterator = reader_->NewIterator(read_options, prefix_extractor);
+    if (arena == nullptr) {
+      return new ExternalTableIteratorAdapter(iterator);
+    } else {
+      auto* mem = arena->AllocateAligned(sizeof(ExternalTableIteratorAdapter));
+      return new (mem) ExternalTableIteratorAdapter(iterator);
+    }
+  }
+
+  uint64_t ApproximateOffsetOf(const ReadOptions&, const Slice&,
+                               TableReaderCaller) override {
+    return 0;
+  }
+
+  uint64_t ApproximateSize(const ReadOptions&, const Slice&, const Slice&,
+                           TableReaderCaller) override {
+    return 0;
+  }
+
+  void SetupForCompaction() override {}
+
+  std::shared_ptr<const TableProperties> GetTableProperties() const override {
+    std::shared_ptr<TableProperties> props;
+    std::unique_ptr<char[]> property_block;
+    uint64_t property_block_size = 0;
+    uint64_t property_block_offset = 0;
+    Status s;
+    // Get the raw properties block from the external table reader. We don't
+    // support writing the global sequence number, but we still get and return
+    // the correct global seqno offset in the file to prevent accidental
+    // corruption.
+    s = reader_->GetPropertiesBlock(&property_block, &property_block_size,
+                                    &property_block_offset);
+    if (s.ok()) {
+      std::unique_ptr<TableProperties> table_properties =
+          std::make_unique<TableProperties>();
+      BlockContents block_contents(std::move(property_block),
+                                   property_block_size);
+      Block block(std::move(block_contents));
+      s = ParsePropertiesBlock(ioptions_, property_block_offset, block,
+                               table_properties);
+      if (s.ok()) {
+        props.reset(table_properties.release());
+      }
+    } else {
+      // Fallback to getting a minimal table properties structure from the
+      // external table reader
+      props = std::make_shared<TableProperties>(*reader_->GetTableProperties());
+      props->key_largest_seqno = 0;
+      props->key_smallest_seqno = 0;
+    }
+    return props;
+  }
+
+  size_t ApproximateMemoryUsage() const override { return 0; }
+
+  Status Get(const ReadOptions&, const Slice&, GetContext*,
+             const SliceTransform*, bool = false) override {
+    return Status::NotSupported(
+        "Get() not supported on external file iterator");
+  }
+
+  Status VerifyChecksum(const ReadOptions& /*ro*/, TableReaderCaller /*caller*/,
+                        bool /*meta_blocks_only*/ = false) override {
+    return Status::OK();
+  }
+
+ private:
+  const ImmutableOptions& ioptions_;
+  std::unique_ptr<ExternalTableReader> reader_;
+};
+
+class ExternalTableBuilderAdapter : public TableBuilder {
+ public:
+  explicit ExternalTableBuilderAdapter(
+      const TableBuilderOptions& topts,
+      std::unique_ptr<ExternalTableBuilder>&& builder,
+      std::unique_ptr<FSWritableFile>&& file)
+      : builder_(std::move(builder)),
+        file_(std::move(file)),
+        ioptions_(topts.ioptions) {
+    properties_.num_data_blocks = 1;
+    properties_.index_size = 0;
+    properties_.filter_size = 0;
+    properties_.format_version = 0;
+    properties_.key_largest_seqno = 0;
+    properties_.key_smallest_seqno = 0;
+    properties_.column_family_id = topts.column_family_id;
+    properties_.column_family_name = topts.column_family_name;
+    properties_.db_id = topts.db_id;
+    properties_.db_session_id = topts.db_session_id;
+    properties_.db_host_id = topts.ioptions.db_host_id;
+    if (!ReifyDbHostIdProperty(topts.ioptions.env, &properties_.db_host_id)
+             .ok()) {
+      ROCKS_LOG_INFO(topts.ioptions.logger,
+                     "db_host_id property will not be set");
+    }
+    properties_.orig_file_number = topts.cur_file_num;
+    properties_.comparator_name = topts.ioptions.user_comparator != nullptr
+                                      ? topts.ioptions.user_comparator->Name()
+                                      : "nullptr";
+    properties_.prefix_extractor_name =
+        topts.moptions.prefix_extractor != nullptr
+            ? topts.moptions.prefix_extractor->AsString()
+            : "nullptr";
+
+    for (auto& factory : *topts.internal_tbl_prop_coll_factories) {
+      assert(factory);
+      std::unique_ptr<InternalTblPropColl> collector{
+          factory->CreateInternalTblPropColl(topts.column_family_id,
+                                             topts.level_at_creation,
+                                             topts.ioptions.num_levels)};
+      if (collector) {
+        table_properties_collectors_.emplace_back(std::move(collector));
+      }
+    }
+  }
+
+  void Add(const Slice& key, const Slice& value) override {
+    ParsedInternalKey pkey;
+    status_ = ParseInternalKey(key, &pkey, /*log_err_key=*/false);
+    if (status_.ok()) {
+      if (pkey.type != ValueType::kTypeValue) {
+        status_ = Status::NotSupported(
+            "Value type " + std::to_string(pkey.type) + "not supported");
+      } else {
+        builder_->Add(pkey.user_key, value);
+        properties_.num_entries++;
+        properties_.raw_key_size += key.size();
+        properties_.raw_value_size += value.size();
+        NotifyCollectTableCollectorsOnAdd(key, value, /*file_size=*/0,
+                                          table_properties_collectors_,
+                                          ioptions_.logger);
+      }
+    }
+  }
+
+  Status status() const override {
+    if (status_.ok()) {
+      return builder_->status();
+    } else {
+      return status_;
+    }
+  }
+
+  IOStatus io_status() const override { return status_to_io_status(status()); }
+
+  Status Finish() override {
+    // Approximate the data size
+    properties_.data_size =
+        properties_.raw_key_size + properties_.raw_value_size;
+
+    PropertyBlockBuilder property_block_builder;
+    property_block_builder.AddTableProperty(properties_);
+    UserCollectedProperties more_user_collected_properties;
+    NotifyCollectTableCollectorsOnFinish(
+        table_properties_collectors_, ioptions_.logger, &property_block_builder,
+        more_user_collected_properties, properties_.readable_properties);
+    properties_.user_collected_properties.insert(
+        more_user_collected_properties.begin(),
+        more_user_collected_properties.end());
+
+    Slice prop_block = property_block_builder.Finish();
+    Status s = builder_->PutPropertiesBlock(prop_block);
+    if (s.ok() || s.IsNotSupported()) {
+      // If the builder doesn't support writing the properties block,
+      // we still call Finish() and let the external builder handle it.
+      s = builder_->Finish();
+    }
+
+    return s;
+  }
+
+  void Abandon() override { builder_->Abandon(); }
+
+  uint64_t FileSize() const override { return builder_->FileSize(); }
+
+  uint64_t NumEntries() const override { return properties_.num_entries; }
+
+  TableProperties GetTableProperties() const override {
+    return builder_->GetTableProperties();
+  }
+
+  std::string GetFileChecksum() const override {
+    return builder_->GetFileChecksum();
+  }
+
+  const char* GetFileChecksumFuncName() const override {
+    return builder_->GetFileChecksumFuncName();
+  }
+
+ private:
+  Status status_;
+  std::unique_ptr<ExternalTableBuilder> builder_;
+  std::unique_ptr<FSWritableFile> file_;
+  const ImmutableOptions& ioptions_;
+  TableProperties properties_;
+  std::vector<std::unique_ptr<InternalTblPropColl>>
+      table_properties_collectors_;
+};
+
+class ExternalTableFactoryAdapter : public TableFactory {
+ public:
+  explicit ExternalTableFactoryAdapter(
+      std::shared_ptr<ExternalTableFactory> inner)
+      : inner_(std::move(inner)) {}
+
+  const char* Name() const override { return inner_->Name(); }
+
+  using TableFactory::NewTableReader;
+  Status NewTableReader(
+      const ReadOptions& ro, const TableReaderOptions& topts,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t /* file_size */,
+      std::unique_ptr<TableReader>* table_reader,
+      bool /* prefetch_index_and_filter_in_cache */) const override {
+    // SstFileReader specifies largest_seqno as kMaxSequenceNumber to denote
+    // that its unknown
+    if (topts.largest_seqno > 0 && topts.largest_seqno != kMaxSequenceNumber) {
+      return Status::NotSupported(
+          "Ingesting file with sequence number larger than 0");
+    }
+    std::unique_ptr<ExternalTableReader> reader;
+    FileOptions fopts(topts.env_options);
+    ExternalTableOptions ext_topts(topts.prefix_extractor,
+                                   topts.ioptions.user_comparator,
+                                   topts.ioptions.fs, fopts);
+    auto status =
+        inner_->NewTableReader(ro, file->file_name(), ext_topts, &reader);
+    if (!status.ok()) {
+      return status;
+    }
+    table_reader->reset(
+        new ExternalTableReaderAdapter(topts.ioptions, std::move(reader)));
+    file.reset();
+    return Status::OK();
+  }
+
+  using TableFactory::NewTableBuilder;
+  TableBuilder* NewTableBuilder(const TableBuilderOptions& topts,
+                                WritableFileWriter* file) const override {
+    std::unique_ptr<ExternalTableBuilder> builder;
+    ExternalTableBuilderOptions ext_topts(
+        topts.read_options, topts.write_options,
+        topts.moptions.prefix_extractor, topts.ioptions.user_comparator,
+        topts.column_family_name, topts.reason);
+    auto file_wrapper =
+        std::make_unique<ExternalTableWritableFileWrapper>(file);
+    builder.reset(inner_->NewTableBuilder(ext_topts, file->file_name(),
+                                          file_wrapper.get()));
+    if (builder) {
+      return new ExternalTableBuilderAdapter(topts, std::move(builder),
+                                             std::move(file_wrapper));
+    }
+    return nullptr;
+  }
+
+  std::unique_ptr<TableFactory> Clone() const override { return nullptr; }
+
+ private:
+  // An FSWritableFile subclass for wrapping a WritableFileWriter. The
+  // latter is private to RocksDB, so we wrap it here in order to pass it
+  // to the ExternalTableBuilder. This is necessary for WritableFileWriter
+  // to intercept Append so that it can calculate the file checksum.
+  class ExternalTableWritableFileWrapper : public FSWritableFile {
+   public:
+    explicit ExternalTableWritableFileWrapper(WritableFileWriter* writer)
+        : writer_(writer) {}
+
+    using FSWritableFile::Append;
+    IOStatus Append(const Slice& data, const IOOptions& options,
+                    IODebugContext* /*dbg*/) override {
+      return writer_->Append(options, data);
+    }
+
+    IOStatus Close(const IOOptions& options, IODebugContext* /*dbg*/) override {
+      return writer_->Close(options);
+    }
+
+    IOStatus Flush(const IOOptions& options, IODebugContext* /*dbg*/) override {
+      return writer_->Flush(options);
+    }
+
+    IOStatus Sync(const IOOptions& options, IODebugContext* /*dbg*/) override {
+      return writer_->Sync(options, /*use_fsync=*/false);
+    }
+
+    uint64_t GetFileSize(const IOOptions& options,
+                         IODebugContext* dbg) override {
+      return writer_->writable_file()->GetFileSize(options, dbg);
+    }
+
+   private:
+    WritableFileWriter* writer_;
+  };
+
+  std::shared_ptr<ExternalTableFactory> inner_;
+};
+
+}  // namespace
+
+std::unique_ptr<TableFactory> NewExternalTableFactory(
+    std::shared_ptr<ExternalTableFactory> inner_factory) {
+  std::unique_ptr<TableFactory> res;
+  res = std::make_unique<ExternalTableFactoryAdapter>(std::move(inner_factory));
+  return res;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/table/external_table_reader.cc b/table/external_table_reader.cc
deleted file mode 100644
index fdd0de0a0674..000000000000
--- a/table/external_table_reader.cc
+++ /dev/null
@@ -1,220 +0,0 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-
-#include "rocksdb/external_table_reader.h"
-
-#include "rocksdb/table.h"
-#include "table/internal_iterator.h"
-#include "table/table_builder.h"
-#include "table/table_reader.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-namespace {
-
-class ExternalTableIterator : public InternalIterator {
- public:
-  explicit ExternalTableIterator(Iterator* iterator) : iterator_(iterator) {}
-
-  // No copying allowed
-  ExternalTableIterator(const ExternalTableIterator&) = delete;
-  ExternalTableIterator& operator=(const ExternalTableIterator&) = delete;
-
-  ~ExternalTableIterator() override {}
-
-  bool Valid() const override { return iterator_ && iterator_->Valid(); }
-
-  void SeekToFirst() override {
-    status_ = Status::OK();
-    if (iterator_) {
-      iterator_->SeekToFirst();
-      UpdateKey();
-    }
-  }
-
-  void SeekToLast() override {
-    status_ = Status::OK();
-    if (iterator_) {
-      iterator_->SeekToLast();
-      UpdateKey();
-    }
-  }
-
-  void Seek(const Slice& target) override {
-    status_ = Status::OK();
-    if (iterator_) {
-      ParsedInternalKey pkey;
-      status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false);
-      if (status_.ok()) {
-        iterator_->Seek(pkey.user_key);
-        UpdateKey();
-      }
-    }
-  }
-
-  void SeekForPrev(const Slice& target) override {
-    status_ = Status::OK();
-    if (iterator_) {
-      ParsedInternalKey pkey;
-      status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false);
-      if (status_.ok()) {
-        iterator_->SeekForPrev(pkey.user_key);
-        UpdateKey();
-      }
-    }
-  }
-
-  void Next() override {
-    if (iterator_) {
-      iterator_->Next();
-      UpdateKey();
-    }
-  }
-
-  void Prev() override {
-    if (iterator_) {
-      iterator_->Prev();
-      UpdateKey();
-    }
-  }
-
-  Slice key() const override {
-    if (iterator_) {
-      return Slice(*key_.const_rep());
-    }
-    return Slice();
-  }
-
-  Slice value() const override {
-    if (iterator_) {
-      return iterator_->value();
-    }
-    return Slice();
-  }
-
-  Status status() const override {
-    return !status_.ok() ? status_
-                         : (iterator_ ? iterator_->status() : Status::OK());
-  }
-
- private:
-  std::unique_ptr<Iterator> iterator_;
-  InternalKey key_;
-  Status status_;
-
-  void UpdateKey() { key_.Set(iterator_->key(), 0, ValueType::kTypeValue); }
-};
-
-class ExternalTableReaderAdapter : public TableReader {
- public:
-  explicit ExternalTableReaderAdapter(
-      std::unique_ptr<ExternalTableReader> reader)
-      : reader_(std::move(reader)) {}
-
-  ~ExternalTableReaderAdapter() override {}
-
-  // No copying allowed
-  ExternalTableReaderAdapter(const ExternalTableReaderAdapter&) = delete;
-  ExternalTableReaderAdapter& operator=(const ExternalTableReaderAdapter&) =
-      delete;
-
-  InternalIterator* NewIterator(
-      const ReadOptions& read_options, const SliceTransform* prefix_extractor,
-      Arena* arena, bool /* skip_filters */, TableReaderCaller /* caller */,
-      size_t /* compaction_readahead_size */ = 0,
-      bool /* allow_unprepared_value */ = false) override {
-    auto iterator = reader_->NewIterator(read_options, prefix_extractor);
-    if (arena == nullptr) {
-      return new ExternalTableIterator(iterator);
-    } else {
-      auto* mem = arena->AllocateAligned(sizeof(ExternalTableIterator));
-      return new (mem) ExternalTableIterator(iterator);
-    }
-  }
-
-  uint64_t ApproximateOffsetOf(const ReadOptions&, const Slice&,
-                               TableReaderCaller) override {
-    return 0;
-  }
-
-  uint64_t ApproximateSize(const ReadOptions&, const Slice&, const Slice&,
-                           TableReaderCaller) override {
-    return 0;
-  }
-
-  void SetupForCompaction() override {}
-
-  std::shared_ptr<const TableProperties> GetTableProperties() const override {
-    std::shared_ptr<TableProperties> props =
-        std::make_shared<TableProperties>(*reader_->GetTableProperties());
-    props->key_largest_seqno = 0;
-    return props;
-  }
-
-  size_t ApproximateMemoryUsage() const override { return 0; }
-
-  Status Get(const ReadOptions&, const Slice&, GetContext*,
-             const SliceTransform*, bool = false) override {
-    return Status::NotSupported(
-        "Get() not supported on external file iterator");
-  }
-
-  virtual Status VerifyChecksum(const ReadOptions& /*ro*/,
-                                TableReaderCaller /*caller*/) override {
-    return Status::OK();
-  }
-
- private:
-  std::unique_ptr<ExternalTableReader> reader_;
-};
-
-class ExternalTableFactoryAdapter : public TableFactory {
- public:
-  explicit ExternalTableFactoryAdapter(
-      std::shared_ptr<ExternalTableFactory> inner)
-      : inner_(std::move(inner)) {}
-
-  const char* Name() const override { return inner_->Name(); }
-
-  using TableFactory::NewTableReader;
-  Status NewTableReader(
-      const ReadOptions& ro, const TableReaderOptions& topts,
-      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t /* file_size */,
-      std::unique_ptr<TableReader>* table_reader,
-      bool /* prefetch_index_and_filter_in_cache */) const override {
-    std::unique_ptr<ExternalTableReader> reader;
-    ExternalTableOptions ext_topts(topts.prefix_extractor,
-                                   topts.ioptions.user_comparator);
-    auto status =
-        inner_->NewTableReader(ro, file->file_name(), ext_topts, &reader);
-    if (!status.ok()) {
-      return status;
-    }
-    table_reader->reset(new ExternalTableReaderAdapter(std::move(reader)));
-    file.reset();
-    return Status::OK();
-  }
-
-  TableBuilder* NewTableBuilder(const TableBuilderOptions&,
-                                WritableFileWriter*) const override {
-    return nullptr;
-  }
-
-  std::unique_ptr<TableFactory> Clone() const override { return nullptr; }
-
- private:
-  std::shared_ptr<ExternalTableFactory> inner_;
-};
-
-}  // namespace
-
-std::shared_ptr<TableFactory> NewExternalTableFactory(
-    std::shared_ptr<ExternalTableFactory> inner_factory) {
-  std::shared_ptr<TableFactory> res;
-  res.reset(new ExternalTableFactoryAdapter(std::move(inner_factory)));
-  return res;
-}
-
-}  // namespace ROCKSDB_NAMESPACE
diff --git a/table/format.cc b/table/format.cc
index 46de42fbe9e2..d0f80009d442 100644
--- a/table/format.cc
+++ b/table/format.cc
@@ -154,23 +154,18 @@ std::string IndexValue::ToString(bool hex, bool have_first_key) const {
 
 namespace {
 inline bool IsLegacyFooterFormat(uint64_t magic_number) {
-  return magic_number == kLegacyBlockBasedTableMagicNumber ||
-         magic_number == kLegacyPlainTableMagicNumber;
+  return magic_number == kLegacyPlainTableMagicNumber;
 }
+// Used when reading format_version=0 footers (plain tables)
 inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
-  if (magic_number == kLegacyBlockBasedTableMagicNumber) {
-    return kBlockBasedTableMagicNumber;
-  }
   if (magic_number == kLegacyPlainTableMagicNumber) {
     return kPlainTableMagicNumber;
   }
   assert(false);
   return magic_number;
 }
+// Used by plain tables to write format_version=0 footers
 inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) {
-  if (magic_number == kBlockBasedTableMagicNumber) {
-    return kLegacyBlockBasedTableMagicNumber;
-  }
   if (magic_number == kPlainTableMagicNumber) {
     return kLegacyPlainTableMagicNumber;
   }
@@ -178,14 +173,18 @@ inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) {
   return magic_number;
 }
 inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) {
-  if (magic_number == kBlockBasedTableMagicNumber ||
-      magic_number == kLegacyBlockBasedTableMagicNumber) {
+  if (magic_number == kBlockBasedTableMagicNumber) {
     return static_cast<uint8_t>(BlockBasedTable::kBlockTrailerSize);
   } else {
     return 0;
   }
 }
 
+// NOTE: format_version 0 is still used by plain tables and format_version 1 by
+// cuckoo table. For block-based tables, format_version < 2 is no longer
+// supported for reading or writing. Legacy magic numbers on block-based tables
+// are used only for good error reporting.
+//
 // Footer format, in three parts:
 // * Part1
 //   -> format_version == 0 (inferred from legacy magic number)
@@ -229,7 +228,8 @@ Status FooterBuilder::Build(uint64_t magic_number, uint32_t format_version,
                             const BlockHandle& index_handle,
                             uint32_t base_context_checksum) {
   assert(magic_number != Footer::kNullTableMagicNumber);
-  assert(IsSupportedFormatVersion(format_version));
+  assert(IsSupportedFormatVersionForWrite(magic_number, format_version) ||
+         TEST_AllowUnsupportedFormatVersion());
 
   char* part2;
   char* part3;
@@ -250,6 +250,7 @@ Status FooterBuilder::Build(uint64_t magic_number, uint32_t format_version,
     EncodeFixed64(cur, magic_number);
     assert(cur + 8 == slice_.data() + slice_.size());
   } else {
+    // format_version == 0 is used by plain tables
     slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength);
     // Legacy SST files use kCRC32c checksum but it's not stored in footer.
     assert(checksum_type == kNoChecksum || checksum_type == kCRC32c);
@@ -336,9 +337,18 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
   const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte;
   uint64_t magic = DecodeFixed64(magic_ptr);
 
-  // We check for legacy formats here and silently upconvert them
+  // Legacy block-based tables (format_version < 2) are no longer supported.
+  // (This constant is only used here and in the corresponding test.)
+  if (magic == 0xdb4775248b80fb57ull) {
+    return Status::NotSupported(
+        "Unsupported legacy magic number for block-based SST format. Load with "
+        "RocksDB >= 4.6.0 and < 11.0.0 and run full compaction to upgrade.");
+  }
+
+  // Check for legacy formats
   bool legacy = IsLegacyFooterFormat(magic);
   if (legacy) {
+    // Legacy plain tables are still supported - upconvert magic
     magic = UpconvertLegacyFooterFormat(magic);
   }
   if (enforce_table_magic_number != 0 && enforce_table_magic_number != magic) {
@@ -354,6 +364,7 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
   uint32_t computed_checksum = 0;
   uint64_t footer_offset = 0;
   if (legacy) {
+    // Legacy format (format_version=0, used by plain tables)
     // The size is already asserted to be at least kMinEncodedLength
     // at the beginning of the function
     input.remove_prefix(input.size() - kVersion0EncodedLength);
@@ -362,9 +373,11 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
   } else {
     part3_ptr = magic_ptr - 4;
     format_version_ = DecodeFixed32(part3_ptr);
-    if (UNLIKELY(!IsSupportedFormatVersion(format_version_))) {
-      return Status::Corruption("Corrupt or unsupported format_version: " +
-                                std::to_string(format_version_));
+    if (UNLIKELY(!IsSupportedFormatVersionForRead(magic, format_version_) &&
+                 !TEST_AllowUnsupportedFormatVersion())) {
+      return Status::Corruption("Corrupt or unsupported format_version " +
+                                std::to_string(format_version_) +
+                                " for magic " + std::to_string(magic));
     }
     // All known format versions >= 1 occupy exactly this many bytes.
     if (UNLIKELY(input.size() < kNewVersionsEncodedLength)) {
@@ -475,15 +488,41 @@ std::string Footer::ToString() const {
   return result;
 }
 
-static Status ReadFooterFromFileInternal(const IOOptions& opts,
-                                         RandomAccessFileReader* file,
-                                         FileSystem& fs,
-                                         FilePrefetchBuffer* prefetch_buffer,
-                                         uint64_t file_size, Footer* footer,
-                                         uint64_t enforce_table_magic_number) {
-  if (file_size < Footer::kMinEncodedLength) {
+bool& TEST_AllowUnsupportedFormatVersion() {
+  static bool allow = false;
+  return allow;
+}
+
+static Status ReadFooterFromFileInternal(
+    const IOOptions& opts, RandomAccessFileReader* file, FileSystem& fs,
+    FilePrefetchBuffer* prefetch_buffer, uint64_t expected_file_size,
+    Footer* footer, uint64_t enforce_table_magic_number) {
+  uint64_t file_size_from_file_system = 0;
+  Status s;
+  // Prefer the more efficient FSRandomAccessFile::GetFileSize when available
+  s = file->file()->GetFileSize(&file_size_from_file_system);
+  if (!s.ok()) {
+    // Fall back on FileSystem::GetFileSize on failure
+    s = fs.GetFileSize(file->file_name(), IOOptions(),
+                       &file_size_from_file_system, nullptr);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  if (expected_file_size != file_size_from_file_system) {
+    // When file is opened during DB Open, the expected file size is from
+    // manifest. Otherwise it is not guaranteed.
+    return Status::Corruption("Sst file size mismatch between expected " +
+                              std::to_string(expected_file_size) +
+                              " and file system " +
+                              std::to_string(file_size_from_file_system) +
+                              " sstable: " + file->file_name());
+  }
+
+  if (expected_file_size < Footer::kMinEncodedLength) {
     return Status::Corruption("file is too short (" +
-                              std::to_string(file_size) +
+                              std::to_string(expected_file_size) +
                               " bytes) to be an "
                               "sstable: " +
                               file->file_name());
@@ -492,10 +531,9 @@ static Status ReadFooterFromFileInternal(const IOOptions& opts,
   std::array<char, Footer::kMaxEncodedLength + 1> footer_buf;
   AlignedBuf internal_buf;
   Slice footer_input;
-  uint64_t read_offset = (file_size > Footer::kMaxEncodedLength)
-                             ? file_size - Footer::kMaxEncodedLength
+  uint64_t read_offset = (expected_file_size > Footer::kMaxEncodedLength)
+                             ? expected_file_size - Footer::kMaxEncodedLength
                              : 0;
-  Status s;
   // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now,
   // there is no readahead for point lookups, so TryReadFromCache will fail if
   // the required data is not in the prefetch buffer. Once deadline is enabled
@@ -520,23 +558,14 @@ static Status ReadFooterFromFileInternal(const IOOptions& opts,
 
   TEST_SYNC_POINT_CALLBACK("ReadFooterFromFileInternal:0", &footer_input);
 
-  // Check that we actually read the whole footer from the file. It may be
-  // that size isn't correct.
+  // Check that we actually read the whole footer from the file.
   if (footer_input.size() < Footer::kMinEncodedLength) {
-    uint64_t size_on_disk = 0;
-    if (fs.GetFileSize(file->file_name(), IOOptions(), &size_on_disk, nullptr)
-            .ok()) {
-      // Similar to CheckConsistency message, but not completely sure the
-      // expected size always came from manifest.
-      return Status::Corruption("Sst file size mismatch: " + file->file_name() +
-                                ". Expected " + std::to_string(file_size) +
-                                ", actual size " +
-                                std::to_string(size_on_disk) + "\n");
-    } else {
-      return Status::Corruption(
-          "Missing SST footer data in file " + file->file_name() +
-          " File too short? Expected size: " + std::to_string(file_size));
-    }
+    return Status::Corruption(
+        "The number of bytes read for Footer input " +
+        std::to_string(footer_input.size()) +
+        " is smaller than minimum footer encoded length: " +
+        std::to_string(Footer::kMinEncodedLength) + " for file " +
+        file->file_name() + "\n");
   }
 
   s = footer->DecodeFrom(footer_input, read_offset, enforce_table_magic_number);
@@ -549,20 +578,21 @@ static Status ReadFooterFromFileInternal(const IOOptions& opts,
 
 Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
                           FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
-                          uint64_t file_size, Footer* footer,
+                          uint64_t expected_file_size, Footer* footer,
                           uint64_t enforce_table_magic_number,
                           Statistics* stats) {
-  Status s =
-      ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer, file_size,
-                                 footer, enforce_table_magic_number);
+  Status s = ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer,
+                                        expected_file_size, footer,
+                                        enforce_table_magic_number);
   if (s.IsCorruption() &&
       CheckFSFeatureSupport(&fs, FSSupportedOps::kVerifyAndReconstructRead)) {
     IOOptions new_opts = opts;
     new_opts.verify_and_reconstruct_read = true;
     footer->Reset();
     s = ReadFooterFromFileInternal(new_opts, file, fs,
-                                   /*prefetch_buffer=*/nullptr, file_size,
-                                   footer, enforce_table_magic_number);
+                                   /*prefetch_buffer=*/nullptr,
+                                   expected_file_size, footer,
+                                   enforce_table_magic_number);
     RecordTick(stats, FILE_READ_CORRUPTION_RETRY_COUNT);
     if (s.ok()) {
       RecordTick(stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
@@ -653,70 +683,81 @@ uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
   }
 }
 
-Status UncompressBlockData(const UncompressionInfo& uncompression_info,
-                           const char* data, size_t size,
-                           BlockContents* out_contents, uint32_t format_version,
+Status DecompressBlockData(Decompressor::Args& args, Decompressor& decompressor,
+                           BlockContents* out_contents,
                            const ImmutableOptions& ioptions,
                            MemoryAllocator* allocator) {
-  Status ret = Status::OK();
-
-  assert(uncompression_info.type() != kNoCompression &&
-         "Invalid compression type");
+  assert(args.compression_type != kNoCompression && "Invalid compression type");
 
   StopWatchNano timer(ioptions.clock,
                       ShouldReportDetailedTime(ioptions.env, ioptions.stats));
-  size_t uncompressed_size = 0;
-  const char* error_msg = nullptr;
-  CacheAllocationPtr ubuf = UncompressData(
-      uncompression_info, data, size, &uncompressed_size,
-      GetCompressFormatForVersion(format_version), allocator, &error_msg);
-  if (!ubuf) {
-    if (!CompressionTypeSupported(uncompression_info.type())) {
-      ret = Status::NotSupported(
-          "Unsupported compression method for this build",
-          CompressionTypeToString(uncompression_info.type()));
-    } else {
-      std::ostringstream oss;
-      oss << "Corrupted compressed block contents";
-      if (error_msg) {
-        oss << ": " << error_msg;
-      }
-      ret = Status::Corruption(
-          oss.str(), CompressionTypeToString(uncompression_info.type()));
-    }
-    return ret;
+
+  Status s = decompressor.ExtractUncompressedSize(args);
+  if (UNLIKELY(!s.ok())) {
+    return s;
+  }
+  CacheAllocationPtr ubuf = AllocateBlock(args.uncompressed_size, allocator);
+  s = decompressor.DecompressBlock(args, ubuf.get());
+  if (UNLIKELY(!s.ok())) {
+    return s;
   }
 
-  *out_contents = BlockContents(std::move(ubuf), uncompressed_size);
+  *out_contents = BlockContents(std::move(ubuf), args.uncompressed_size);
 
   if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) {
     RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS,
                           timer.ElapsedNanos());
   }
-  RecordTick(ioptions.stats, BYTES_DECOMPRESSED_FROM, size);
+  RecordTick(ioptions.stats, BYTES_DECOMPRESSED_FROM,
+             args.compressed_data.size());
   RecordTick(ioptions.stats, BYTES_DECOMPRESSED_TO, out_contents->data.size());
   RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED);
 
-  TEST_SYNC_POINT_CALLBACK("UncompressBlockData:TamperWithReturnValue",
-                           static_cast<void*>(&ret));
-  TEST_SYNC_POINT_CALLBACK(
-      "UncompressBlockData:"
-      "TamperWithDecompressionOutput",
-      static_cast<void*>(out_contents));
+  TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithReturnValue",
+                           static_cast<void*>(&s));
+  TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithDecompressionOutput",
+                           static_cast<void*>(out_contents));
 
-  return ret;
+  return s;
 }
 
-Status UncompressSerializedBlock(const UncompressionInfo& uncompression_info,
-                                 const char* data, size_t size,
+Status DecompressBlockData(const char* data, size_t size, CompressionType type,
+                           Decompressor& decompressor,
+                           BlockContents* out_contents,
+                           const ImmutableOptions& ioptions,
+                           MemoryAllocator* allocator,
+                           Decompressor::ManagedWorkingArea* working_area) {
+  Decompressor::Args args;
+  args.compressed_data = Slice(data, size);
+  args.compression_type = type;
+  args.working_area = working_area;
+  return DecompressBlockData(args, decompressor, out_contents, ioptions,
+                             allocator);
+}
+
+Status DecompressSerializedBlock(const char* data, size_t size,
+                                 CompressionType type,
+                                 Decompressor& decompressor,
                                  BlockContents* out_contents,
-                                 uint32_t format_version,
                                  const ImmutableOptions& ioptions,
                                  MemoryAllocator* allocator) {
   assert(data[size] != kNoCompression);
-  assert(data[size] == static_cast<char>(uncompression_info.type()));
-  return UncompressBlockData(uncompression_info, data, size, out_contents,
-                             format_version, ioptions, allocator);
+  assert(data[size] == static_cast<char>(type));
+  return DecompressBlockData(data, size, type, decompressor, out_contents,
+                             ioptions, allocator);
+}
+
+Status DecompressSerializedBlock(Decompressor::Args& args,
+                                 Decompressor& decompressor,
+                                 BlockContents* out_contents,
+                                 const ImmutableOptions& ioptions,
+                                 MemoryAllocator* allocator) {
+  assert(args.compressed_data.data()[args.compressed_data.size()] !=
+         kNoCompression);
+  assert(args.compressed_data.data()[args.compressed_data.size()] ==
+         static_cast<char>(args.compression_type));
+  return DecompressBlockData(args, decompressor, out_contents, ioptions,
+                             allocator);
 }
 
 // Replace the contents of db_host_id with the actual hostname, if db_host_id
diff --git a/table/format.h b/table/format.h
index dac5d695be45..be7c0fa8abff 100644
--- a/table/format.h
+++ b/table/format.h
@@ -34,7 +34,6 @@ bool ShouldReportDetailedTime(Env* env, Statistics* stats);
 // the length of the magic number in bytes.
 constexpr uint32_t kMagicNumberLengthByte = 8;
 
-extern const uint64_t kLegacyBlockBasedTableMagicNumber;
 extern const uint64_t kBlockBasedTableMagicNumber;
 
 extern const uint64_t kLegacyPlainTableMagicNumber;
@@ -55,7 +54,7 @@ class BlockHandle {
   uint64_t offset() const { return offset_; }
   void set_offset(uint64_t _offset) { offset_ = _offset; }
 
-  // The size of the stored block
+  // The size of the stored block, this size does not include the block trailer.
   uint64_t size() const { return size_; }
   void set_size(uint64_t _size) { size_ = _size; }
 
@@ -90,6 +89,16 @@ class BlockHandle {
   static const BlockHandle kNullBlockHandle;
 };
 
+struct EncodedBlockHandle {
+  explicit EncodedBlockHandle(const BlockHandle& h) {
+    auto end = h.EncodeTo(buffer.data());
+    size = end - buffer.data();
+  }
+  Slice AsSlice() const { return Slice(buffer.data(), size); }
+  std::array<char, BlockHandle::kMaxEncodedLength> buffer;
+  size_t size;
+};
+
 // Value in block-based table file index.
 //
 // The index entry for block n is: y -> h, [x],
@@ -153,17 +162,49 @@ inline uint32_t ChecksumModifierForContext(uint32_t base_context_checksum,
   return modifier & all_or_nothing;
 }
 
-inline uint32_t GetCompressFormatForVersion(uint32_t format_version) {
-  // As of format_version 2, we encode compressed block with
-  // compress_format_version == 2. Before that, the version is 1.
-  // DO NOT CHANGE THIS FUNCTION, it affects disk format
-  return format_version >= 2 ? 2 : 1;
-}
+constexpr uint32_t kLatestBbtFormatVersion = 7;
 
-constexpr uint32_t kLatestFormatVersion = 6;
+// Minimum format version supported for reading SST files in block-based format.
+//
+// When phasing out old format versions, first increase the write minimum,
+// then later (>= 6 mo) increase the read minimum when removing the
+// implementation for both read and write.
+constexpr uint32_t kMinSupportedBbtFormatVersionForRead = 2;
+
+// Minimum format version supported for writing new SST files in block-based
+// format. This should be >= kMinSupportedFormatVersionForRead.
+//
+// When phasing out old format versions, first increase the write minimum,
+// then later (>= 6 mo) increase the read minimum when removing the
+// implementation for both read and write.
+constexpr uint32_t kMinSupportedBbtFormatVersionForWrite = 2;
+static_assert(kMinSupportedBbtFormatVersionForWrite >=
+              kMinSupportedBbtFormatVersionForRead);
+
+inline bool IsSupportedFormatVersionForRead(uint64_t magic, uint32_t version) {
+  if (magic == kBlockBasedTableMagicNumber) {
+    return version >= kMinSupportedBbtFormatVersionForRead &&
+           version <= kLatestBbtFormatVersion;
+  } else if (magic == kPlainTableMagicNumber) {
+    return version == 0;
+  } else if (magic == kCuckooTableMagicNumber) {
+    return version == 1;
+  } else {
+    return false;
+  }
+}
 
-inline bool IsSupportedFormatVersion(uint32_t version) {
-  return version <= kLatestFormatVersion;
+inline bool IsSupportedFormatVersionForWrite(uint64_t magic, uint32_t version) {
+  if (magic == kBlockBasedTableMagicNumber) {
+    return version >= kMinSupportedBbtFormatVersionForWrite &&
+           version <= kLatestBbtFormatVersion;
+  } else if (magic == kPlainTableMagicNumber) {
+    return version == 0;
+  } else if (magic == kCuckooTableMagicNumber) {
+    return version == 1;
+  } else {
+    return false;
+  }
 }
 
 // Same as having a unique id in footer.
@@ -175,6 +216,10 @@ inline bool FormatVersionUsesIndexHandleInFooter(uint32_t version) {
   return version < 6;
 }
 
+inline bool FormatVersionUsesCompressionManagerName(uint32_t version) {
+  return version >= 7;
+}
+
 // Footer encapsulates the fixed information stored at the tail end of every
 // SST file. In general, it should only include things that cannot go
 // elsewhere under the metaindex block. For example, checksum_type is
@@ -308,6 +353,10 @@ class FooterBuilder {
   std::array<char, Footer::kMaxEncodedLength> data_;
 };
 
+// Set to true to allow unit testing of writing unsupported block-based table
+// format versions (to test read side)
+bool& TEST_AllowUnsupportedFormatVersion();
+
 // Read the footer from file
 // If enforce_table_magic_number != 0, ReadFooterFromFile() will return
 // corruption if table_magic number is not equal to enforce_table_magic_number
@@ -382,6 +431,7 @@ struct BlockContents {
 
   // The additional memory space taken by the block data.
   size_t usable_size() const {
+    // FIXME: doesn't account for possible block trailer
     if (allocation.get() != nullptr) {
       auto allocator = allocation.get_deleter().allocator;
       if (allocator) {
@@ -416,21 +466,30 @@ struct BlockContents {
 // The `data` points to serialized block contents read in from file, which
 // must be compressed and include a trailer beyond `size`. A new buffer is
 // allocated with the given allocator (or default) and the uncompressed
-// contents are returned in `out_contents`.
-// format_version is as defined in include/rocksdb/table.h, which is
-// used to determine compression format version.
-Status UncompressSerializedBlock(const UncompressionInfo& info,
-                                 const char* data, size_t size,
+// contents are returned in `out_contents`. Statistics updated.
+Status DecompressSerializedBlock(const char* data, size_t size,
+                                 CompressionType type,
+                                 Decompressor& decompressor,
                                  BlockContents* out_contents,
-                                 uint32_t format_version,
                                  const ImmutableOptions& ioptions,
                                  MemoryAllocator* allocator = nullptr);
 
-// This is a variant of UncompressSerializedBlock that does not expect a
-// block trailer beyond `size`. (CompressionType is taken from `info`.)
-Status UncompressBlockData(const UncompressionInfo& info, const char* data,
-                           size_t size, BlockContents* out_contents,
-                           uint32_t format_version,
+Status DecompressSerializedBlock(Decompressor::Args& args,
+                                 Decompressor& decompressor,
+                                 BlockContents* out_contents,
+                                 const ImmutableOptions& ioptions,
+                                 MemoryAllocator* allocator = nullptr);
+
+// This is a variant of DecompressSerializedBlock that does not expect a
+// block trailer beyond `size`. (CompressionType is passed in.)
+Status DecompressBlockData(
+    const char* data, size_t size, CompressionType type,
+    Decompressor& decompressor, BlockContents* out_contents,
+    const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr,
+    Decompressor::ManagedWorkingArea* working_area = nullptr);
+
+Status DecompressBlockData(Decompressor::Args& args, Decompressor& decompressor,
+                           BlockContents* out_contents,
                            const ImmutableOptions& ioptions,
                            MemoryAllocator* allocator = nullptr);
 
diff --git a/table/internal_iterator.h b/table/internal_iterator.h
index 8ecbb0f90b4f..b385ef55a2c0 100644
--- a/table/internal_iterator.h
+++ b/table/internal_iterator.h
@@ -10,6 +10,7 @@
 
 #include "db/dbformat.h"
 #include "file/readahead_file_info.h"
+#include "rocksdb/advanced_iterator.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/status.h"
@@ -19,19 +20,6 @@ namespace ROCKSDB_NAMESPACE {
 
 class PinnedIteratorsManager;
 
-enum class IterBoundCheck : char {
-  kUnknown = 0,
-  kOutOfBound,
-  kInbound,
-};
-
-struct IterateResult {
-  Slice key;
-  IterBoundCheck bound_check_result = IterBoundCheck::kUnknown;
-  // If false, PrepareValue() needs to be called before value().
-  bool value_prepared = true;
-};
-
 template <class TValue>
 class InternalIteratorBase : public Cleanable {
  public:
@@ -212,6 +200,8 @@ class InternalIteratorBase : public Cleanable {
   // used by MergingIterator and LevelIterator for now.
   virtual bool IsDeleteRangeSentinelKey() const { return false; }
 
+  virtual void Prepare(const MultiScanArgs* /*scan_opts*/) {}
+
  protected:
   void SeekForPrevImpl(const Slice& target, const CompareInterface* cmp) {
     Seek(target);
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
index b53076910ec6..b585aaa4a7e0 100644
--- a/table/iterator_wrapper.h
+++ b/table/iterator_wrapper.h
@@ -195,6 +195,14 @@ class IteratorWrapperBase {
     return iter_->IsDeleteRangeSentinelKey();
   }
 
+  // scan_opts lifetime is guaranteed until the iterator is destructed, or
+  // Prepare() is called with a new scan_opts
+  void Prepare(const MultiScanArgs* scan_opts) {
+    if (iter_) {
+      iter_->Prepare(scan_opts);
+    }
+  }
+
  private:
   void Update() {
     valid_ = iter_->Valid();
diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc
index 375c811c59fc..e27f4c6fa270 100644
--- a/table/merging_iterator.cc
+++ b/table/merging_iterator.cc
@@ -482,6 +482,12 @@ class MergingIterator : public InternalIterator {
            current_->IsValuePinned();
   }
 
+  void Prepare(const MultiScanArgs* scan_opts) override {
+    for (auto& child : children_) {
+      child.iter.Prepare(scan_opts);
+    }
+  }
+
  private:
   // Represents an element in the min/max heap. Each HeapItem corresponds to a
   // point iterator or a range tombstone iterator, differentiated by
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc
index 7d6ab76e294c..72ee79266af6 100644
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@@ -29,8 +29,6 @@ namespace ROCKSDB_NAMESPACE {
 const std::string kPropertiesBlockName = "rocksdb.properties";
 // NB: only used with format_version >= 6
 const std::string kIndexBlockName = "rocksdb.index";
-// Old property block name for backward compatibility
-const std::string kPropertiesBlockOldName = "rocksdb.stats";
 const std::string kCompressionDictBlockName = "rocksdb.compression_dict";
 const std::string kRangeDelBlockName = "rocksdb.range_del";
 
@@ -167,6 +165,9 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
   if (props.key_largest_seqno != UINT64_MAX) {
     Add(TablePropertiesNames::kKeyLargestSeqno, props.key_largest_seqno);
   }
+  if (props.key_smallest_seqno != UINT64_MAX) {
+    Add(TablePropertiesNames::kKeySmallestSeqno, props.key_smallest_seqno);
+  }
 }
 
 Slice PropertyBlockBuilder::Finish() {
@@ -253,6 +254,146 @@ bool NotifyCollectTableCollectorsOnFinish(
   return all_succeeded;
 }
 
+Status ParsePropertiesBlock(
+    const ImmutableOptions& ioptions, uint64_t offset, Block& properties_block,
+    std::unique_ptr<TableProperties>& new_table_properties) {
+  std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
+
+  //  All pre-defined properties of type uint64_t
+  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
+      {TablePropertiesNames::kOriginalFileNumber,
+       &new_table_properties->orig_file_number},
+      {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
+      {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
+      {TablePropertiesNames::kIndexPartitions,
+       &new_table_properties->index_partitions},
+      {TablePropertiesNames::kTopLevelIndexSize,
+       &new_table_properties->top_level_index_size},
+      {TablePropertiesNames::kIndexKeyIsUserKey,
+       &new_table_properties->index_key_is_user_key},
+      {TablePropertiesNames::kIndexValueIsDeltaEncoded,
+       &new_table_properties->index_value_is_delta_encoded},
+      {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
+      {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
+      {TablePropertiesNames::kRawValueSize,
+       &new_table_properties->raw_value_size},
+      {TablePropertiesNames::kNumDataBlocks,
+       &new_table_properties->num_data_blocks},
+      {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
+      {TablePropertiesNames::kNumFilterEntries,
+       &new_table_properties->num_filter_entries},
+      {TablePropertiesNames::kDeletedKeys,
+       &new_table_properties->num_deletions},
+      {TablePropertiesNames::kMergeOperands,
+       &new_table_properties->num_merge_operands},
+      {TablePropertiesNames::kNumRangeDeletions,
+       &new_table_properties->num_range_deletions},
+      {TablePropertiesNames::kFormatVersion,
+       &new_table_properties->format_version},
+      {TablePropertiesNames::kFixedKeyLen,
+       &new_table_properties->fixed_key_len},
+      {TablePropertiesNames::kColumnFamilyId,
+       &new_table_properties->column_family_id},
+      {TablePropertiesNames::kCreationTime,
+       &new_table_properties->creation_time},
+      {TablePropertiesNames::kOldestKeyTime,
+       &new_table_properties->oldest_key_time},
+      {TablePropertiesNames::kNewestKeyTime,
+       &new_table_properties->newest_key_time},
+      {TablePropertiesNames::kFileCreationTime,
+       &new_table_properties->file_creation_time},
+      {TablePropertiesNames::kSlowCompressionEstimatedDataSize,
+       &new_table_properties->slow_compression_estimated_data_size},
+      {TablePropertiesNames::kFastCompressionEstimatedDataSize,
+       &new_table_properties->fast_compression_estimated_data_size},
+      {TablePropertiesNames::kTailStartOffset,
+       &new_table_properties->tail_start_offset},
+      {TablePropertiesNames::kUserDefinedTimestampsPersisted,
+       &new_table_properties->user_defined_timestamps_persisted},
+      {TablePropertiesNames::kKeyLargestSeqno,
+       &new_table_properties->key_largest_seqno},
+      {TablePropertiesNames::kKeySmallestSeqno,
+       &new_table_properties->key_smallest_seqno},
+  };
+
+  Status s;
+  std::string last_key;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    s = iter->status();
+    if (!s.ok()) {
+      break;
+    }
+
+    auto key = iter->key().ToString();
+    // properties block should be strictly sorted with no duplicate key.
+    if (!last_key.empty() &&
+        BytewiseComparator()->Compare(key, last_key) <= 0) {
+      s = Status::Corruption("properties unsorted");
+      break;
+    }
+    last_key = key;
+
+    auto raw_val = iter->value();
+    auto pos = predefined_uint64_properties.find(key);
+
+    if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
+      new_table_properties->external_sst_file_global_seqno_offset =
+          offset + iter->ValueOffset();
+    }
+
+    if (pos != predefined_uint64_properties.end()) {
+      if (key == TablePropertiesNames::kDeletedKeys ||
+          key == TablePropertiesNames::kMergeOperands) {
+        // Insert in user-collected properties for API backwards compatibility
+        new_table_properties->user_collected_properties.insert(
+            {key, raw_val.ToString()});
+      }
+      // handle predefined rocksdb properties
+      uint64_t val;
+      if (!GetVarint64(&raw_val, &val)) {
+        // skip malformed value
+        auto error_msg =
+            "Detect malformed value in properties meta-block:"
+            "\tkey: " +
+            key + "\tval: " + raw_val.ToString();
+        ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
+        continue;
+      }
+      *(pos->second) = val;
+    } else if (key == TablePropertiesNames::kDbId) {
+      new_table_properties->db_id = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kDbSessionId) {
+      new_table_properties->db_session_id = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kDbHostId) {
+      new_table_properties->db_host_id = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kFilterPolicy) {
+      new_table_properties->filter_policy_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kColumnFamilyName) {
+      new_table_properties->column_family_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kComparator) {
+      new_table_properties->comparator_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kMergeOperator) {
+      new_table_properties->merge_operator_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kPrefixExtractorName) {
+      new_table_properties->prefix_extractor_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kPropertyCollectors) {
+      new_table_properties->property_collectors_names = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kCompression) {
+      new_table_properties->compression_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kCompressionOptions) {
+      new_table_properties->compression_options = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
+      new_table_properties->seqno_to_time_mapping = raw_val.ToString();
+    } else {
+      // handle user-collected properties
+      new_table_properties->user_collected_properties.insert(
+          {key, raw_val.ToString()});
+    }
+  }
+
+  return s;
+}
+
 // FIXME: should be a parameter for reading table properties to use persistent
 // cache?
 Status ReadTablePropertiesHelper(
@@ -282,7 +423,7 @@ Status ReadTablePropertiesHelper(
       BlockFetcher block_fetcher(
           file, prefetch_buffer, footer, modified_ro, handle, &block_contents,
           ioptions, false /* decompress */, false /*maybe_compressed*/,
-          BlockType::kProperties, UncompressionDict::GetEmptyDict(),
+          BlockType::kProperties, nullptr /*decompressor*/,
           PersistentCacheOptions::kEmpty, memory_allocator);
       s = block_fetcher.ReadBlockContents();
       if (!s.ok()) {
@@ -296,15 +437,16 @@ Status ReadTablePropertiesHelper(
       // If retrying, use a stronger file system read to check and correct
       // data corruption
       IOOptions opts;
-      if (PrepareIOFromReadOptions(ro, ioptions.clock, opts) !=
+      IODebugContext dbg;
+      if (PrepareIOFromReadOptions(ro, ioptions.clock, opts, &dbg) !=
           IOStatus::OK()) {
         return s;
       }
       opts.verify_and_reconstruct_read = true;
       std::unique_ptr<char[]> data(new char[len]);
       Slice result;
-      IOStatus io_s =
-          file->Read(opts, handle.offset(), len, &result, data.get(), nullptr);
+      IOStatus io_s = file->Read(opts, handle.offset(), len, &result,
+                                 data.get(), nullptr, &dbg);
       RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
       if (!io_s.ok()) {
         ROCKS_LOG_INFO(ioptions.info_log,
@@ -324,146 +466,16 @@ Status ReadTablePropertiesHelper(
 
     uint64_t block_size = block_contents.data.size();
     Block properties_block(std::move(block_contents));
-    // Unfortunately, Block::size() might not equal block_contents.data.size(),
-    // and Block hides block_contents
-    std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
-
     std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
-    // All pre-defined properties of type uint64_t
-    std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
-        {TablePropertiesNames::kOriginalFileNumber,
-         &new_table_properties->orig_file_number},
-        {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
-        {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
-        {TablePropertiesNames::kIndexPartitions,
-         &new_table_properties->index_partitions},
-        {TablePropertiesNames::kTopLevelIndexSize,
-         &new_table_properties->top_level_index_size},
-        {TablePropertiesNames::kIndexKeyIsUserKey,
-         &new_table_properties->index_key_is_user_key},
-        {TablePropertiesNames::kIndexValueIsDeltaEncoded,
-         &new_table_properties->index_value_is_delta_encoded},
-        {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
-        {TablePropertiesNames::kRawKeySize,
-         &new_table_properties->raw_key_size},
-        {TablePropertiesNames::kRawValueSize,
-         &new_table_properties->raw_value_size},
-        {TablePropertiesNames::kNumDataBlocks,
-         &new_table_properties->num_data_blocks},
-        {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
-        {TablePropertiesNames::kNumFilterEntries,
-         &new_table_properties->num_filter_entries},
-        {TablePropertiesNames::kDeletedKeys,
-         &new_table_properties->num_deletions},
-        {TablePropertiesNames::kMergeOperands,
-         &new_table_properties->num_merge_operands},
-        {TablePropertiesNames::kNumRangeDeletions,
-         &new_table_properties->num_range_deletions},
-        {TablePropertiesNames::kFormatVersion,
-         &new_table_properties->format_version},
-        {TablePropertiesNames::kFixedKeyLen,
-         &new_table_properties->fixed_key_len},
-        {TablePropertiesNames::kColumnFamilyId,
-         &new_table_properties->column_family_id},
-        {TablePropertiesNames::kCreationTime,
-         &new_table_properties->creation_time},
-        {TablePropertiesNames::kOldestKeyTime,
-         &new_table_properties->oldest_key_time},
-        {TablePropertiesNames::kNewestKeyTime,
-         &new_table_properties->newest_key_time},
-        {TablePropertiesNames::kFileCreationTime,
-         &new_table_properties->file_creation_time},
-        {TablePropertiesNames::kSlowCompressionEstimatedDataSize,
-         &new_table_properties->slow_compression_estimated_data_size},
-        {TablePropertiesNames::kFastCompressionEstimatedDataSize,
-         &new_table_properties->fast_compression_estimated_data_size},
-        {TablePropertiesNames::kTailStartOffset,
-         &new_table_properties->tail_start_offset},
-        {TablePropertiesNames::kUserDefinedTimestampsPersisted,
-         &new_table_properties->user_defined_timestamps_persisted},
-        {TablePropertiesNames::kKeyLargestSeqno,
-         &new_table_properties->key_largest_seqno},
-    };
-
-    std::string last_key;
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      s = iter->status();
-      if (!s.ok()) {
-        break;
-      }
-
-      auto key = iter->key().ToString();
-      // properties block should be strictly sorted with no duplicate key.
-      if (!last_key.empty() &&
-          BytewiseComparator()->Compare(key, last_key) <= 0) {
-        s = Status::Corruption("properties unsorted");
-        break;
-      }
-      last_key = key;
-
-      auto raw_val = iter->value();
-      auto pos = predefined_uint64_properties.find(key);
-
-      if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
-        new_table_properties->external_sst_file_global_seqno_offset =
-            handle.offset() + iter->ValueOffset();
-      }
-
-      if (pos != predefined_uint64_properties.end()) {
-        if (key == TablePropertiesNames::kDeletedKeys ||
-            key == TablePropertiesNames::kMergeOperands) {
-          // Insert in user-collected properties for API backwards compatibility
-          new_table_properties->user_collected_properties.insert(
-              {key, raw_val.ToString()});
-        }
-        // handle predefined rocksdb properties
-        uint64_t val;
-        if (!GetVarint64(&raw_val, &val)) {
-          // skip malformed value
-          auto error_msg =
-              "Detect malformed value in properties meta-block:"
-              "\tkey: " +
-              key + "\tval: " + raw_val.ToString();
-          ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
-          continue;
-        }
-        *(pos->second) = val;
-      } else if (key == TablePropertiesNames::kDbId) {
-        new_table_properties->db_id = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kDbSessionId) {
-        new_table_properties->db_session_id = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kDbHostId) {
-        new_table_properties->db_host_id = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kFilterPolicy) {
-        new_table_properties->filter_policy_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kColumnFamilyName) {
-        new_table_properties->column_family_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kComparator) {
-        new_table_properties->comparator_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kMergeOperator) {
-        new_table_properties->merge_operator_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kPrefixExtractorName) {
-        new_table_properties->prefix_extractor_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kPropertyCollectors) {
-        new_table_properties->property_collectors_names = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kCompression) {
-        new_table_properties->compression_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kCompressionOptions) {
-        new_table_properties->compression_options = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
-        new_table_properties->seqno_to_time_mapping = raw_val.ToString();
-      } else {
-        // handle user-collected properties
-        new_table_properties->user_collected_properties.insert(
-            {key, raw_val.ToString()});
-      }
-    }
+    s = ParsePropertiesBlock(ioptions, handle.offset(), properties_block,
+                             new_table_properties);
 
     // Modified version of BlockFetcher checksum verification
     // (See write_global_seqno comment above)
     if (s.ok() && footer.GetBlockTrailerSize() > 0) {
       s = VerifyBlockChecksum(footer, properties_block.data(), block_size,
-                              file->file_name(), handle.offset());
+                              file->file_name(), handle.offset(),
+                              BlockType::kProperties);
       if (s.IsCorruption()) {
         if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
           std::string tmp_buf(properties_block.data(), len);
@@ -472,7 +484,8 @@ Status ReadTablePropertiesHelper(
               handle.offset();
           EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
           s = VerifyBlockChecksum(footer, tmp_buf.data(), block_size,
-                                  file->file_name(), handle.offset());
+                                  file->file_name(), handle.offset(),
+                                  BlockType::kProperties);
         }
       }
     }
@@ -530,14 +543,6 @@ Status FindOptionalMetaBlock(InternalIterator* meta_index_iter,
     if (meta_index_iter->Valid() && meta_index_iter->key() == meta_block_name) {
       Slice v = meta_index_iter->value();
       return block_handle->DecodeFrom(&v);
-    } else if (meta_block_name == kPropertiesBlockName) {
-      // Have to try old name for compatibility
-      meta_index_iter->Seek(kPropertiesBlockOldName);
-      if (meta_index_iter->status().ok() && meta_index_iter->Valid() &&
-          meta_index_iter->key() == kPropertiesBlockOldName) {
-        Slice v = meta_index_iter->value();
-        return block_handle->DecodeFrom(&v);
-      }
     }
   }
   // else
@@ -567,8 +572,9 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file,
                                 Footer* footer_out) {
   Footer footer;
   IOOptions opts;
+  IODebugContext dbg;
   Status s;
-  s = file->PrepareIOOptions(read_options, opts);
+  s = file->PrepareIOOptions(read_options, opts, &dbg);
   if (!s.ok()) {
     return s;
   }
@@ -585,7 +591,7 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file,
   return BlockFetcher(file, prefetch_buffer, footer, read_options,
                       metaindex_handle, metaindex_contents, ioptions,
                       false /* do decompression */, false /*maybe_compressed*/,
-                      BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(),
+                      BlockType::kMetaIndex, nullptr /*decompressor*/,
                       PersistentCacheOptions::kEmpty, memory_allocator)
       .ReadBlockContents();
 }
@@ -638,8 +644,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file,
   return BlockFetcher(file, prefetch_buffer, footer, read_options, block_handle,
                       contents, ioptions, false /* decompress */,
                       false /*maybe_compressed*/, block_type,
-                      UncompressionDict::GetEmptyDict(),
-                      PersistentCacheOptions::kEmpty, memory_allocator)
+                      nullptr /*decompressor*/, PersistentCacheOptions::kEmpty,
+                      memory_allocator)
       .ReadBlockContents();
 }
 
diff --git a/table/meta_blocks.h b/table/meta_blocks.h
index a6aacdf5030a..0012e9c305fc 100644
--- a/table/meta_blocks.h
+++ b/table/meta_blocks.h
@@ -22,6 +22,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+class Block;
 class BlockBuilder;
 class BlockHandle;
 class Env;
@@ -33,7 +34,6 @@ struct TableProperties;
 // Meta block names for metaindex
 extern const std::string kPropertiesBlockName;
 extern const std::string kIndexBlockName;
-extern const std::string kPropertiesBlockOldName;
 extern const std::string kCompressionDictBlockName;
 extern const std::string kRangeDelBlockName;
 
@@ -110,6 +110,10 @@ bool NotifyCollectTableCollectorsOnFinish(
     UserCollectedProperties& user_collected_properties,
     UserCollectedProperties& readable_properties);
 
+Status ParsePropertiesBlock(
+    const ImmutableOptions& ioptions, uint64_t offset, Block& block,
+    std::unique_ptr<TableProperties>& new_table_properties);
+
 // Read table properties from a file using known BlockHandle.
 // @returns a status to indicate if the operation succeeded. On success,
 //          *table_properties will point to a heap-allocated TableProperties
diff --git a/table/multiget_context.h b/table/multiget_context.h
index a82c08aabe3c..c42b3b2c1869 100644
--- a/table/multiget_context.h
+++ b/table/multiget_context.h
@@ -129,7 +129,9 @@ class MultiGetContext {
       lookup_key_ptr_ = reinterpret_cast<LookupKey*>(lookup_key_heap_buf.get());
     }
 
-    for (size_t iter = 0; iter != num_keys_; ++iter) {
+    for (size_t iter = 0;
+         iter < num_keys_ && /* suppress a warning */ iter < MAX_BATCH_SIZE;
+         ++iter) {
       // autovector may not be contiguous storage, so make a copy
       sorted_keys_[iter] = (*sorted_keys)[begin + iter];
       sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter])
@@ -219,7 +221,9 @@ class MultiGetContext {
         while (++index_ < range_->end_ &&
                (Mask{1} << index_) &
                    (range_->ctx_->value_mask_ | range_->skip_mask_ |
-                    range_->invalid_mask_));
+                    range_->invalid_mask_)) {
+          // empty loop body
+        }
         return *this;
       }
 
diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc
index 541b4a5b768a..9c4f87553774 100644
--- a/table/plain/plain_table_builder.cc
+++ b/table/plain/plain_table_builder.cc
@@ -151,6 +151,14 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
     return;
   }
 
+#ifndef NDEBUG
+  bool skip = false;
+  TEST_SYNC_POINT_CALLBACK("PlainTableBuilder::Add::skip", (void*)&skip);
+  if (skip) {
+    return;
+  }
+#endif  // !NDEBUG
+
   // Store key hash
   if (store_index_in_file_) {
     if (moptions_.prefix_extractor == nullptr) {
diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc
index 578e92aa3126..b90f24da6898 100644
--- a/table/plain/plain_table_reader.cc
+++ b/table/plain/plain_table_reader.cc
@@ -120,7 +120,9 @@ Status PlainTableReader::Open(
     bool full_scan_mode, const bool immortal_table,
     const SliceTransform* prefix_extractor) {
   if (file_size > PlainTableIndex::kMaxFileSize) {
-    return Status::NotSupported("File is too large for PlainTableReader!");
+    return Status::NotSupported("File size " + std::to_string(file_size) +
+                                " exceeds PlainTableReader max file size " +
+                                std::to_string(PlainTableIndex::kMaxFileSize));
   }
 
   std::unique_ptr<TableProperties> props;
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index 905eef7004a7..a4b235546559 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -23,6 +23,7 @@
 #include "port/port.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/status.h"
@@ -47,12 +48,13 @@ SstFileDumper::SstFileDumper(const Options& options,
                              Temperature file_temp, size_t readahead_size,
                              bool verify_checksum, bool output_hex,
                              bool decode_blob_index, const EnvOptions& soptions,
-                             bool silent)
+                             bool silent, bool show_sequence_number_type)
     : file_name_(file_path),
       read_num_(0),
       file_temp_(file_temp),
       output_hex_(output_hex),
       decode_blob_index_(decode_blob_index),
+      show_sequence_number_type_(show_sequence_number_type),
       soptions_(soptions),
       silent_(silent),
       options_(options),
@@ -84,6 +86,7 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) {
   uint64_t file_size = 0;
   FileOptions fopts = soptions_;
   fopts.temperature = file_temp_;
+  fopts.file_checksum_func_name = kNoFileChecksumFuncName;
   Status s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
   if (s.ok()) {
     // check empty file
@@ -128,18 +131,18 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) {
       if (magic_number == kCuckooTableMagicNumber) {
         fopts = soptions_;
         fopts.temperature = file_temp_;
+        fopts.file_checksum_func_name = kNoFileChecksumFuncName;
       }
 
       fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
       file_.reset(new RandomAccessFileReader(std::move(file), file_path));
     }
 
-    // For old sst format, ReadTableProperties might fail but file can be read
-    if (ReadTableProperties(magic_number, file_.get(), file_size,
+    s = ReadTableProperties(magic_number, file_.get(), file_size,
                             (magic_number == kBlockBasedTableMagicNumber)
                                 ? &prefetch_buffer
-                                : nullptr)
-            .ok()) {
+                                : nullptr);
+    if (s.ok()) {
       s = SetTableOptionsByMagicNumber(magic_number);
       if (s.ok()) {
         if (table_properties_ && !table_properties_->comparator_name.empty()) {
@@ -154,10 +157,16 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) {
           }
         }
       }
-    } else {
-      s = SetOldTableOptions();
     }
     options_.comparator = internal_comparator_.user_comparator();
+
+    {
+      Status status = ReadMetaIndexBlockInFile(
+          file_.get(), file_size, magic_number, ImmutableOptions(options_),
+          ReadOptions(), &meta_index_contents_);
+      // Ignore any errors since this is required for a specific CLI option
+      status.PermitUncheckedError();
+    }
   }
 
   if (s.ok()) {
@@ -172,7 +181,8 @@ Status SstFileDumper::NewTableReader(
     const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
     std::unique_ptr<TableReader>* /*table_reader*/) {
   auto t_opt = TableReaderOptions(
-      ioptions_, moptions_.prefix_extractor, soptions_, internal_comparator_,
+      ioptions_, moptions_.prefix_extractor,
+      moptions_.compression_manager.get(), soptions_, internal_comparator_,
       0 /* block_protection_bytes_per_key */, false /* skip_filters */,
       false /* immortal */, true /* force_direct_prefetch */, -1 /* level */,
       nullptr /* block_cache_tracer */, 0 /* max_file_size_for_l0_meta_pin */,
@@ -211,7 +221,7 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) {
   Env* env = options_.env;
   Status s = env->NewWritableFile(out_filename, &out_file, soptions_);
   if (s.ok()) {
-    s = table_reader_->DumpTable(out_file.get());
+    s = table_reader_->DumpTable(out_file.get(), show_sequence_number_type_);
   }
   if (!s.ok()) {
     // close the file before return error, ignore the close error if there's any
@@ -222,8 +232,9 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) {
 }
 
 Status SstFileDumper::CalculateCompressedTableSize(
-    const TableBuilderOptions& tb_options, size_t block_size,
-    uint64_t* num_data_blocks, uint64_t* compressed_table_size) {
+    const TableBuilderOptions& tb_options, TableProperties* props,
+    std::chrono::microseconds* write_time,
+    std::chrono::microseconds* read_time) {
   std::unique_ptr<Env> env(NewMemEnv(options_.env));
   std::unique_ptr<WritableFileWriter> dest_writer;
   Status s =
@@ -232,12 +243,11 @@ Status SstFileDumper::CalculateCompressedTableSize(
   if (!s.ok()) {
     return s;
   }
-  BlockBasedTableOptions table_options;
-  table_options.block_size = block_size;
-  BlockBasedTableFactory block_based_tf(table_options);
-  std::unique_ptr<TableBuilder> table_builder;
-  table_builder.reset(
-      block_based_tf.NewTableBuilder(tb_options, dest_writer.get()));
+  std::chrono::steady_clock::time_point start =
+      std::chrono::steady_clock::now();
+  std::unique_ptr<TableBuilder> table_builder{
+      tb_options.moptions.table_factory->NewTableBuilder(tb_options,
+                                                         dest_writer.get())};
   std::unique_ptr<InternalIterator> iter(table_reader_->NewIterator(
       read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr,
       /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
@@ -248,56 +258,112 @@ Status SstFileDumper::CalculateCompressedTableSize(
   if (!s.ok()) {
     return s;
   }
+  iter.reset();
   s = table_builder->Finish();
+  *write_time = std::chrono::duration_cast<std::chrono::microseconds>(
+      std::chrono::steady_clock::now() - start);
+  if (!s.ok()) {
+    return s;
+  }
+  s = dest_writer->Close({});
+  if (!s.ok()) {
+    return s;
+  }
+  dest_writer.reset();
+  *props = table_builder->GetTableProperties();
+  start = std::chrono::steady_clock::now();
+  TableReaderOptions reader_options(ioptions_, moptions_.prefix_extractor,
+                                    moptions_.compression_manager.get(),
+                                    soptions_, internal_comparator_,
+                                    0 /* block_protection_bytes_per_key */);
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  s = RandomAccessFileReader::Create(env->GetFileSystem(), testFileName,
+                                     soptions_, &file_reader, /*dbg=*/nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  std::unique_ptr<TableReader> table_reader;
+  s = tb_options.moptions.table_factory->NewTableReader(
+      reader_options, std::move(file_reader), table_builder->FileSize(),
+      &table_reader);
   if (!s.ok()) {
     return s;
   }
-  *compressed_table_size = table_builder->FileSize();
-  assert(num_data_blocks != nullptr);
-  *num_data_blocks = table_builder->GetTableProperties().num_data_blocks;
+  iter.reset(table_reader->NewIterator(
+      read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+  }
+  s = iter->status();
+  if (!s.ok()) {
+    return s;
+  }
+  iter.reset();
+  table_reader.reset();
+  file_reader.reset();
+  *read_time = std::chrono::duration_cast<std::chrono::microseconds>(
+      std::chrono::steady_clock::now() - start);
   return env->DeleteFile(testFileName);
 }
 
 Status SstFileDumper::ShowAllCompressionSizes(
-    size_t block_size,
-    const std::vector<std::pair<CompressionType, const char*>>&
-        compression_types,
-    int32_t compress_level_from, int32_t compress_level_to,
-    uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
-    uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer) {
-  fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size);
-  for (auto& i : compression_types) {
-    if (CompressionTypeSupported(i.first)) {
-      fprintf(stdout, "Compression: %-24s\n", i.second);
-      CompressionOptions compress_opt;
-      compress_opt.max_dict_bytes = max_dict_bytes;
-      compress_opt.zstd_max_train_bytes = zstd_max_train_bytes;
-      compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes;
-      compress_opt.use_zstd_dict_trainer = use_zstd_dict_trainer;
+    const std::vector<CompressionType>& compression_types,
+    int32_t compress_level_from, int32_t compress_level_to) {
+#ifndef NDEBUG
+  fprintf(stdout,
+          "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+  BlockBasedTableOptions bbto;
+  if (options_.table_factory->IsInstanceOf(
+          TableFactory::kBlockBasedTableName())) {
+    bbto = *(static_cast_with_check<BlockBasedTableFactory>(
+                 options_.table_factory.get()))
+                ->GetOptions<BlockBasedTableOptions>();
+  }
+
+  for (CompressionType ctype : compression_types) {
+    std::string cname;
+    if (!GetStringFromCompressionType(&cname, ctype).ok()) {
+      // Can produce names like "Reserved4F" for unrecognized values
+      cname = CompressionTypeToString(ctype);
+    }
+    if (options_.compression_manager
+            ? options_.compression_manager->SupportsCompressionType(ctype)
+            : CompressionTypeSupported(ctype)) {
+      CompressionOptions compress_opt = options_.compression_opts;
+      fprintf(stdout,
+              "Compression: %-24s Block Size: %" PRIu64 "  Threads: %u\n",
+              cname.c_str(), bbto.block_size, compress_opt.parallel_threads);
       for (int32_t j = compress_level_from; j <= compress_level_to; j++) {
-        fprintf(stdout, "Compression level: %d", j);
+        fprintf(stdout, "Cx level: %d", j);
         compress_opt.level = j;
-        Status s = ShowCompressionSize(block_size, i.first, compress_opt);
+        Status s = ShowCompressionSize(ctype, compress_opt);
         if (!s.ok()) {
           return s;
         }
       }
     } else {
-      fprintf(stdout, "Unsupported compression type: %s.\n", i.second);
+      fprintf(stdout, "Unsupported compression type: %s.\n", cname.c_str());
     }
   }
   return Status::OK();
 }
 
 Status SstFileDumper::ShowCompressionSize(
-    size_t block_size, CompressionType compress_type,
-    const CompressionOptions& compress_opt) {
-  Options opts;
+    CompressionType compress_type, const CompressionOptions& compress_opt) {
+  Options opts = options_;  // Use compression_manager etc.
   opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
   opts.statistics->set_stats_level(StatsLevel::kAll);
+  if (!opts.table_factory->IsInstanceOf(TableFactory::kBlockBasedTableName())) {
+    // Currently need block-based table for compression
+    opts.table_factory = std::make_shared<BlockBasedTableFactory>();
+  }
+
+  // Create internal Options types
   const ImmutableOptions imoptions(opts);
   const ColumnFamilyOptions cfo(opts);
   const MutableCFOptions moptions(cfo);
+
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
   const WriteOptions write_options;
@@ -312,24 +378,27 @@ Status SstFileDumper::ShowCompressionSize(
       &block_based_table_factories, compress_type, compress_opt,
       TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
       column_family_name, unknown_level, kUnknownNewestKeyTime);
-  uint64_t num_data_blocks = 0;
-  std::chrono::steady_clock::time_point start =
-      std::chrono::steady_clock::now();
-  uint64_t file_size;
-  Status s = CalculateCompressedTableSize(tb_opts, block_size, &num_data_blocks,
-                                          &file_size);
+  TableProperties props;
+  std::chrono::microseconds write_time;
+  std::chrono::microseconds read_time;
+  Status s =
+      CalculateCompressedTableSize(tb_opts, &props, &write_time, &read_time);
   if (!s.ok()) {
     return s;
   }
 
-  std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
-  fprintf(stdout, " Size: %10" PRIu64, file_size);
-  fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks);
-  fprintf(stdout, " Time Taken: %10s microsecs",
-          std::to_string(
-              std::chrono::duration_cast<std::chrono::microseconds>(end - start)
-                  .count())
+  uint64_t num_data_blocks = props.num_data_blocks;
+
+  fprintf(stdout, " Cx size: %10" PRIu64, props.data_size);
+  fprintf(stdout, " Uncx size: %10" PRIu64, props.uncompressed_data_size);
+  fprintf(stdout, " Ratio: %10s",
+          std::to_string(static_cast<double>(props.uncompressed_data_size) /
+                         static_cast<double>(props.data_size))
               .c_str());
+  fprintf(stdout, " Write usec: %10s ",
+          std::to_string(write_time.count()).c_str());
+  fprintf(stdout, " Read usec: %10s ",
+          std::to_string(read_time.count()).c_str());
   const uint64_t compressed_blocks =
       opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED);
   const uint64_t not_compressed_blocks =
@@ -359,11 +428,11 @@ Status SstFileDumper::ShowCompressionSize(
                              : ((static_cast<double>(not_compressed_blocks) /
                                  static_cast<double>(num_data_blocks)) *
                                 100.0);
-  fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
+  fprintf(stdout, " Cx count: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
           compressed_pcnt);
-  fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)",
+  fprintf(stdout, " Not cx for ratio: %6" PRIu64 " (%5.1f%%)",
           ratio_not_compressed_blocks, ratio_not_compressed_pcnt);
-  fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n",
+  fprintf(stdout, " Not cx otherwise: %6" PRIu64 " (%5.1f%%)\n",
           not_compressed_blocks, not_compressed_pcnt);
   return Status::OK();
 }
@@ -389,16 +458,22 @@ Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number,
 Status SstFileDumper::SetTableOptionsByMagicNumber(
     uint64_t table_magic_number) {
   assert(table_properties_);
-  if (table_magic_number == kBlockBasedTableMagicNumber ||
-      table_magic_number == kLegacyBlockBasedTableMagicNumber) {
-    BlockBasedTableFactory* bbtf = new BlockBasedTableFactory();
+  if (table_magic_number == kBlockBasedTableMagicNumber) {
+    // Preserve BlockBasedTableOptions on options_ when possible
+    if (!options_.table_factory->IsInstanceOf(
+            TableFactory::kBlockBasedTableName())) {
+      options_.table_factory = std::make_shared<BlockBasedTableFactory>();
+    }
+
+    BlockBasedTableFactory* bbtf =
+        static_cast_with_check<BlockBasedTableFactory>(
+            options_.table_factory.get());
     // To force tail prefetching, we fake reporting two useful reads of 512KB
     // from the tail.
     // It needs at least two data points to warm up the stats.
     bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
     bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
 
-    options_.table_factory.reset(bbtf);
     if (!silent_) {
       fprintf(stdout, "Sst file format: block-based\n");
     }
@@ -448,16 +523,6 @@ Status SstFileDumper::SetTableOptionsByMagicNumber(
   return Status::OK();
 }
 
-Status SstFileDumper::SetOldTableOptions() {
-  assert(table_properties_ == nullptr);
-  options_.table_factory = std::make_shared<BlockBasedTableFactory>();
-  if (!silent_) {
-    fprintf(stdout, "Sst file format: block-based(old version)\n");
-  }
-
-  return Status::OK();
-}
-
 Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num_limit,
                                      bool has_from, const std::string& from_key,
                                      bool has_to, const std::string& to_key,
@@ -474,12 +539,11 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num_limit,
   const Comparator* ucmp = internal_comparator_.user_comparator();
   size_t ts_sz = ucmp->timestamp_size();
 
-  Slice from_slice = from_key;
-  Slice to_slice = to_key;
+  OptSlice from_opt = has_from ? from_key : OptSlice{};
+  OptSlice to_opt = has_to ? to_key : OptSlice{};
   std::string from_key_buf, to_key_buf;
-  auto [from, to] = MaybeAddTimestampsToRange(
-      has_from ? &from_slice : nullptr, has_to ? &to_slice : nullptr, ts_sz,
-      &from_key_buf, &to_key_buf);
+  auto [from, to] = MaybeAddTimestampsToRange(from_opt, to_opt, ts_sz,
+                                              &from_key_buf, &to_key_buf);
   uint64_t i = 0;
   if (from.has_value()) {
     InternalKey ikey;
diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h
index a1a857115a8b..b7d9e4003b83 100644
--- a/table/sst_file_dumper.h
+++ b/table/sst_file_dumper.h
@@ -21,7 +21,8 @@ class SstFileDumper {
                          bool verify_checksum, bool output_hex,
                          bool decode_blob_index,
                          const EnvOptions& soptions = EnvOptions(),
-                         bool silent = false);
+                         bool silent = false,
+                         bool show_sequence_number_type = false);
 
   // read_num_limit limits the total number of keys read. If read_num_limit = 0,
   // then there is no limit. If read_num_limit = 0 or
@@ -43,16 +44,14 @@ class SstFileDumper {
   Status getStatus() { return init_result_; }
 
   Status ShowAllCompressionSizes(
-      size_t block_size,
-      const std::vector<std::pair<CompressionType, const char*>>&
-          compression_types,
-      int32_t compress_level_from, int32_t compress_level_to,
-      uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
-      uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer);
-
-  Status ShowCompressionSize(size_t block_size, CompressionType compress_type,
+      const std::vector<CompressionType>& compression_types,
+      int32_t compress_level_from, int32_t compress_level_to);
+
+  Status ShowCompressionSize(CompressionType compress_type,
                              const CompressionOptions& compress_opt);
 
+  BlockContents& GetMetaIndexContents() { return meta_index_contents_; }
+
  private:
   // Get the TableReader implementation for the sst file
   Status GetTableReader(const std::string& file_path);
@@ -61,12 +60,11 @@ class SstFileDumper {
                              FilePrefetchBuffer* prefetch_buffer);
 
   Status CalculateCompressedTableSize(const TableBuilderOptions& tb_options,
-                                      size_t block_size,
-                                      uint64_t* num_data_blocks,
-                                      uint64_t* compressed_table_size);
+                                      TableProperties* props,
+                                      std::chrono::microseconds* write_time,
+                                      std::chrono::microseconds* read_time);
 
   Status SetTableOptionsByMagicNumber(uint64_t table_magic_number);
-  Status SetOldTableOptions();
 
   // Helper function to call the factory with settings specific to the
   // factory implementation
@@ -81,6 +79,7 @@ class SstFileDumper {
   Temperature file_temp_;
   bool output_hex_;
   bool decode_blob_index_;
+  bool show_sequence_number_type_;
   EnvOptions soptions_;
   // less verbose in stdout/stderr
   bool silent_;
@@ -98,6 +97,7 @@ class SstFileDumper {
   ReadOptions read_options_;
   InternalKeyComparator internal_comparator_;
   std::unique_ptr<TableProperties> table_properties_;
+  BlockContents meta_index_contents_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc
index a970666affa5..e63e67c92e1a 100644
--- a/table/sst_file_reader.cc
+++ b/table/sst_file_reader.cc
@@ -11,6 +11,7 @@
 #include "file/random_access_file_reader.h"
 #include "options/cf_options.h"
 #include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
 #include "rocksdb/file_system.h"
 #include "table/get_context.h"
 #include "table/table_builder.h"
@@ -51,6 +52,7 @@ Status SstFileReader::Open(const std::string& file_path) {
   std::unique_ptr<FSRandomAccessFile> file;
   std::unique_ptr<RandomAccessFileReader> file_reader;
   FileOptions fopts(r->soptions);
+  fopts.file_checksum_func_name = kNoFileChecksumFuncName;
   const auto& fs = r->options.env->GetFileSystem();
 
   s = fs->GetFileSize(file_path, fopts.io_options, &file_size, nullptr);
@@ -62,7 +64,8 @@ Status SstFileReader::Open(const std::string& file_path) {
   }
   if (s.ok()) {
     TableReaderOptions t_opt(
-        r->ioptions, r->moptions.prefix_extractor, r->soptions,
+        r->ioptions, r->moptions.prefix_extractor,
+        r->moptions.compression_manager.get(), r->soptions,
         r->ioptions.internal_comparator,
         r->moptions.block_protection_bytes_per_key,
         /*skip_filters*/ false, /*immortal*/ false,
@@ -166,11 +169,11 @@ Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) {
                       ? roptions.snapshot->GetSequenceNumber()
                       : kMaxSequenceNumber;
   ArenaWrappedDBIter* res = new ArenaWrappedDBIter();
-  res->Init(
-      r->options.env, roptions, r->ioptions, r->moptions, nullptr /* version */,
-      sequence, r->moptions.max_sequential_skip_in_iterations,
-      0 /* version_number */, nullptr /* read_callback */, nullptr /* cfh */,
-      true /* expose_blob_index */, false /* allow_refresh */);
+  res->Init(r->options.env, roptions, r->ioptions, r->moptions,
+            nullptr /* version */, sequence, 0 /* version_number */,
+            nullptr /* read_callback */, nullptr /* cfh */,
+            true /* expose_blob_index */, false /* allow_refresh */,
+            /*active_mem=*/nullptr);
   auto internal_iter = r->table_reader->NewIterator(
       res->GetReadOptions(), r->moptions.prefix_extractor.get(),
       res->GetArena(), false /* skip_filters */,
diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc
index 2d169d6f3bee..439ac66b1963 100644
--- a/table/sst_file_reader_test.cc
+++ b/table/sst_file_reader_test.cc
@@ -164,7 +164,7 @@ TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) {
   Options options;
   options.create_if_missing = true;
   std::string db_name = test::PerThreadDBPath("test_db");
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, db_name, &db));
   // Bump sequence number.
   ASSERT_OK(db->Put(WriteOptions(), keys[0], "foo"));
@@ -186,7 +186,7 @@ TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) {
     }
   }
   ASSERT_FALSE(ingested_file.empty());
-  delete db;
+  db.reset();
 
   // Verify the file can be open and read by SstFileReader.
   CheckFile(db_name + ingested_file, keys, true /* check_global_seqno */);
diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc
index 8d1b03380d40..cf6c32cdf7da 100644
--- a/table/sst_file_writer.cc
+++ b/table/sst_file_writer.cc
@@ -30,7 +30,7 @@ const size_t kFadviseTrigger = 1024 * 1024;  // 1MB
 struct SstFileWriter::Rep {
   Rep(const EnvOptions& _env_options, const Options& options,
       Env::IOPriority _io_priority, const Comparator* _user_comparator,
-      ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters,
+      ColumnFamilyHandle* _cfh, bool _invalidate_page_cache,
       std::string _db_session_id)
       : env_options(_env_options),
         ioptions(options),
@@ -39,7 +39,6 @@ struct SstFileWriter::Rep {
         internal_comparator(_user_comparator),
         cfh(_cfh),
         invalidate_page_cache(_invalidate_page_cache),
-        skip_filters(_skip_filters),
         db_session_id(_db_session_id),
         ts_sz(_user_comparator->timestamp_size()),
         strip_timestamp(ts_sz > 0 &&
@@ -67,7 +66,6 @@ struct SstFileWriter::Rep {
   // The size of the file during the last time we called Fadvise to remove
   // cached pages from page cache.
   uint64_t last_fadvise_size = 0;
-  bool skip_filters;
   std::string db_session_id;
   uint64_t next_file_number = 1;
   size_t ts_sz;
@@ -305,9 +303,9 @@ SstFileWriter::SstFileWriter(const EnvOptions& env_options,
                              const Comparator* user_comparator,
                              ColumnFamilyHandle* column_family,
                              bool invalidate_page_cache,
-                             Env::IOPriority io_priority, bool skip_filters)
+                             Env::IOPriority io_priority)
     : rep_(new Rep(env_options, options, io_priority, user_comparator,
-                   column_family, invalidate_page_cache, skip_filters,
+                   column_family, invalidate_page_cache,
                    DBImpl::GenerateDbSessionId(options.env))) {
   // SstFileWriter is used to create sst files that can be added to database
   // later. Therefore, no real db_id and db_session_id are associated with it.
@@ -403,9 +401,6 @@ Status SstFileWriter::Open(const std::string& file_path, Temperature temp) {
   // assign fake file numbers to each file (into table properties) and keep
   // the same session id for the life of the SstFileWriter.
   r->next_file_number++;
-  // XXX: when we can remove skip_filters from the SstFileWriter public API
-  // we can remove it from TableBuilderOptions.
-  table_builder_options.skip_filters = r->skip_filters;
   FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types;
   r->file_writer.reset(new WritableFileWriter(
       std::move(sst_file), file_path, r->env_options, r->ioptions.clock,
@@ -424,10 +419,6 @@ Status SstFileWriter::Open(const std::string& file_path, Temperature temp) {
   return s;
 }
 
-Status SstFileWriter::Add(const Slice& user_key, const Slice& value) {
-  return rep_->Add(user_key, value, ValueType::kTypeValue);
-}
-
 Status SstFileWriter::Put(const Slice& user_key, const Slice& value) {
   return rep_->Add(user_key, value, ValueType::kTypeValue);
 }
@@ -472,6 +463,7 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) {
   }
   if (r->file_info.num_entries == 0 &&
       r->file_info.num_range_del_entries == 0) {
+    r->builder->status().PermitUncheckedError();
     return Status::InvalidArgument("Cannot create sst file with no entries");
   }
 
@@ -495,7 +487,10 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) {
         r->file_writer->GetFileChecksumFuncName();
   }
   if (!s.ok()) {
-    r->ioptions.env->DeleteFile(r->file_info.file_path);
+    Status status = r->ioptions.env->DeleteFile(r->file_info.file_path);
+    // Silence ASSERT_STATUS_CHECKED warning, since DeleteFile may fail under
+    // some error injection, and we can just ignore the failure
+    status.PermitUncheckedError();
   }
 
   if (file_info != nullptr) {
diff --git a/table/table_builder.h b/table/table_builder.h
index 5ed7aba51f3d..ec9f61bbf98b 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -24,6 +24,7 @@
 #include "rocksdb/table_properties.h"
 #include "table/unique_id_impl.h"
 #include "trace_replay/block_cache_tracer.h"
+#include "util/cast_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -35,6 +36,7 @@ struct TableReaderOptions {
   TableReaderOptions(
       const ImmutableOptions& _ioptions,
       const std::shared_ptr<const SliceTransform>& _prefix_extractor,
+      UnownedPtr<CompressionManager> _compression_manager,
       const EnvOptions& _env_options,
       const InternalKeyComparator& _internal_comparator,
       uint8_t _block_protection_bytes_per_key, bool _skip_filters = false,
@@ -46,6 +48,7 @@ struct TableReaderOptions {
       uint64_t _tail_size = 0, bool _user_defined_timestamps_persisted = true)
       : ioptions(_ioptions),
         prefix_extractor(_prefix_extractor),
+        compression_manager(_compression_manager),
         env_options(_env_options),
         internal_comparator(_internal_comparator),
         skip_filters(_skip_filters),
@@ -64,6 +67,9 @@ struct TableReaderOptions {
 
   const ImmutableOptions& ioptions;
   const std::shared_ptr<const SliceTransform>& prefix_extractor;
+  // NOTE: the compression manager is not saved, just potentially a decompressor
+  // from it, so we don't need a shared_ptr copy
+  UnownedPtr<CompressionManager> compression_manager;
   const EnvOptions& env_options;
   const InternalKeyComparator& internal_comparator;
   // This is only used for BlockBasedTable (reader)
@@ -158,10 +164,6 @@ struct TableBuilderOptions : public TablePropertiesCollectorFactory::Context {
   const TableFileCreationReason reason;
   // END for FilterBuildingContext
 
-  // XXX: only used by BlockBasedTableBuilder for SstFileWriter. If you
-  // want to skip filters, that should be (for example) null filter_policy
-  // in the table options of the ioptions.table_factory
-  bool skip_filters = false;
   const uint64_t cur_file_num;
 };
 
@@ -207,6 +209,9 @@ class TableBuilder {
     return NumEntries() == 0 && GetTableProperties().num_range_deletions == 0;
   }
 
+  // Size of the file before its content is compressed.
+  virtual uint64_t PreCompressionSize() const { return 0; }
+
   // Size of the file generated so far.  If invoked after a successful
   // Finish() call, returns the size of the final generated file.
   virtual uint64_t FileSize() const = 0;
@@ -216,6 +221,11 @@ class TableBuilder {
   // is enabled.
   virtual uint64_t EstimatedFileSize() const { return FileSize(); }
 
+  // Estimated tail size of the SST file generated so far. The "tail" refers to
+  // all blocks written after data blocks (index + filter). This value helps
+  // estimate the total file size when deciding when to cut files.
+  virtual uint64_t EstimatedTailSize() const { return 0; }
+
   virtual uint64_t GetTailSize() const { return 0; }
 
   // If the user defined table properties collector suggest the file to
@@ -236,6 +246,11 @@ class TableBuilder {
   virtual void SetSeqnoTimeTableProperties(
       const SeqnoToTimeMapping& /*relevant_mapping*/,
       uint64_t /*oldest_ancestor_time*/) {}
+
+  // If this builder used CPU work from threads other than the caller, return
+  // the CPU microseconds used. 0 = no work outside calling thread, or not
+  // supported.
+  virtual uint64_t GetWorkerCPUMicros() const { return 0; }
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/table_properties.cc b/table/table_properties.cc
index 7fee67d1e928..48886c873fb7 100644
--- a/table/table_properties.cc
+++ b/table/table_properties.cc
@@ -65,6 +65,8 @@ std::string TableProperties::ToString(const std::string& prop_delim,
                  prop_delim, kv_delim);
 
   AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
+  AppendProperty(result, "data uncompressed size", uncompressed_data_size,
+                 prop_delim, kv_delim);
   char index_block_size_str[80];
   snprintf(index_block_size_str, sizeof(index_block_size_str),
            "index block size (user-key? %d, delta-value? %d)",
@@ -116,6 +118,8 @@ std::string TableProperties::ToString(const std::string& prop_delim,
                  prop_delim, kv_delim);
   AppendProperty(result, "largest sequence number in file", key_largest_seqno,
                  prop_delim, kv_delim);
+  AppendProperty(result, "smallest sequence number in file", key_smallest_seqno,
+                 prop_delim, kv_delim);
 
   AppendProperty(
       result, "merge operator name",
@@ -178,6 +182,7 @@ std::string TableProperties::ToString(const std::string& prop_delim,
 
 void TableProperties::Add(const TableProperties& tp) {
   data_size += tp.data_size;
+  uncompressed_data_size += tp.uncompressed_data_size;
   index_size += tp.index_size;
   index_partitions += tp.index_partitions;
   top_level_index_size += tp.top_level_index_size;
@@ -202,6 +207,7 @@ std::map<std::string, uint64_t>
 TableProperties::GetAggregatablePropertiesAsMap() const {
   std::map<std::string, uint64_t> rv;
   rv["data_size"] = data_size;
+  rv["uncompressed_data_size"] = uncompressed_data_size;
   rv["index_size"] = index_size;
   rv["index_partitions"] = index_partitions;
   rv["top_level_index_size"] = top_level_index_size;
@@ -320,6 +326,8 @@ const std::string TablePropertiesNames::kUserDefinedTimestampsPersisted =
     "rocksdb.user.defined.timestamps.persisted";
 const std::string TablePropertiesNames::kKeyLargestSeqno =
     "rocksdb.key.largest.seqno";
+const std::string TablePropertiesNames::kKeySmallestSeqno =
+    "rocksdb.key.smallest.seqno";
 
 static std::unordered_map<std::string, OptionTypeInfo>
     table_properties_type_info = {
@@ -330,6 +338,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {"data_size",
          {offsetof(struct TableProperties, data_size), OptionType::kUInt64T,
           OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+        {"uncompressed_data_size",
+         {offsetof(struct TableProperties, uncompressed_data_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
         {"index_size",
          {offsetof(struct TableProperties, index_size), OptionType::kUInt64T,
           OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
@@ -434,6 +446,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct TableProperties, key_largest_seqno),
           OptionType::kUInt64T, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
+        {"key_smallest_seqno",
+         {offsetof(struct TableProperties, key_smallest_seqno),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
         {"db_id",
          {offsetof(struct TableProperties, db_id), OptionType::kEncodedString}},
         {"db_session_id",
diff --git a/table/table_reader.h b/table/table_reader.h
index a9d46499bd06..4363755210fa 100644
--- a/table/table_reader.h
+++ b/table/table_reader.h
@@ -179,13 +179,15 @@ class TableReader {
   }
 
   // convert db file to a human readable form
-  virtual Status DumpTable(WritableFile* /*out_file*/) {
+  virtual Status DumpTable(WritableFile* /*out_file*/,
+                           bool /*show_sequence_number_type*/ = false) {
     return Status::NotSupported("DumpTable() not supported");
   }
 
   // check whether there is corruption in this db file
   virtual Status VerifyChecksum(const ReadOptions& /*read_options*/,
-                                TableReaderCaller /*caller*/) {
+                                TableReaderCaller /*caller*/,
+                                bool /*meta_blocks_only*/ = false) {
     return Status::NotSupported("VerifyChecksum() not supported");
   }
 
diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc
index a588f6eea07c..ce2e81ddecef 100644
--- a/table/table_reader_bench.cc
+++ b/table/table_reader_bench.cc
@@ -84,7 +84,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
   Env* env = Env::Default();
   auto* clock = env->GetSystemClock().get();
   TableBuilder* tb = nullptr;
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   Status s;
   const ImmutableOptions ioptions(opts);
   const ColumnFamilyOptions cfo(opts);
@@ -145,8 +145,9 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
     std::unique_ptr<RandomAccessFileReader> file_reader(
         new RandomAccessFileReader(std::move(raf), file_name));
     s = opts.table_factory->NewTableReader(
-        TableReaderOptions(ioptions, moptions.prefix_extractor, env_options,
-                           ikc, 0 /* block_protection_bytes_per_key */),
+        TableReaderOptions(ioptions, moptions.prefix_extractor,
+                           moptions.compression_manager.get(), env_options, ikc,
+                           0 /* block_protection_bytes_per_key */),
         std::move(file_reader), file_size, &table_reader);
     if (!s.ok()) {
       fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str());
@@ -256,8 +257,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
   if (!through_db) {
     env->DeleteFile(file_name);
   } else {
-    delete db;
-    db = nullptr;
+    db.reset();
     DestroyDB(dbname, opts);
   }
 }
diff --git a/table/table_test.cc b/table/table_test.cc
index 7441b0ff706b..e49b3ecf5b35 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -14,6 +14,7 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdio>
+#include <iomanip>
 #include <iostream>
 #include <map>
 #include <memory>
@@ -28,6 +29,7 @@
 #include "db/write_batch_internal.h"
 #include "memtable/stl_wrappers.h"
 #include "monitoring/statistics_impl.h"
+#include "options/cf_options.h"
 #include "options/options_helper.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
@@ -36,7 +38,7 @@
 #include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
-#include "rocksdb/external_table_reader.h"
+#include "rocksdb/external_table.h"
 #include "rocksdb/file_checksum.h"
 #include "rocksdb/file_system.h"
 #include "rocksdb/filter_policy.h"
@@ -50,6 +52,8 @@
 #include "rocksdb/table_properties.h"
 #include "rocksdb/trace_record.h"
 #include "rocksdb/unique_id.h"
+#include "rocksdb/user_defined_index.h"
+#include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/block_based/block.h"
 #include "table/block_based/block_based_table_builder.h"
@@ -70,8 +74,9 @@
 #include "test_util/sync_point.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
-#include "util/coding_lean.h"
+#include "util/coding.h"
 #include "util/compression.h"
+#include "util/defer.h"
 #include "util/file_checksum_helper.h"
 #include "util/random.h"
 #include "util/string_util.h"
@@ -83,6 +88,7 @@ namespace ROCKSDB_NAMESPACE {
 namespace {
 
 const std::string kDummyValue(10000, 'o');
+constexpr auto kVerbose = false;
 
 // DummyPropertiesCollector used to test BlockBasedTableProperties
 class DummyPropertiesCollector : public TablePropertiesCollector {
@@ -443,7 +449,8 @@ class TableConstructor : public Constructor {
 
     file_reader_.reset(new RandomAccessFileReader(std::move(source), "test"));
     return moptions.table_factory->NewTableReader(
-        TableReaderOptions(ioptions, moptions.prefix_extractor, soptions,
+        TableReaderOptions(ioptions, moptions.prefix_extractor,
+                           moptions.compression_manager.get(), soptions,
                            *last_internal_comparator_,
                            0 /* block_protection_bytes_per_key */,
                            /*skip_filters*/ false,
@@ -576,18 +583,16 @@ class DBConstructor : public Constructor {
  public:
   explicit DBConstructor(const Comparator* cmp)
       : Constructor(cmp), comparator_(cmp) {
-    db_ = nullptr;
     NewDB();
   }
-  ~DBConstructor() override { delete db_; }
+  ~DBConstructor() override {}
   Status FinishImpl(const Options& /*options*/,
                     const ImmutableOptions& /*ioptions*/,
                     const MutableCFOptions& /*moptions*/,
                     const BlockBasedTableOptions& /*table_options*/,
                     const InternalKeyComparator& /*internal_comparator*/,
                     const stl_wrappers::KVMap& kv_map) override {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     NewDB();
     for (const auto& kv : kv_map) {
       WriteBatch batch;
@@ -602,7 +607,7 @@ class DBConstructor : public Constructor {
     return new InternalIteratorFromIterator(db_->NewIterator(ReadOptions()));
   }
 
-  DB* db() const override { return db_; }
+  DB* db() const override { return db_.get(); }
 
  private:
   void NewDB() {
@@ -621,7 +626,7 @@ class DBConstructor : public Constructor {
   }
 
   const Comparator* comparator_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 };
 
 enum TestType {
@@ -668,35 +673,6 @@ static std::vector<TestArgs> GenerateArgList() {
   std::vector<int> restart_intervals = {16, 1, 1024};
   std::vector<uint32_t> compression_parallel_threads = {1, 4};
 
-  // Only add compression if it is supported
-  std::vector<std::pair<CompressionType, bool>> compression_types;
-  compression_types.emplace_back(kNoCompression, false);
-  if (Snappy_Supported()) {
-    compression_types.emplace_back(kSnappyCompression, false);
-  }
-  if (Zlib_Supported()) {
-    compression_types.emplace_back(kZlibCompression, false);
-    compression_types.emplace_back(kZlibCompression, true);
-  }
-  if (BZip2_Supported()) {
-    compression_types.emplace_back(kBZip2Compression, false);
-    compression_types.emplace_back(kBZip2Compression, true);
-  }
-  if (LZ4_Supported()) {
-    compression_types.emplace_back(kLZ4Compression, false);
-    compression_types.emplace_back(kLZ4Compression, true);
-    compression_types.emplace_back(kLZ4HCCompression, false);
-    compression_types.emplace_back(kLZ4HCCompression, true);
-  }
-  if (XPRESS_Supported()) {
-    compression_types.emplace_back(kXpressCompression, false);
-    compression_types.emplace_back(kXpressCompression, true);
-  }
-  if (ZSTD_Supported()) {
-    compression_types.emplace_back(kZSTD, false);
-    compression_types.emplace_back(kZSTD, true);
-  }
-
   for (auto test_type : test_types) {
     for (auto reverse_compare : reverse_compare_types) {
       if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX ||
@@ -707,9 +683,9 @@ static std::vector<TestArgs> GenerateArgList() {
         one_arg.type = test_type;
         one_arg.reverse_compare = reverse_compare;
         one_arg.restart_interval = restart_intervals[0];
-        one_arg.compression = compression_types[0].first;
+        one_arg.compression = kNoCompression;
         one_arg.compression_parallel_threads = 1;
-        one_arg.format_version = 0;
+        one_arg.format_version = 0;  // Plain tables use their own versioning
         one_arg.use_mmap = true;
         test_args.push_back(one_arg);
         one_arg.use_mmap = false;
@@ -718,17 +694,20 @@ static std::vector<TestArgs> GenerateArgList() {
       }
 
       for (auto restart_interval : restart_intervals) {
-        for (auto compression_type : compression_types) {
+        for (auto compression_type : GetSupportedCompressions()) {
           for (auto num_threads : compression_parallel_threads) {
-            TestArgs one_arg;
-            one_arg.type = test_type;
-            one_arg.reverse_compare = reverse_compare;
-            one_arg.restart_interval = restart_interval;
-            one_arg.compression = compression_type.first;
-            one_arg.compression_parallel_threads = num_threads;
-            one_arg.format_version = compression_type.second ? 2 : 1;
-            one_arg.use_mmap = false;
-            test_args.push_back(one_arg);
+            // format_version = 7 changes some compression handling
+            for (uint32_t fv : {kMinSupportedBbtFormatVersionForRead, 7U}) {
+              TestArgs one_arg;
+              one_arg.type = test_type;
+              one_arg.reverse_compare = reverse_compare;
+              one_arg.restart_interval = restart_interval;
+              one_arg.compression = compression_type;
+              one_arg.compression_parallel_threads = num_threads;
+              one_arg.format_version = fv;
+              one_arg.use_mmap = false;
+              test_args.push_back(one_arg);
+            }
           }
         }
       }
@@ -761,9 +740,6 @@ class FixedOrLessPrefixTransform : public SliceTransform {
 
   bool InDomain(const Slice& /*src*/) const override { return true; }
 
-  bool InRange(const Slice& dst) const override {
-    return (dst.size() <= prefix_len_);
-  }
   bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
 };
 
@@ -929,7 +905,6 @@ class HarnessTest : public testing::Test {
 
   void TestRandomAccess(Random* rnd, const std::vector<std::string>& keys,
                         const stl_wrappers::KVMap& data) {
-    static const bool kVerbose = false;
     InternalIterator* iter = constructor_->NewIterator();
     ASSERT_TRUE(!iter->Valid());
     stl_wrappers::KVMap::const_iterator model_iter = data.begin();
@@ -1135,15 +1110,20 @@ class TableTest : public testing::Test {
 
 class GeneralTableTest : public TableTest {};
 class BlockBasedTableTestBase : public TableTest {};
-class BlockBasedTableTest
-    : public BlockBasedTableTestBase,
-      virtual public ::testing::WithParamInterface<uint32_t> {
+class BlockBasedTableTest : public BlockBasedTableTestBase,
+                            virtual public ::testing::WithParamInterface<
+                                std::tuple<uint32_t, size_t, size_t>> {
  public:
-  BlockBasedTableTest() : format_(GetParam()) { env_ = Env::Default(); }
+  BlockBasedTableTest() : format_(std::get<0>(GetParam())) {
+    env_ = Env::Default();
+  }
 
   BlockBasedTableOptions GetBlockBasedTableOptions() {
     BlockBasedTableOptions options;
     options.format_version = format_;
+    auto param = GetParam();
+    options.super_block_alignment_size = std::get<1>(param);
+    options.super_block_alignment_space_overhead_ratio = std::get<2>(param);
     return options;
   }
 
@@ -1375,8 +1355,12 @@ class FileChecksumTestHelper {
 
 uint64_t FileChecksumTestHelper::checksum_file_num_ = 1;
 
-INSTANTIATE_TEST_CASE_P(FormatVersions, BlockBasedTableTest,
-                        testing::ValuesIn(test::kFooterFormatVersionsToTest));
+INSTANTIATE_TEST_CASE_P(
+    FormatVersions, BlockBasedTableTest,
+    testing::Combine(testing::ValuesIn(test::kFooterFormatVersionsToTest),
+                     testing::Values(0, 128 * 1024, 512 * 1024,
+                                     2 * 1024 * 1024),
+                     testing::Values(2048, 32, 128)));
 
 // This test serves as the living tutorial for the prefix scan of user collected
 // properties.
@@ -1793,18 +1777,23 @@ TEST_P(BlockBasedTableTest, IndexUncompressed) {
 #endif  // SNAPPY
 
 TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) {
-  TableConstructor c(&reverse_key_comparator);
+  TableConstructor c(&reverse_key_comparator,
+                     true /* convert_to_internal_key_ */);
   std::vector<std::string> keys;
   stl_wrappers::KVMap kvmap;
 
-  {
+  for (CompressionType ct : {kNoCompression, kSnappyCompression}) {
+    if (!Snappy_Supported() && ct == kSnappyCompression) {
+      continue;
+    }
     Options options;
-    options.compression = CompressionType::kNoCompression;
+    options.compression = ct;
     BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
     options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
     const ImmutableOptions ioptions(options);
     const MutableCFOptions moptions(options);
+    c.Add("blah", std::string(200, 'x'));  // something to compress
     c.Finish(options, ioptions, moptions, table_options,
              GetPlainInternalComparator(options.comparator), &keys, &kvmap);
 
@@ -1821,7 +1810,13 @@ TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) {
     // No filter policy is used
     ASSERT_EQ("", props.filter_policy_name);
     // Compression type == that set:
-    ASSERT_EQ("NoCompression", props.compression_name);
+    if (FormatVersionUsesCompressionManagerName(table_options.format_version)) {
+      ASSERT_EQ(ct == kNoCompression ? ";;" : "BuiltinV2;01;",
+                props.compression_name);
+    } else {
+      ASSERT_EQ(ct == kNoCompression ? "NoCompression" : "Snappy",
+                props.compression_name);
+    }
     c.ResetTableReader();
   }
 
@@ -2044,7 +2039,7 @@ TEST_P(BlockBasedTableTest, PrefetchTest) {
 
   // Simple
   PrefetchRange(&c, &opt, &table_options,
-                /*key_range=*/"k01", "k05",
+                /*key_begin=*/"k01", /*key_end=*/"k05",
                 /*keys_in_cache=*/{"k01", "k02", "k03", "k04", "k05"},
                 /*keys_not_in_cache=*/{"k06", "k07"});
   PrefetchRange(&c, &opt, &table_options, "k01", "k01", {"k01", "k02", "k03"},
@@ -2280,6 +2275,44 @@ TEST_P(BlockBasedTableTest, BadChecksumType) {
             "Corruption: Corrupt or unsupported checksum type: 123 in test");
 }
 
+TEST_P(BlockBasedTableTest, ReservedBitInDataBlockFooter) {
+  // Test that reserved metadata bits in data block footer are detected.
+  // We construct a block directly rather than going through the full table
+  // iterator path to avoid issues with iterator error handling.
+
+  // Build a simple data block
+  BlockBuilder builder(16 /* restart_interval */);
+  InternalKey key("abc", 1, kTypeValue);
+  builder.Add(key.Encode(), "test_value");
+  Slice block_contents = builder.Finish();
+  std::string block_data = block_contents.ToString();
+
+  // The footer is the last 4 bytes - corrupt it by setting reserved bit 28
+  ASSERT_GE(block_data.size(), sizeof(uint32_t));
+  size_t footer_offset = block_data.size() - sizeof(uint32_t);
+  uint32_t footer = DecodeFixed32(block_data.data() + footer_offset);
+  footer |= (1u << 28);  // Set lowest reserved bit
+  EncodeFixed32(&block_data[footer_offset], footer);
+
+  // Try to construct a Block from the corrupted data
+  BlockContents contents(std::move(block_data));
+  Block block(std::move(contents), 0 /* read_amp_bytes_per_bit */);
+
+  // Block should have size() == 0 indicating error
+  ASSERT_EQ(block.size(), 0u);
+
+  // Try to get an iterator - it should be invalid with corruption status
+  DataBlockIter iter;
+  block.NewDataIterator(BytewiseComparator(), kMaxSequenceNumber, &iter,
+                        /*stats=*/nullptr, /*block_contents_pinned=*/false);
+  ASSERT_FALSE(iter.Valid());
+  ASSERT_EQ(iter.status().code(), Status::kCorruption)
+      << iter.status().ToString();
+  ASSERT_NE(iter.status().ToString().find("reserved bits set"),
+            std::string::npos)
+      << iter.status().ToString();
+}
+
 class BuiltinChecksumTest : public testing::Test,
                             public testing::WithParamInterface<ChecksumType> {};
 
@@ -2651,9 +2684,18 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) {
   c.ResetTableReader();
 }
 
-TEST_P(BlockBasedTableTest, BinaryIndexTest) {
+TEST_P(BlockBasedTableTest, BinaryIndexTestBinarySearch) {
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+  table_options.index_block_search_type = BlockBasedTableOptions::kBinary;
+  IndexTest(table_options);
+}
+
+TEST_P(BlockBasedTableTest, BinaryIndexTestInterpolationSearch) {
   BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
   table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+  table_options.index_block_search_type =
+      BlockBasedTableOptions::kInterpolation;
   IndexTest(table_options);
 }
 
@@ -4701,8 +4743,8 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) {
   // an arbitrary slice between k04 and k05, either before or after k04a
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 10000, 211000));
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 512000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 512000));
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000));
   c.ResetTableReader();
 }
@@ -4728,13 +4770,18 @@ static void DoCompressionTest(CompressionType comp) {
   const ImmutableOptions ioptions(options);
   const MutableCFOptions moptions(options);
   c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap);
+  size_t file_size = c.TEST_GetSink()->contents().size();
+  EXPECT_EQ(c.ApproximateOffsetOf("abc"), 0);
+  EXPECT_EQ(c.ApproximateOffsetOf("k01"), 0);
+  EXPECT_EQ(c.ApproximateOffsetOf("k02"), 0);
+  EXPECT_NEAR2(c.ApproximateOffsetOf("k03"), file_size / 2, file_size / 10);
+  EXPECT_NEAR2(c.ApproximateOffsetOf("k04"), file_size / 2, file_size / 10);
+  EXPECT_NEAR2(c.ApproximateOffsetOf("xyz"), file_size, file_size / 10);
+
+  size_t data_blocks_size = c.GetTableReader()->GetTableProperties()->data_size;
+  // Near expected compressed size ~= (0.25 + 0.25) * 10000
+  EXPECT_NEAR2(data_blocks_size, 5000, 1500);
 
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3555));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3555));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7110));
   c.ResetTableReader();
 }
 
@@ -4972,30 +5019,11 @@ TEST(TableTest, FooterTests) {
   BlockHandle meta_index(data_size + index_size + 2 * 5, metaindex_size);
   uint64_t footer_offset = data_size + metaindex_size + index_size + 3 * 5;
   uint32_t base_context_checksum = 123456789;
-  {
-    // legacy block based
-    FooterBuilder footer;
-    ASSERT_OK(footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0,
-                           footer_offset, kCRC32c, meta_index, index));
-    Footer decoded_footer;
-    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
-    ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
-    ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
-    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
-    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
-    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
-    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
-    ASSERT_EQ(decoded_footer.format_version(), 0U);
-    ASSERT_EQ(decoded_footer.base_context_checksum(), 0U);
-    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U);
-    // Ensure serialized with legacy magic
-    ASSERT_EQ(
-        DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8),
-        kLegacyBlockBasedTableMagicNumber);
-  }
-  // block based, various checksums, various versions
+  // block based, various checksums, various versions (format_version >= 2)
   for (auto t : GetSupportedChecksums()) {
-    for (uint32_t fv = 1; IsSupportedFormatVersion(fv); ++fv) {
+    for (uint32_t fv = kMinSupportedBbtFormatVersionForWrite;
+         IsSupportedFormatVersionForWrite(kBlockBasedTableMagicNumber, fv);
+         ++fv) {
       uint32_t maybe_bcc =
           FormatVersionUsesContextChecksum(fv) ? base_context_checksum : 0U;
       FooterBuilder footer;
@@ -5042,41 +5070,154 @@ TEST(TableTest, FooterTests) {
     }
   }
 
+  // plain table, various checksums, various versions (format_version >= 2)
+  // Plain tables have no block trailer (size 0), so set up separate handles
+  // Note: format_version >= 6 has complex footer checksum requirements,
+  // so we only test format_version 2-5 for plain tables here
   {
-    // legacy plain table
-    FooterBuilder footer;
-    ASSERT_OK(footer.Build(kPlainTableMagicNumber, /* format_version */ 0,
-                           footer_offset, kNoChecksum, meta_index));
+    uint64_t plain_metaindex_size = r->Uniform(1000000);
+    // For plain tables: metaindex is at offset 0, footer immediately follows
+    BlockHandle plain_meta_index(0, plain_metaindex_size);
+    uint64_t plain_footer_offset = plain_metaindex_size;
+    for (auto t : GetSupportedChecksums()) {
+      for (uint32_t fv = kMinSupportedBbtFormatVersionForWrite; fv < 6; ++fv) {
+        FooterBuilder footer;
+        ASSERT_OK(footer.Build(kPlainTableMagicNumber, fv, plain_footer_offset,
+                               t, plain_meta_index));
+        Footer decoded_footer;
+        ASSERT_OK(
+            decoded_footer.DecodeFrom(footer.GetSlice(), plain_footer_offset));
+        ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
+        ASSERT_EQ(decoded_footer.checksum_type(), t);
+        ASSERT_EQ(decoded_footer.metaindex_handle().offset(),
+                  plain_meta_index.offset());
+        ASSERT_EQ(decoded_footer.metaindex_handle().size(),
+                  plain_meta_index.size());
+        ASSERT_EQ(decoded_footer.format_version(), fv);
+        ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
+      }
+    }
+  }
+}
+
+// Test that legacy SST formats (format_version < 2) are properly rejected
+TEST(TableTest, LegacyFormatRejectionTests) {
+  // Temporarily disable unsupported format version allowance for this test
+  bool& allow = TEST_AllowUnsupportedFormatVersion();
+  SaveAndRestore<bool> saved_allow(&allow, false);
+
+  // Test legacy block-based magic number from LevelDB should be rejected
+  {
+    // Construct a fake footer with legacy block-based magic number
+    std::array<char, Footer::kVersion0EncodedLength> fake_footer;
+    std::fill(fake_footer.begin(), fake_footer.end(), 0);
+    // Put legacy magic number at the end
+    EncodeFixed64(fake_footer.data() + fake_footer.size() - 8,
+                  0xdb4775248b80fb57ull /*legacy magic number*/);
+
     Footer decoded_footer;
-    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
-    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
-    ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
-    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
-    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
-    ASSERT_EQ(decoded_footer.index_handle().offset(), 0U);
-    ASSERT_EQ(decoded_footer.index_handle().size(), 0U);
-    ASSERT_EQ(decoded_footer.format_version(), 0U);
-    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
-    // Ensure serialized with legacy magic
-    ASSERT_EQ(
-        DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8),
-        kLegacyPlainTableMagicNumber);
+    Status s = decoded_footer.DecodeFrom(
+        Slice(fake_footer.data(), fake_footer.size()), 0);
+    ASSERT_TRUE(s.IsNotSupported()) << s.ToString();
+    ASSERT_TRUE(s.ToString().find("nsupported legacy magic number") !=
+                std::string::npos)
+        << s.ToString();
+    ASSERT_TRUE(s.ToString().find("full compaction") != std::string::npos)
+        << s.ToString();
+  }
+
+  // Test format_version=1 with new magic number should be rejected
+  {
+    std::array<char, Footer::kNewVersionsEncodedLength> fake_footer;
+    std::fill(fake_footer.begin(), fake_footer.end(), 0);
+    // Part 1: checksum type
+    fake_footer[0] = kCRC32c;
+    // Part 3: format_version=1 and new magic number
+    char* part3 = fake_footer.data() + fake_footer.size() - 12;
+    EncodeFixed32(part3, 1);  // format_version = 1
+    EncodeFixed64(part3 + 4, kBlockBasedTableMagicNumber);
+
+    Footer decoded_footer;
+    Status s = decoded_footer.DecodeFrom(
+        Slice(fake_footer.data(), fake_footer.size()), 0);
+    // format_version=1 is not supported for read, should return Corruption
+    ASSERT_TRUE(s.IsCorruption()) << s.ToString();
+    ASSERT_TRUE(s.ToString().find("format_version") != std::string::npos)
+        << s.ToString();
   }
+
+  // Test format_version=0 with new magic number should be rejected
   {
-    // xxhash plain table (not currently used)
-    FooterBuilder footer;
-    ASSERT_OK(footer.Build(kPlainTableMagicNumber, /* format_version */ 1,
-                           footer_offset, kxxHash, meta_index));
+    std::array<char, Footer::kNewVersionsEncodedLength> fake_footer;
+    std::fill(fake_footer.begin(), fake_footer.end(), 0);
+    // Part 1: checksum type
+    fake_footer[0] = kCRC32c;
+    // Part 3: format_version=0 and new magic number
+    char* part3 = fake_footer.data() + fake_footer.size() - 12;
+    EncodeFixed32(part3, 0);  // format_version = 0
+    EncodeFixed64(part3 + 4, kBlockBasedTableMagicNumber);
+
     Footer decoded_footer;
-    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
-    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
-    ASSERT_EQ(decoded_footer.checksum_type(), kxxHash);
-    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
-    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
-    ASSERT_EQ(decoded_footer.index_handle().offset(), 0U);
-    ASSERT_EQ(decoded_footer.index_handle().size(), 0U);
-    ASSERT_EQ(decoded_footer.format_version(), 1U);
-    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
+    Status s = decoded_footer.DecodeFrom(
+        Slice(fake_footer.data(), fake_footer.size()), 0);
+    // format_version=0 is not supported for read, should return Corruption
+    ASSERT_TRUE(s.IsCorruption()) << s.ToString();
+    ASSERT_TRUE(s.ToString().find("format_version") != std::string::npos)
+        << s.ToString();
+  }
+}
+
+// Test that configuring unsupported format_version for writing is sanitized
+// or rejected as appropriate
+TEST(TableTest, UnsupportedFormatVersionConfigTest) {
+  // Temporarily disable unsupported format version allowance for this test
+  bool& allow = TEST_AllowUnsupportedFormatVersion();
+  SaveAndRestore<bool> saved_allow(&allow, false);
+
+  // Test that format_version < kMinSupportedBbtFormatVersionForWrite is
+  // sanitized to kMinSupportedBbtFormatVersionForWrite during initialization
+  for (uint32_t fv = 0; fv < kMinSupportedBbtFormatVersionForWrite; ++fv) {
+    BlockBasedTableOptions table_options;
+    table_options.format_version = fv;
+    BlockBasedTableFactory factory(table_options);
+
+    // After construction, format_version should be sanitized
+    auto* opts = factory.GetOptions<BlockBasedTableOptions>();
+    ASSERT_EQ(opts->format_version, kMinSupportedBbtFormatVersionForWrite)
+        << "format_version=" << fv << " should be sanitized to "
+        << kMinSupportedBbtFormatVersionForWrite;
+  }
+
+  // Test that supported format versions are not changed
+  for (uint32_t fv = kMinSupportedBbtFormatVersionForWrite;
+       IsSupportedFormatVersionForWrite(kBlockBasedTableMagicNumber, fv);
+       ++fv) {
+    BlockBasedTableOptions table_options;
+    table_options.format_version = fv;
+    BlockBasedTableFactory factory(table_options);
+
+    auto* opts = factory.GetOptions<BlockBasedTableOptions>();
+    ASSERT_EQ(opts->format_version, fv)
+        << "format_version=" << fv << " should not be changed";
+
+    ColumnFamilyOptions cf_opts;
+    DBOptions db_opts;
+    Status s = factory.ValidateOptions(db_opts, cf_opts);
+    ASSERT_OK(s) << "format_version=" << fv << ": " << s.ToString();
+  }
+
+  // Test that format_version > kLatestBbtFormatVersion is rejected by
+  // ValidateOptions (not sanitized, since it could be a future version that
+  // requires newer code)
+  {
+    BlockBasedTableOptions table_options;
+    table_options.format_version = kLatestBbtFormatVersion + 1;
+    BlockBasedTableFactory factory(table_options);
+
+    ColumnFamilyOptions cf_opts;
+    DBOptions db_opts;
+    Status s = factory.ValidateOptions(db_opts, cf_opts);
+    ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString();
   }
 }
 
@@ -5181,10 +5322,6 @@ class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform {
     return IsValid(src);
   }
 
-  bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override {
-    return true;
-  }
-
   bool IsValid(const ROCKSDB_NAMESPACE::Slice& src) const {
     if (src.size() != 4) {
       return false;
@@ -5222,7 +5359,7 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) {
   const std::string kDBPath = test::PerThreadDBPath("table_prefix_test");
   options.table_factory.reset(NewBlockBasedTableFactory(bbto));
   ASSERT_OK(DestroyDB(kDBPath, options));
-  ROCKSDB_NAMESPACE::DB* db;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
 
   // Create a bunch of keys with 10 filters.
@@ -5236,7 +5373,7 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) {
 
   // Trigger compaction.
   ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  delete db;
+  db.reset();
   // In the second round, turn whole_key_filtering off and expect
   // rocksdb still works.
 }
@@ -5326,7 +5463,8 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) {
         new RandomAccessFileReader(std::move(source), ""));
 
     options.table_factory->NewTableReader(
-        TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(),
+        TableReaderOptions(ioptions, moptions.prefix_extractor,
+                           moptions.compression_manager.get(), EnvOptions(),
                            ikc, 0 /* block_protection_bytes_per_key */),
         std::move(file_reader), ss_rw.contents().size(), &table_reader);
 
@@ -5501,7 +5639,8 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) {
   const MutableCFOptions moptions2(options2);
 
   ASSERT_OK(moptions.table_factory->NewTableReader(
-      TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(),
+      TableReaderOptions(ioptions2, moptions2.prefix_extractor,
+                         moptions2.compression_manager.get(), EnvOptions(),
                          GetPlainInternalComparator(options2.comparator),
                          0 /* block_protection_bytes_per_key */),
       std::move(file_reader), sink->contents().size(), &table_reader));
@@ -5540,7 +5679,7 @@ TEST_P(BlockBasedTableTest, FixBlockAlignMismatchedFileChecksums) {
   const std::string kDBPath =
       test::PerThreadDBPath("block_align_padded_bytes_verify_file_checksums");
   ASSERT_OK(DestroyDB(kDBPath, options));
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, kDBPath, &db));
   ASSERT_OK(db->Put(WriteOptions(), "k1", "v1"));
   ASSERT_OK(db->Flush(FlushOptions()));
@@ -5548,7 +5687,7 @@ TEST_P(BlockBasedTableTest, FixBlockAlignMismatchedFileChecksums) {
   // aligning blocks are used to generate the checksum to compare against the
   // one not generated by padded bytes
   ASSERT_OK(db->VerifyFileChecksums(ReadOptions()));
-  delete db;
+  db.reset();
 }
 
 class NoBufferAlignmenttWritableFile : public FSWritableFileOwnerWrapper {
@@ -5603,7 +5742,7 @@ TEST_P(BlockBasedTableTest,
   const std::string kDBPath = test::PerThreadDBPath(
       "block_align_flush_during_flush_verify_file_checksums");
   ASSERT_OK(DestroyDB(kDBPath, options));
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, kDBPath, &db));
 
   ASSERT_OK(db->Put(WriteOptions(), "k1", "k2"));
@@ -5612,7 +5751,7 @@ TEST_P(BlockBasedTableTest,
   // Before the fix, VerifyFileChecksums() will fail as incorrect padded bytes
   // were used to generate checksum upon file creation
   ASSERT_OK(db->VerifyFileChecksums(ReadOptions()));
-  delete db;
+  db.reset();
 }
 
 TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
@@ -5675,11 +5814,12 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
       read_options_for_helper.verify_checksums = false;
       PersistentCacheOptions cache_options;
 
-      BlockFetcher block_fetcher(
-          file, nullptr /* prefetch_buffer */, footer, read_options_for_helper,
-          handle, contents, ioptions, false /* decompress */,
-          false /*maybe_compressed*/, block_type,
-          UncompressionDict::GetEmptyDict(), cache_options);
+      auto mgr = GetBuiltinV2CompressionManager();
+      BlockFetcher block_fetcher(file, nullptr /* prefetch_buffer */, footer,
+                                 read_options_for_helper, handle, contents,
+                                 ioptions, false /* decompress */,
+                                 false /*maybe_compressed*/, block_type,
+                                 mgr->GetDecompressor().get(), cache_options);
 
       ASSERT_OK(block_fetcher.ReadBlockContents());
     };
@@ -5812,12 +5952,12 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) {
   auto metaindex_handle = footer.metaindex_handle();
   BlockContents metaindex_contents;
   PersistentCacheOptions pcache_opts;
+  auto mgr = GetBuiltinV2CompressionManager();
   BlockFetcher block_fetcher(
       table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
       metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
       false /*maybe_compressed*/, BlockType::kMetaIndex,
-      UncompressionDict::GetEmptyDict(), pcache_opts,
-      nullptr /*memory_allocator*/);
+      mgr->GetDecompressor().get(), pcache_opts, nullptr /*memory_allocator*/);
   ASSERT_OK(block_fetcher.ReadBlockContents());
   Block metaindex_block(std::move(metaindex_contents));
 
@@ -5894,12 +6034,12 @@ TEST_P(BlockBasedTableTest, SeekMetaBlocks) {
   auto metaindex_handle = footer.metaindex_handle();
   BlockContents metaindex_contents;
   PersistentCacheOptions pcache_opts;
+  auto mgr = GetBuiltinV2CompressionManager();
   BlockFetcher block_fetcher(
       table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
       metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
       false /*maybe_compressed*/, BlockType::kMetaIndex,
-      UncompressionDict::GetEmptyDict(), pcache_opts,
-      nullptr /*memory_allocator*/);
+      mgr->GetDecompressor().get(), pcache_opts, nullptr /*memory_allocator*/);
   ASSERT_OK(block_fetcher.ReadBlockContents());
   Block metaindex_block(std::move(metaindex_contents));
 
@@ -5944,27 +6084,25 @@ TEST_P(BlockBasedTableTest, BadOptions) {
   options.table_factory.reset(NewBlockBasedTableFactory(bbto));
   ASSERT_OK(DestroyDB(kDBPath, options));
 
-  std::unique_ptr<DB> db;
   {
-    ROCKSDB_NAMESPACE::DB* _db;
-    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
+    std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
+    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
 
     bbto.block_size = 4096;
     options.compression = kSnappyCompression;
     options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
+    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
 
     options.compression = kNoCompression;
     options.bottommost_compression = kSnappyCompression;
-    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
+    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
 
     options.bottommost_compression = kNoCompression;
     options.compression_per_level.emplace_back(kSnappyCompression);
-    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
+    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
 
     options.compression_per_level.clear();
-    ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
-    db.reset(_db);
+    ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
   }
 }
 
@@ -6204,6 +6342,12 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) {
 class ChargeCompressionDictionaryBuildingBufferTest
     : public BlockBasedTableTestBase {};
 TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) {
+  if (GetSupportedDictCompressions().empty()) {
+    ROCKSDB_GTEST_SKIP("No supported dict compression");
+    return;
+  }
+  const auto kCompression = GetSupportedDictCompressions()[0];
+
   constexpr std::size_t kSizeDummyEntry = 256 * 1024;
   constexpr std::size_t kMetaDataChargeOverhead = 10000;
   constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
@@ -6227,7 +6371,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) {
         {CacheEntryRole::kCompressionDictionaryBuildingBuffer,
          {/*.charged = */ charge_compression_dictionary_building_buffer}});
     Options options;
-    options.compression = kSnappyCompression;
+    options.compression = kCompression;
     options.compression_opts.max_dict_bytes = kMaxDictBytes;
     options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
     options.table_factory.reset(NewBlockBasedTableFactory(table_options));
@@ -6248,7 +6392,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) {
         options.table_factory->NewTableBuilder(
             TableBuilderOptions(ioptions, moptions, read_options, write_options,
                                 ikc, &internal_tbl_prop_coll_factories,
-                                kSnappyCompression, options.compression_opts,
+                                kCompression, options.compression_opts,
                                 kUnknownColumnFamily, "test_cf", -1 /* level */,
                                 kUnknownNewestKeyTime),
             file_writer.get()));
@@ -6287,6 +6431,12 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) {
 
 TEST_F(ChargeCompressionDictionaryBuildingBufferTest,
        BasicWithBufferLimitExceed) {
+  if (GetSupportedDictCompressions().empty()) {
+    ROCKSDB_GTEST_SKIP("No supported dict compression");
+    return;
+  }
+  const auto kCompression = GetSupportedDictCompressions()[0];
+
   constexpr std::size_t kSizeDummyEntry = 256 * 1024;
   constexpr std::size_t kMetaDataChargeOverhead = 10000;
   constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
@@ -6306,7 +6456,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest,
       std::make_shared<FlushBlockEveryKeyPolicyFactory>();
 
   Options options;
-  options.compression = kSnappyCompression;
+  options.compression = kCompression;
   options.compression_opts.max_dict_bytes = kMaxDictBytes;
   options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
@@ -6325,7 +6475,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest,
   const WriteOptions write_options;
   std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
       TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
-                          &internal_tbl_prop_coll_factories, kSnappyCompression,
+                          &internal_tbl_prop_coll_factories, kCompression,
                           options.compression_opts, kUnknownColumnFamily,
                           "test_cf", -1 /* level */, kUnknownNewestKeyTime),
       file_writer.get()));
@@ -6368,6 +6518,12 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest,
 }
 
 TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) {
+  if (GetSupportedDictCompressions().empty()) {
+    ROCKSDB_GTEST_SKIP("No supported dict compression");
+    return;
+  }
+  const auto kCompression = GetSupportedDictCompressions()[0];
+
   constexpr std::size_t kSizeDummyEntry = 256 * 1024;
   constexpr std::size_t kMetaDataChargeOverhead = 10000;
   // A small kCacheCapacity is chosen so that increase cache charging for
@@ -6393,7 +6549,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) {
       std::make_shared<FlushBlockEveryKeyPolicyFactory>();
 
   Options options;
-  options.compression = kSnappyCompression;
+  options.compression = kCompression;
   options.compression_opts.max_dict_bytes = kMaxDictBytes;
   options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
@@ -6412,7 +6568,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) {
   const WriteOptions write_options;
   std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
       TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
-                          &internal_tbl_prop_coll_factories, kSnappyCompression,
+                          &internal_tbl_prop_coll_factories, kCompression,
                           options.compression_opts, kUnknownColumnFamily,
                           "test_cf", -1 /* level */, kUnknownNewestKeyTime),
       file_writer.get()));
@@ -6525,35 +6681,197 @@ TEST_F(CacheUsageOptionsOverridesTest, SanitizeAndValidateOptions) {
   Destroy(options);
 }
 
-class ExternalTableReaderTest : public DBTestBase {
+class ExternalTableTest : public DBTestBase {
  public:
-  ExternalTableReaderTest()
-      : DBTestBase("external_table_reader_test", /*env_do_fsync=*/false) {}
+  ExternalTableTest()
+      : DBTestBase("external_table_test", /*env_do_fsync=*/false) {}
 
  protected:
-  class DummyExternalTableIterator : public Iterator {
+  class DummyExternalTableFile {
+   public:
+    explicit DummyExternalTableFile(const std::string& file_path,
+                                    FSWritableFile* file)
+        : file_path_(file_path), file_(file), file_size_(0) {
+      props_.comparator_name = BytewiseComparator()->Name();
+    }
+
+    Status Serialize(
+        const std::vector<std::pair<std::string, std::string>>& kv_vec) {
+      // First append the property block if one exists
+      uint32_t prop_block_size = static_cast<uint32_t>(prop_block_.length());
+      buf_.append(static_cast<char*>(static_cast<void*>(&prop_block_size)),
+                  sizeof(prop_block_size));
+      if (!prop_block_.empty()) {
+        buf_.append(prop_block_);
+      }
+      for (auto& kv : kv_vec) {
+        SerializeOne(kv.first, kv.second);
+        props_.raw_key_size += kv.first.length();
+        props_.raw_value_size += kv.second.length();
+      }
+      props_.num_entries = kv_vec.size();
+      file_size_ = buf_.length();
+      if (file_) {
+        return file_->Append(buf_, IOOptions(), /*dbg=*/nullptr);
+      } else {
+        return WriteStringToFile(Env::Default(), buf_, file_path_);
+      }
+    }
+
+    Status Deserialize(std::map<std::string, std::string>& kv_map) {
+      Status s = ReadFileToString(Env::Default(), file_path_, &buf_);
+      if (!s.ok()) {
+        return s;
+      }
+
+      uint32_t prop_block_size = 0;
+      buf_.copy(static_cast<char*>(static_cast<void*>(&prop_block_size)),
+                sizeof(prop_block_size));
+      buf_.erase(0, sizeof(prop_block_size));
+      prop_block_.assign(buf_.substr(0, prop_block_size));
+      buf_.erase(0, prop_block_size);
+      while (buf_.length() > 0) {
+        std::pair<std::string, std::string> kv;
+        s = DeserializeOne(kv);
+        if (!s.ok()) {
+          break;
+        }
+        size_t key_size = kv.first.length();
+        size_t value_size = kv.second.length();
+        kv_map.emplace(std::move(kv));
+        props_.raw_key_size += key_size;
+        props_.raw_value_size += value_size;
+      }
+      props_.num_entries = kv_map.size();
+      return s;
+    }
+
+    Status PutPropertiesBlock(const Slice& prop_block) {
+      prop_block_.assign(prop_block.data(), prop_block.size());
+      return Status::OK();
+    }
+
+    Status GetPropertiesBlock(std::unique_ptr<char[]>* block, uint64_t* size,
+                              uint64_t* file_offset) {
+      if (!prop_block_.empty()) {
+        *block = std::make_unique<char[]>(prop_block_.length());
+        memcpy(block->get(), prop_block_.data(), prop_block_.length());
+        *size = prop_block_.length();
+        *file_offset = sizeof(uint32_t);
+      } else {
+        *size = 0;
+      }
+      return Status::OK();
+    }
+
+    TableProperties GetTableProperties() const { return props_; }
+
+    uint64_t FileSize() const { return file_size_; }
+
+   private:
+    struct ItemHeader {
+      uint32_t key_size;
+      uint32_t value_size;
+    };
+
+    void SerializeOne(const Slice& key, const Slice& value) {
+      ItemHeader hdr;
+      hdr.key_size = static_cast<uint32_t>(key.size());
+      hdr.value_size = static_cast<uint32_t>(value.size());
+      buf_.append(static_cast<char*>(static_cast<void*>(&hdr)), sizeof(hdr));
+      buf_.append(key.data(), key.size());
+      buf_.append(value.data(), value.size());
+    }
+
+    Status DeserializeOne(std::pair<std::string, std::string>& kv) {
+      ItemHeader hdr;
+      size_t copied =
+          buf_.copy(static_cast<char*>(static_cast<void*>(&hdr)), sizeof(hdr));
+      if (copied < sizeof(hdr)) {
+        return Status::Corruption();
+      }
+      buf_.erase(0, sizeof(hdr));
+      if (buf_.length() < hdr.key_size + hdr.value_size) {
+        return Status::Corruption();
+      }
+      kv.first.assign(std::string_view(buf_.data(), hdr.key_size));
+      buf_.erase(0, hdr.key_size);
+      kv.second.assign(std::string_view(buf_.data(), hdr.value_size));
+      buf_.erase(0, hdr.value_size);
+      return Status::OK();
+    }
+
+    std::string file_path_;
+    FSWritableFile* file_;
+    std::string buf_;
+    TableProperties props_;
+    uint64_t file_size_;
+    std::string prop_block_;
+  };
+
+  class DummyExternalTableIterator : public ExternalTableIterator {
    public:
-    explicit DummyExternalTableIterator(bool empty) : empty_(empty) {}
+    explicit DummyExternalTableIterator(
+        const ReadOptions& /*ro*/,
+        const std::map<std::string, std::string>& kv_map)
+        : scan_options_(nullptr),
+          num_opts_(0),
+          scan_idx_(0),
+          kv_map_(kv_map),
+          valid_(false) {
+      TEST_SYNC_POINT_CALLBACK("DummyExternalTableIterator::Constructor",
+                               &status_);
+    }
 
-    bool Valid() const override { return empty_ ? !empty_ : valid_; }
+    bool Valid() const override { return valid_; }
 
     void SeekToFirst() override {
-      valid_ = true;
-      status_ = Status::OK();
+      if (scan_options_) {
+        status_ = Status::InvalidArgument();
+      } else {
+        iter_ = kv_map_.begin();
+        valid_ = iter_ != kv_map_.end();
+        status_ = Status::OK();
+      }
     }
 
     void SeekToLast() override {
-      valid_ = true;
-      status_ = Status::OK();
+      if (scan_options_) {
+        status_ = Status::InvalidArgument();
+      } else {
+        if (!kv_map_.empty()) {
+          iter_ = kv_map_.begin();
+          for (uint64_t i = 0; i < kv_map_.size() - 1; ++i) {
+            iter_++;
+          }
+          valid_ = true;
+        } else {
+          valid_ = false;
+        }
+        status_ = Status::OK();
+      }
     }
 
     void Seek(const Slice& target) override {
-      if (target.compare(key_str) <= 0) {
-        valid_ = true;
-      } else {
-        valid_ = false;
+      if (status_.ok()) {
+        iter_ = kv_map_.find(target.ToString());
+        valid_ = iter_ != kv_map_.end();
+        eof_ = iter_ == kv_map_.end();
+      }
+      if (scan_options_) {
+        if (scan_idx_ >= num_opts_ ||
+            target != scan_options_[scan_idx_].range.start.value().ToString()) {
+          status_ = Status::InvalidArgument();
+        } else {
+          if (valid_ && scan_options_[scan_idx_].range.limit.has_value() &&
+              iter_->first.compare(
+                  scan_options_[scan_idx_].range.limit.value().ToString()) >=
+                  0) {
+            valid_ = false;
+          }
+          scan_idx_++;
+        }
       }
-      status_ = Status::OK();
     }
 
     void SeekForPrev(const Slice& /*target*/) override {
@@ -6562,8 +6880,38 @@ class ExternalTableReaderTest : public DBTestBase {
     }
 
     void Next() override {
-      valid_ = false;
-      // status_ is still ok. valid_ indicates end of scan
+      iter_++;
+      valid_ = iter_ != kv_map_.end();
+      eof_ = iter_ == kv_map_.end();
+      if (valid_ && scan_options_ &&
+          scan_options_[scan_idx_ - 1].range.limit.has_value() &&
+          iter_->first.compare(
+              scan_options_[scan_idx_ - 1].range.limit.value().ToString()) >=
+              0) {
+        valid_ = false;
+      }
+      // status_ is still ok. !valid_ indicates end of scan
+    }
+
+    bool NextAndGetResult(IterateResult* result) override {
+      Next();
+      if (valid_) {
+        result->key = key();
+        result->bound_check_result = IterBoundCheck::kInbound;
+        result->value_prepared = true;
+      } else {
+        result->key = Slice();
+        result->bound_check_result =
+            eof_ ? IterBoundCheck::kUnknown : IterBoundCheck::kOutOfBound;
+        result->value_prepared = false;
+      }
+      return valid_;
+    }
+
+    bool PrepareValue() override { return valid_ ? true : false; }
+
+    IterBoundCheck UpperBoundCheckResult() override {
+      return eof_ ? IterBoundCheck::kUnknown : IterBoundCheck::kOutOfBound;
     }
 
     void Prev() override {
@@ -6573,7 +6921,7 @@ class ExternalTableReaderTest : public DBTestBase {
 
     Slice key() const override {
       // If valid_ is false or status_ is non-ok, behavior is indeterminate
-      return Slice(key_str);
+      return Slice(iter_->first);
     }
 
     Status status() const override {
@@ -6583,31 +6931,47 @@ class ExternalTableReaderTest : public DBTestBase {
 
     Slice value() const override {
       // If valid_ is false or status_ is non-ok, behavior is indeterminate
-      return Slice(value_str);
+      return Slice(iter_->second);
     }
 
-   private:
-    static const std::string key_str;
-    static const std::string value_str;
+    void Prepare(const ScanOptions scan_opts[], size_t num_opts) override {
+      scan_options_ = scan_opts;
+      num_opts_ = num_opts;
+    }
 
+   private:
+    const ScanOptions* scan_options_;
+    size_t num_opts_;
+    size_t scan_idx_;
+    std::map<std::string, std::string> kv_map_;
     bool valid_ = false;
-    bool empty_;
+    bool eof_ = false;
     Status status_ = Status::OK();
+    std::map<std::string, std::string>::iterator iter_;
   };
 
   class DummyExternalTableReader : public ExternalTableReader {
    public:
-    Iterator* NewIterator(const ReadOptions& read_options,
-                          const SliceTransform* /*prefix_extractor*/) override {
-      return new DummyExternalTableIterator((read_options.weight == 0) ? true
-                                                                       : false);
+    explicit DummyExternalTableReader(const std::string& file_path,
+                                      bool support_property_block)
+        : file_(file_path, /*file=*/nullptr),
+          support_property_block_(support_property_block) {
+      Status s = file_.Deserialize(kv_map_);
+      EXPECT_OK(s);
+    }
+
+    ExternalTableIterator* NewIterator(
+        const ReadOptions& read_options,
+        const SliceTransform* /*prefix_extractor*/) override {
+      return new DummyExternalTableIterator(read_options, kv_map_);
     }
 
     Status Get(const ReadOptions& /*read_options*/, const Slice& key,
                const SliceTransform* /*prefix_extractor*/,
                std::string* value) override {
-      if (!key.compare("foo")) {
-        value->assign("bar");
+      auto iter = kv_map_.find(key.ToString());
+      if (iter != kv_map_.end()) {
+        value->assign(iter->second);
         return Status::OK();
       }
       return Status::NotFound();
@@ -6626,6 +6990,14 @@ class ExternalTableReaderTest : public DBTestBase {
       }
     }
 
+    Status GetPropertiesBlock(std::unique_ptr<char[]>* block, uint64_t* size,
+                              uint64_t* file_offset) override {
+      if (!support_property_block_) {
+        return Status::NotSupported();
+      }
+      return file_.GetPropertiesBlock(block, size, file_offset);
+    }
+
     std::shared_ptr<const TableProperties> GetTableProperties() const override {
       std::shared_ptr<TableProperties> props =
           std::make_shared<TableProperties>();
@@ -6635,39 +7007,115 @@ class ExternalTableReaderTest : public DBTestBase {
       props->raw_value_size = 3;
       return props;
     }
+
+   private:
+    std::map<std::string, std::string> kv_map_;
+    DummyExternalTableFile file_;
+    bool support_property_block_;
+  };
+
+  class DummyExternalTableBuilder : public ExternalTableBuilder {
+   public:
+    explicit DummyExternalTableBuilder(const std::string& file_path,
+                                       FSWritableFile* file,
+                                       bool support_property_block)
+        : file_(file_path, file),
+          support_property_block_(support_property_block) {}
+
+    void Add(const Slice& key, const Slice& value) override {
+      if (!kv_vec_.empty()) {
+        ASSERT_LT(BytewiseComparator()->Compare(kv_vec_.back().first, key), 0);
+      }
+      kv_vec_.emplace_back(key.ToString(), value.ToString());
+    }
+
+    Status Finish() override {
+      status_ = file_.Serialize(kv_vec_);
+      return status_;
+    }
+
+    void Abandon() override { kv_vec_.clear(); }
+
+    uint64_t FileSize() const override { return file_.FileSize(); }
+
+    Status PutPropertiesBlock(const Slice& block) override {
+      if (!support_property_block_) {
+        return Status::NotSupported();
+      }
+      return file_.PutPropertiesBlock(block);
+    }
+
+    TableProperties GetTableProperties() const override {
+      return file_.GetTableProperties();
+    }
+
+    Status status() const override { return status_; }
+
+   private:
+    std::vector<std::pair<std::string, std::string>> kv_vec_;
+    DummyExternalTableFile file_;
+    Status status_;
+    bool support_property_block_;
   };
 
   class DummyExternalTableFactory : public ExternalTableFactory {
    public:
+    explicit DummyExternalTableFactory(bool support_property_block)
+        : support_property_block_(support_property_block) {}
     const char* Name() const override { return "DummyExternalTableFactory"; }
 
     Status NewTableReader(
-        const ReadOptions& /*read_options*/, const std::string& /*file_path*/,
-        const ExternalTableOptions& /*topts*/,
-        std::unique_ptr<ExternalTableReader>* table_reader) override {
-      table_reader->reset(new DummyExternalTableReader());
+        const ReadOptions& /*read_options*/, const std::string& file_path,
+        const ExternalTableOptions& topts,
+        std::unique_ptr<ExternalTableReader>* table_reader) const override {
+      // Sanity check some options
+      EXPECT_EQ(topts.file_options.handoff_checksum_type,
+                ChecksumType::kCRC32c);
+      table_reader->reset(
+          new DummyExternalTableReader(file_path, support_property_block_));
       return Status::OK();
     }
+
+    ExternalTableBuilder* NewTableBuilder(
+        const ExternalTableBuilderOptions& /*opts*/,
+        const std::string& file_path, FSWritableFile* file) const override {
+      return new DummyExternalTableBuilder(file_path, file,
+                                           support_property_block_);
+    }
+
+   private:
+    bool support_property_block_;
   };
 };
 
-const std::string ExternalTableReaderTest::DummyExternalTableIterator::key_str =
-    "foo";
-const std::string
-    ExternalTableReaderTest::DummyExternalTableIterator::value_str = "bar";
-
-TEST_F(ExternalTableReaderTest, BasicTest) {
+TEST_F(ExternalTableTest, BasicTest) {
   std::shared_ptr<ExternalTableFactory> factory =
-      std::make_shared<DummyExternalTableFactory>();
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/false);
+
+  std::string file_path = test::PerThreadDBPath("external_table");
+  {
+    std::unique_ptr<ExternalTableBuilder> builder;
+    builder.reset(factory->NewTableBuilder(
+        ExternalTableBuilderOptions(ReadOptions(), WriteOptions(),
+                                    std::shared_ptr<const SliceTransform>(),
+                                    BytewiseComparator(), "default",
+                                    TableFileCreationReason::kMisc),
+        file_path, /*file=*/nullptr));
+    builder->Add("foo", "bar");
+    ASSERT_OK(builder->Finish());
+  }
 
   std::unique_ptr<ExternalTableReader> reader;
   std::shared_ptr<SliceTransform> prefix_extractor;
   ASSERT_OK(factory->NewTableReader(
-      {}, "", ExternalTableOptions(prefix_extractor, nullptr), &reader));
+      {}, file_path,
+      ExternalTableOptions(prefix_extractor, /*comparator=*/nullptr,
+                           /*fs=*/nullptr, FileOptions()),
+      &reader));
 
   ReadOptions ro;
-  ro.weight = 1;
-  std::unique_ptr<Iterator> iter(reader->NewIterator(ro, nullptr));
+  std::unique_ptr<ExternalTableIterator> iter(reader->NewIterator(ro, nullptr));
   ASSERT_NE(iter, nullptr);
   iter->Seek("foo");
   ASSERT_TRUE(iter->Valid() && iter->status().ok());
@@ -6689,25 +7137,32 @@ TEST_F(ExternalTableReaderTest, BasicTest) {
   ASSERT_EQ(statuses[1], Status::NotFound());
 }
 
-TEST_F(ExternalTableReaderTest, SstReaderTest) {
+TEST_F(ExternalTableTest, SstReaderTest) {
+  if (encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment");
+    return;
+  }
   Options options = GetDefaultOptions();
-  std::string dbname = test::PerThreadDBPath("external_table_reader_test");
+  std::string dbname = test::PerThreadDBPath("external_table_test");
   std::string ingest_file = dbname + "test.immutabledb";
   dbname += "_db";
 
   std::shared_ptr<ExternalTableFactory> factory =
-      std::make_shared<DummyExternalTableFactory>();
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/false);
   options.table_factory = NewExternalTableFactory(factory);
 
-  // Create a file
-  ASSERT_OK(WriteStringToFile(options.env, "Hello World", ingest_file,
-                              /*should_sync=*/true));
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "bar"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
 
   std::unique_ptr<SstFileReader> reader(new SstFileReader(options));
   ASSERT_OK(reader->Open(ingest_file));
 
   ReadOptions ro;
-  ro.weight = 1;
   std::unique_ptr<Iterator> iter(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
   iter->Seek("foo");
@@ -6718,9 +7173,2545 @@ TEST_F(ExternalTableReaderTest, SstReaderTest) {
   ASSERT_TRUE(iter->status().ok());
 }
 
-}  // namespace ROCKSDB_NAMESPACE
+TEST_F(ExternalTableTest, ExternalFileChecksumTest) {
+  if (encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment");
+    return;
+  }
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("external_table_test");
+  std::string ingest_file = dbname + "test.immutable";
+  dbname += "_db";
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  std::shared_ptr<ExternalTableFactory> factory =
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/true);
+  options.table_factory = NewExternalTableFactory(factory);
+
+  // Create a file
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "bar"));
+  ASSERT_OK(writer->Put("foo2", "bar2"));
+  ExternalSstFileInfo info;
+  ASSERT_OK(writer->Finish(&info));
+  writer.reset();
+
+  FileChecksumGenContext cksum_ctx;
+  FileChecksumGenCrc32c cksum_gen(cksum_ctx);
+  std::string file_data;
+  ASSERT_OK(ReadFileToString(options.env, ingest_file, &file_data));
+  cksum_gen.Update(file_data.data(), file_data.size());
+  cksum_gen.Finalize();
+  ASSERT_EQ(info.file_checksum, cksum_gen.GetChecksum());
+}
+
+TEST_F(ExternalTableTest, DBIterTest) {
+  if (encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment");
+    return;
+  }
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("external_table_test");
+  std::string ingest_file = dbname + "test.immutable";
+  dbname += "_db";
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  std::shared_ptr<ExternalTableFactory> factory =
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/true);
+  options.table_factory = NewExternalTableFactory(factory);
+
+  // Create a file
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "bar"));
+  ASSERT_OK(writer->Put("foo2", "bar2"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  ifo.allow_db_generated_files = true;
+  ifo.fill_cache = false;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  std::unique_ptr<Iterator> iter(db->NewIterator({}, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->value(), "bar");
+  iter->Next();
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->key(), "foo2");
+  ASSERT_EQ(iter->value(), "bar2");
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+}
+
+TEST_F(ExternalTableTest, DBMultiScanTest) {
+  if (encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment");
+    return;
+  }
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("external_table_test");
+  std::string ingest_file = dbname + "test.immutable";
+  dbname += "_db";
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  std::shared_ptr<ExternalTableFactory> factory =
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/true);
+  options.table_factory = NewExternalTableFactory(factory);
+
+  // Create a file
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(writer->Put("k" + ss.str(), "val" + ss.str()));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  ifo.allow_db_generated_files = true;
+  ifo.fill_cache = false;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  std::vector<std::string> key_ranges({"k03", "k10", "k25", "k50"});
+  ReadOptions ro;
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  std::unique_ptr<MultiScan> iter = db->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
+        ASSERT_LT(it.first.ToString().compare(key_ranges[idx + 1]), 0);
+        count++;
+      }
+      idx += 2;
+    }
+    ASSERT_EQ(count, 32);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+
+  // Test the overlapping scan case
+  key_ranges[1] = "k30";
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+
+  iter = db->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
+        ASSERT_LT(it.first.ToString().compare(key_ranges[idx + 1]), 0);
+        count++;
+      }
+      idx += 2;
+    }
+    ASSERT_EQ(count, 52);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+
+  // Test the no limit scan case
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(key_ranges[0]);
+  scan_options.insert(key_ranges[2]);
+  iter = db->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
+        if (it.first.ToString().compare(key_ranges[idx + 1]) == 0) {
+          break;
+        }
+        count++;
+      }
+      idx += 2;
+    }
+    ASSERT_EQ(count, 52);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "DummyExternalTableIterator::Constructor", [](void* arg) {
+        Status* status = static_cast<Status*>(arg);
+        *status = Status::IOError();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  iter = db->NewMultiScan(ro, cfh, scan_options);
+  try {
+    for (auto range : *iter) {
+      // Should not get here. Iterator should throw an exception
+      assert(false);
+      for (auto it : range) {
+        (void)it;
+        assert(false);
+      }
+    }
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+}
+
+TEST_F(ExternalTableTest, IngestionTest) {
+  if (encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment");
+    return;
+  }
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("external_table_test");
+  std::string ingest_file = dbname + "test.immutable";
+  dbname += "_db";
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  std::shared_ptr<ExternalTableFactory> factory =
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/true);
+  options.table_factory = NewExternalTableFactory(factory);
+
+  // Create a file
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "bar"));
+  ASSERT_OK(writer->Put("foo2", "bar2"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  ifo.allow_db_generated_files = false;
+  ifo.fill_cache = false;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  std::unique_ptr<Iterator> iter(db->NewIterator({}, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->value(), "bar");
+  iter->Next();
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->key(), "foo2");
+  ASSERT_EQ(iter->value(), "bar2");
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  // Create an overlapping file to ingest with atomic_replace_range option
+  ingest_file += "2";
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "val"));
+  ASSERT_OK(writer->Put("foo2", "val2"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  ifo.snapshot_consistency = false;
+  s = db->IngestExternalFiles({{cfh,
+                                {ingest_file},
+                                ifo,
+                                {},
+                                {},
+                                Temperature::kUnknown,
+                                {{nullptr, nullptr}}}});
+  ASSERT_OK(s);
+
+  iter.reset(db->NewIterator({}, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->value(), "val");
+  iter->Next();
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->key(), "foo2");
+  ASSERT_EQ(iter->value(), "val2");
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  // Create an overlapping file to ingest without atomic_replace_range option.
+  // This should fail as we don't support ingesting an external file with
+  // non-zero assigned sequence number.
+  ingest_file += "3";
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "newval"));
+  ASSERT_OK(writer->Put("foo2", "newval2"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  s = db->IngestExternalFiles(
+      {{cfh, {ingest_file}, ifo, {}, {}, Temperature::kUnknown, {}}});
+  ASSERT_EQ(s, Status::NotSupported());
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+}
+
+class UserDefinedIndexTestBase : public BlockBasedTableTestBase {
+ public:
+  class CustomFlushBlockPolicy : public FlushBlockPolicy {
+   public:
+    explicit CustomFlushBlockPolicy(int keys_per_block)
+        : keys_in_current_block_(0), keys_per_block_(keys_per_block) {}
+
+    bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+      if (keys_in_current_block_ >= keys_per_block_) {
+        keys_in_current_block_ = 1;
+        return true;
+      }
+      keys_in_current_block_++;
+      return false;
+    }
+
+   private:
+    int keys_in_current_block_;
+    int keys_per_block_;
+  };
+
+  class CustomFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+   public:
+    CustomFlushBlockPolicyFactory(int keys_per_block = 3)
+        : keys_per_block_(keys_per_block) {}
+    const char* Name() const override { return "CustomFlushBlockPolicy"; }
+    FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&,
+                                          const BlockBuilder&) const override {
+      return new CustomFlushBlockPolicy(keys_per_block_);
+    }
+    int keys_per_block_;
+  };
+
+ public:
+  class TestUserDefinedIndexFactory : public UserDefinedIndexFactory {
+   public:
+    const char* Name() const override { return "test_index"; }
+    Status NewBuilder(
+        const UserDefinedIndexOption& /*option*/,
+        std::unique_ptr<UserDefinedIndexBuilder>& builder) const override {
+      builder = std::make_unique<TestUserDefinedIndexBuilder>();
+      return Status::OK();
+    }
+
+    struct CustomizedMapComparator {
+      CustomizedMapComparator(const Comparator* _comparator)
+          : comparator(_comparator) {}
+      const Comparator* comparator;
+      bool operator()(const std::string& lhs, const std::string& rhs) const {
+        return comparator->Compare(lhs, rhs) < 0;
+      }
+    };
+
+    // Deprecated API
+    UserDefinedIndexBuilder* NewBuilder() const override { return nullptr; }
+
+    std::unique_ptr<UserDefinedIndexReader> NewReader(
+        Slice& /*index_block*/) const override {
+      return nullptr;
+    }
+
+    Status NewReader(
+        const UserDefinedIndexOption& option, Slice& index_block,
+        std::unique_ptr<UserDefinedIndexReader>& reader) const override {
+      reader = std::make_unique<TestUserDefinedIndexReader>(
+          index_block, option.comparator, this);
+      return Status::OK();
+    }
+
+    uint64_t seek_error_count_ = 0;
+    uint64_t next_error_count_ = 0;
+
+   private:
+    class TestUserDefinedIndexBuilder : public UserDefinedIndexBuilder {
+     public:
+      TestUserDefinedIndexBuilder() : entries_added_(0), keys_added_(0) {}
+
+      Slice AddIndexEntry(const Slice& last_key_in_current_block,
+                          const Slice* first_key_in_next_block,
+                          const BlockHandle& block_handle,
+                          std::string* separator_scratch) override {
+        if (keys_added_ == 0) {
+          return last_key_in_current_block;
+        }
+        EXPECT_EQ(last_key_in_current_block.size(), 5);
+        if (first_key_in_next_block) {
+          EXPECT_EQ(first_key_in_next_block->size(), 5);
+        }
+        // Unused parameters
+        (void)separator_scratch;
+        entries_added_++;
+        index_data_[last_key_in_current_block.ToString()].clear();
+        // Store the block handle for each key
+        PutFixed64(&index_data_[last_key_in_current_block.ToString()],
+                   block_handle.offset);
+        PutFixed64(&index_data_[last_key_in_current_block.ToString()],
+                   block_handle.size);
+        PutFixed32(&index_data_[last_key_in_current_block.ToString()],
+                   keys_added_);
+        keys_added_ = 0;
+        return last_key_in_current_block;
+      }
+
+      void OnKeyAdded(const Slice& key, ValueType /*value*/,
+                      const Slice& /*value*/) override {
+        if (key.starts_with("dummy")) {
+          return;
+        }
+        EXPECT_EQ(key.size(), 5);
+        // Track keys added to the index
+        keys_added_++;
+        // Add dummy entry
+        PutFixed64(&index_data_[key.ToString()], 0);
+        PutFixed64(&index_data_[key.ToString()], 0);
+        PutFixed32(&index_data_[key.ToString()], 0);
+      }
+
+      Status Finish(Slice* index_contents) override {
+        if (entries_added_ == 0) {
+          *index_contents = Slice();
+          return Status::OK();
+        }
+        // Serialize the index data
+        std::string result;
+        for (const auto& entry : index_data_) {
+          PutLengthPrefixedSlice(&result, entry.first);
+          result.append(entry.second);
+        }
+        index_contents_data_ = result;
+        *index_contents = index_contents_data_;
+        return Status::OK();
+      }
+
+      int GetEntriesAdded() const { return entries_added_; }
+
+     private:
+      int entries_added_;
+      std::map<std::string, std::string> index_data_;
+      uint32_t keys_added_;
+      std::string index_contents_data_;
+    };
+
+    class TestUserDefinedIndexReader : public UserDefinedIndexReader {
+     public:
+      explicit TestUserDefinedIndexReader(
+          Slice& index_block, const Comparator* comparator,
+          const TestUserDefinedIndexFactory* factory)
+          : factory_(factory),
+            comparator_(comparator),
+            index_data_(CustomizedMapComparator(comparator)) {
+        Slice block = index_block;
+        while (!block.empty()) {
+          Slice key;
+          uint64_t offset = 0;
+          uint64_t size = 0;
+          uint32_t num_keys = 0;
+          EXPECT_TRUE(GetLengthPrefixedSlice(&block, &key));
+          EXPECT_TRUE(GetFixed64(&block, &offset));
+          EXPECT_TRUE(GetFixed64(&block, &size));
+          EXPECT_TRUE(GetFixed32(&block, &num_keys));
+
+          UserDefinedIndexBuilder::BlockHandle handle{0, 0};
+          handle.offset = offset;
+          handle.size = size;
+          index_data_[key.ToString()] =
+              std::make_pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>(
+                  std::move(handle), std::move(num_keys));
+        }
+      }
+
+      std::unique_ptr<UserDefinedIndexIterator> NewIterator(
+          const ReadOptions& /*ro*/) override {
+        return std::make_unique<TestUserDefinedIndexIterator>(
+            index_data_, factory_, comparator_);
+      }
+
+      size_t ApproximateMemoryUsage() const override { return 0; }
+
+     private:
+      class TestUserDefinedIndexIterator : public UserDefinedIndexIterator {
+       public:
+        TestUserDefinedIndexIterator(
+            std::map<std::string,
+                     std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>,
+                     CustomizedMapComparator>& index,
+            const TestUserDefinedIndexFactory* factory,
+            const Comparator* comparator)
+            : index_(index),
+              iter_(index_.end()),
+              scan_opts_(nullptr),
+              num_opts_(0),
+              target_num_keys_(0),
+              seek_error_count_(factory->seek_error_count_),
+              next_error_count_(factory->next_error_count_),
+              comparator_(comparator) {}
+
+        Status SeekAndGetResult(const Slice& key,
+                                IterateResult* result) override {
+          Status s;
+          if (seek_error_count_) {
+            seek_error_count_--;
+            s = Status::IOError();
+          }
+          if (!s.ok()) {
+            return s;
+          }
+          if (scan_opts_) {
+            // Seeks should be in order specified in scan_opts_
+            EXPECT_EQ(comparator_->Compare(
+                          scan_opts_[scan_idx_].range.start.value(), key),
+                      0);
+            EXPECT_TRUE(scan_opts_[scan_idx_].property_bag.has_value());
+            target_num_keys_ = std::stoi(scan_opts_[scan_idx_]
+                                             .property_bag.value()
+                                             .find("count")
+                                             ->second);
+            scan_idx_++;
+          }
+          iter_ = index_.lower_bound(key.ToString());
+          if ((iter_ != index_.end()) && IsInbound()) {
+            AdvanceToNextIndexEntry();
+            result->bound_check_result = IterBoundCheck::kInbound;
+            result->key = Slice(iter_->first);
+            if (scan_opts_ && target_num_keys_ > 0 &&
+                comparator_->Compare(key, iter_->first) == 0) {
+              target_num_keys_--;
+            }
+          } else {
+            result->bound_check_result = IterBoundCheck::kOutOfBound;
+            result->key = Slice();
+          }
+          return Status::OK();
+        }
+
+        Status NextAndGetResult(IterateResult* result) override {
+          Status s;
+          if (next_error_count_) {
+            next_error_count_--;
+            s = Status::IOError();
+          }
+          if (!s.ok()) {
+            return s;
+          }
+          if (scan_opts_ && scan_opts_[scan_idx_ - 1].range.limit.has_value()) {
+            if (comparator_->Compare(
+                    iter_->first,
+                    scan_opts_[scan_idx_ - 1].range.limit.value()) >= 0) {
+              result->bound_check_result = IterBoundCheck::kOutOfBound;
+              result->key = Slice();
+              return Status::OK();
+            }
+          }
+          if (scan_opts_ && target_num_keys_ == 0) {
+            result->key = Slice();
+            result->bound_check_result = IterBoundCheck::kOutOfBound;
+            return Status::OK();
+          }
+          iter_++;
+          if ((iter_ != index_.end()) && IsInbound()) {
+            AdvanceToNextIndexEntry();
+            result->bound_check_result = IterBoundCheck::kInbound;
+            result->key = Slice(iter_->first);
+            target_num_keys_ -=
+                std::min(target_num_keys_, iter_->second.second);
+          } else {
+            // EOF
+            result->bound_check_result = IterBoundCheck::kUnknown;
+            result->key = Slice();
+          }
+          return Status::OK();
+        }
+
+        void AdvanceToNextIndexEntry() {
+          while (iter_->second.second == 0) {
+            iter_++;
+          }
+        }
+
+        bool IsInbound() {
+          if (!scan_opts_) {
+            return true;
+          }
+          if (scan_opts_[scan_idx_ - 1].range.limit.has_value() &&
+              comparator_->Compare(
+                  scan_opts_[scan_idx_ - 1].range.limit.value(),
+                  iter_->first) <= 0) {
+            return false;
+          }
+          return true;
+        }
+
+        UserDefinedIndexBuilder::BlockHandle value() override {
+          UserDefinedIndexBuilder::BlockHandle handle{0, 0};
+          handle.offset = iter_->second.first.offset;
+          handle.size = iter_->second.first.size;
+          return handle;
+        }
+
+        void Prepare(const ScanOptions scan_opts[], size_t num_opts) override {
+          // Prepare should only be called once
+          EXPECT_EQ(scan_opts_, nullptr);
+          scan_opts_ = scan_opts;
+          num_opts_ = num_opts;
+          scan_idx_ = 0;
+        }
+
+       private:
+        std::map<std::string,
+                 std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>,
+                 CustomizedMapComparator>& index_;
+        std::map<std::string, std::pair<UserDefinedIndexBuilder::BlockHandle,
+                                        uint32_t>>::iterator iter_;
+        const ScanOptions* scan_opts_;
+        size_t num_opts_{};
+        size_t scan_idx_{};
+        uint32_t target_num_keys_;
+        uint64_t seek_error_count_;
+        uint64_t next_error_count_;
+        const Comparator* comparator_;
+      };
+
+      const TestUserDefinedIndexFactory* factory_;
+      const Comparator* comparator_;
+      std::map<std::string,
+               std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>,
+               CustomizedMapComparator>
+          index_data_;
+    };
+  };
+
+ protected:
+  std::vector<std::pair<std::string, std::string>> generateKVWithValue(
+      int key_count, const std::string& value) {
+    std::vector<std::pair<std::string, std::string>> kvs(key_count);
+    for (int i = 0; i < key_count; i++) {
+      std::stringstream ss;
+      ss << std::setw(2) << std::setfill('0') << i;
+      std::string key = "key" + ss.str();
+      kvs[i] = std::make_pair(key, value);
+    }
+    if (is_reverse_comparator_) {
+      std::reverse(kvs.begin(), kvs.end());
+    }
+    return kvs;
+  }
+
+  std::vector<std::pair<std::string, std::string>> generateKVs(
+      int key_count, int value_size = 0) {
+    std::vector<std::pair<std::string, std::string>> kvs(key_count);
+    for (int i = 0; i < key_count; i++) {
+      std::stringstream ss;
+      ss << std::setw(2) << std::setfill('0') << i;
+      std::string key = "key" + ss.str();
+      std::string value;
+      if (value_size != 0) {
+        value = rnd.RandomString(1024);
+      } else {
+        value = "value" + ss.str();
+      }
+      kvs[i] = std::make_pair(key, value);
+    }
+    if (is_reverse_comparator_) {
+      std::reverse(kvs.begin(), kvs.end());
+    }
+    return kvs;
+  }
+
+  void BasicTest(bool use_partitioned_index);
+
+  void ValidateMultiScan(
+      std::vector<std::tuple<std::vector<std::string>, int, int>>
+          scan_opt_validation_arg,
+      std::unordered_map<std::string, std::string> property_bag,
+      const ReadOptions& ro, MultiScanArgs& scan_opts,
+      std::vector<int>& key_counts, std::unique_ptr<DB>& db,
+      ColumnFamilyHandle* cfh) {
+    key_counts.clear();
+    (*scan_opts).clear();
+
+    if (is_reverse_comparator_) {
+      for (auto& scan_opt_validation_range : scan_opt_validation_arg) {
+        // reverse each range
+        std::reverse(std::get<0>(scan_opt_validation_range).begin(),
+                     std::get<0>(scan_opt_validation_range).end());
+      }
+      // reverse all the ranges
+      std::reverse(scan_opt_validation_arg.begin(),
+                   scan_opt_validation_arg.end());
+    }
+
+    for (auto& scan_opt_validation_range : scan_opt_validation_arg) {
+      scan_opts.insert(std::get<0>(scan_opt_validation_range)[0],
+                       std::get<0>(scan_opt_validation_range)[1],
+                       std::optional(property_bag));
+      if (is_reverse_comparator_) {
+        key_counts.push_back(std::get<2>(scan_opt_validation_range));
+      } else {
+        key_counts.push_back(std::get<1>(scan_opt_validation_range));
+      }
+    }
+
+    Slice ub;
+    ReadOptions read_opts = ro;
+    int key_count = 0;
+    int index = 0;
+    auto opts = scan_opts.GetScanRanges();
+    read_opts.iterate_upper_bound = &ub;
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_opts, cfh));
+    iter->Prepare(scan_opts);
+    for (auto opt : opts) {
+      ub = opt.range.limit.value();
+      iter->Seek(opt.range.start.value());
+      if (kVerbose) {
+        printf("range start key %s, end key %s\n",
+               opt.range.start.value().ToString().c_str(),
+               opt.range.limit.value().ToString().c_str());
+      }
+      EXPECT_OK(iter->status());
+      while (iter->Valid()) {
+        if (kVerbose) {
+          printf("found key %s\n", iter->key().ToString().c_str());
+        }
+        key_count++;
+        iter->Next();
+      }
+      EXPECT_EQ(key_count, key_counts[index]);
+      key_count = 0;
+      index++;
+    }
+    EXPECT_OK(iter->status());
+  }
+  Options options_;
+  const Comparator* comparator_;
+  bool is_reverse_comparator_;
+  Random rnd{301};
+};
+
+class UserDefinedIndexTest
+    : public UserDefinedIndexTestBase,
+      public testing::WithParamInterface<const Comparator*> {
+  void SetUp() override {
+    comparator_ = GetParam();
+    options_.comparator = comparator_;
+    is_reverse_comparator_ = comparator_ == ReverseBytewiseComparator();
+  }
+};
+
+void UserDefinedIndexTestBase::BasicTest(bool use_partitioned_index) {
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+  if (use_partitioned_index) {
+    table_options.partition_filters = true;
+    table_options.decouple_partitioned_filters = true;
+    table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+  }
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  auto kvs = generateKVs(/*key_count*/ 100);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  ImmutableOptions ioptions(options_);
+  MutableCFOptions moptions((ColumnFamilyOptions(options_)));
+  EnvOptions eoptions(options_);
+  TableReaderOptions toptions(
+      ioptions, moptions.prefix_extractor,
+      /*_compression_manager=*/nullptr, eoptions, ioptions.internal_comparator,
+      moptions.block_protection_bytes_per_key,
+      /*skip_filters*/ false, /*immortal*/ false,
+      /*force_direct_prefetch*/ false, /*level*/ -1,
+      /*block_cache_tracer*/ nullptr,
+      /*max_file_size_for_l0_meta_pin*/ 0, /*cur_db_session_id*/ "",
+      /*cur_file_num*/ 0,
+      /* unique_id */ {}, /* largest_seqno */ 0,
+      /* tail_size */ 0, ioptions.persist_user_defined_timestamps);
+  // Verify that the user-defined index was created
+  std::string meta_block_name = kUserDefinedIndexPrefix + "test_index";
+  BlockHandle block_handle;
+  uint64_t file_size = 0;
+  std::unique_ptr<FSRandomAccessFile> file;
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  const auto& fs = options_.env->GetFileSystem();
+  ASSERT_OK(fs->GetFileSize(ingest_file, IOOptions(), &file_size, nullptr));
+  ASSERT_OK(fs->NewRandomAccessFile(ingest_file, eoptions, &file, nullptr));
+  file_reader.reset(new RandomAccessFileReader(std::move(file), ingest_file));
+  ASSERT_OK(FindMetaBlockInFile(file_reader.get(), file_size,
+                                kBlockBasedTableMagicNumber, ioptions,
+                                ReadOptions(), meta_block_name, &block_handle));
+  file_reader.reset();
+  // With our custom flush policy that flushes every 3 keys,
+  // we expect around 34 data blocks (100/3 rounded up)
+  // Verify the number of entries in the user-defined index
+  // Each data block should have an entry in the index
+  // With our flush policy of 3 keys per block, we expect around 34 entries
+  int expected_entries = (100 + 2) / 3;  // Ceiling of 100/3
+  ASSERT_GE(block_handle.size(),
+            expected_entries);  // At least this many entries
+
+  std::unique_ptr<SstFileReader> reader(new SstFileReader(options_));
+  ASSERT_OK(reader->Open(ingest_file));
+
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+
+  // Test that we can read all the keys
+  int key_count = 0;
+  for (iter->SeekToFirst(); iter->Valid() && iter->status().ok();
+       iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, 100);  // We added 100 keys
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  ro.table_index_factory = user_defined_index_factory.get();
+  iter.reset(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+
+  // Test seek specific key
+  key_count = 0;
+  for (iter->Seek("key40"); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, is_reverse_comparator_ ? 41 : 60);
+  ASSERT_OK(iter->status());
+
+  // Test upper bound
+  Slice ub(is_reverse_comparator_ ? "key25" : "key75");
+  ro.iterate_upper_bound = &ub;
+  iter.reset(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+
+  // Test seek specific key with upper bound
+  key_count = 0;
+  for (iter->Seek("key40"); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, is_reverse_comparator_ ? 15 : 35);
+  ASSERT_OK(iter->status());
+
+  user_defined_index_factory->seek_error_count_ = 1;
+  iter.reset(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+  iter->Seek("key40");
+  ASSERT_NOK(iter->status());
+
+  user_defined_index_factory->seek_error_count_ = 0;
+  user_defined_index_factory->next_error_count_ = 1;
+  iter.reset(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+  iter->Seek(is_reverse_comparator_ ? "key92" : "key09");
+  ASSERT_OK(iter->status());
+  iter->Next();
+  ASSERT_OK(iter->status());
+  iter->Next();
+  if (!is_reverse_comparator_) {
+    ASSERT_OK(iter->status());
+    iter->Next();
+  }
+  ASSERT_NOK(iter->status());
+  user_defined_index_factory->next_error_count_ = 0;
+
+  ro.iterate_upper_bound = &ub;
+  iter.reset(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+  MultiScanArgs scan_opts(comparator_);
+
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(25);
+  std::vector<std::string> boundaries = {"key10", "key50"};
+  if (is_reverse_comparator_) {
+    std::reverse(boundaries.begin(), boundaries.end());
+  }
+
+  scan_opts.insert(boundaries[0], boundaries[1], std::optional(property_bag));
+  iter->Prepare(scan_opts);
+  // Test that UDI is used to help fetch the number of keys
+  key_count = 0;
+  ub = boundaries[1];
+  for (iter->Seek(scan_opts.GetScanRanges()[0].range.start.value());
+       iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  // The index may undercount by 2 blocks
+  ASSERT_EQ(key_count, 29);
+  ASSERT_OK(iter->status());
+}
+
+TEST_P(UserDefinedIndexTest, BasicTestWithPartitionedIndex) {
+  BasicTest(/*use_partitioned_index=*/true);
+}
+
+TEST_P(UserDefinedIndexTest, BasicTestWithoutPartitionedIndex) {
+  BasicTest(/*use_partitioned_index=*/false);
+}
+
+TEST_P(UserDefinedIndexTest, InvalidArgumentTest1) {
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.compression_opts.parallel_threads = 10;
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  std::string key = "foo";
+  std::string value = "bar";
+  ASSERT_EQ(writer->Put(key, value), Status::InvalidArgument());
+  ASSERT_EQ(writer->Finish(), Status::InvalidArgument());
+  writer.reset();
+}
+
+TEST_P(UserDefinedIndexTest, InvalidArgumentTest2) {
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  std::string key = "foo";
+  std::string value = "bar";
+  ASSERT_OK(writer->Merge(key, value));
+  ASSERT_EQ(writer->Finish(), Status::InvalidArgument());
+  writer.reset();
+}
+
+TEST_P(UserDefinedIndexTest, IngestTest) {
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  auto kvs = generateKVs(/*key_count*/ 100);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
+  }
+
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  ASSERT_OK(iter->status());
+
+  // Test that we can read all the keys
+  int key_count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, 100);  // We added 100 keys
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  ro.table_index_factory = user_defined_index_factory.get();
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+
+  // Test seek specific key
+  key_count = 0;
+  for (iter->Seek("key40"); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, is_reverse_comparator_ ? 41 : 60);
+  ASSERT_OK(iter->status());
+
+  // Test upper bound
+  Slice ub(is_reverse_comparator_ ? "key25" : "key75");
+  ro.iterate_upper_bound = &ub;
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+
+  // Test seek specific key with upper bound
+  key_count = 0;
+  for (iter->Seek("key40"); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, is_reverse_comparator_ ? 15 : 35);
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options_));
+}
+
+TEST_P(UserDefinedIndexTest, EmptyRangeTest) {
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  // Generate key range key0 ~ key19, key40 ~ key59, key80 ~ key99
+  std::vector<std::pair<std::string, std::string>> kvs;
+  bool skip = false;
+  for (int i = 0; i < 100; i++) {
+    if (i > 0 && i % 20 == 0) {
+      skip = !skip;
+    }
+    if (skip) {
+      continue;
+    }
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    std::string key = "key" + ss.str();
+    std::string value = "value" + ss.str();
+    kvs.emplace_back(key, value);
+  }
+
+  if (is_reverse_comparator_) {
+    std::reverse(kvs.begin(), kvs.end());
+  }
+
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  ASSERT_OK(iter->status());
+
+  // Test that we can read all the keys
+  int key_count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, 60);
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  ro.table_index_factory = user_defined_index_factory.get();
+  std::vector<int> key_counts;
+  MultiScanArgs scan_opts(options_.comparator);
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(5);
+
+  ValidateMultiScan({{{"key25", "key30"}, 0, 0},
+                     {{"key33", "key37"}, 0, 0},
+                     // Non-empty scan with range greater than count
+                     // In the key42:key56 range, we might read an additional
+                     // block worth of keys due to the boundaries (5 + 3)
+                     {{"key42", "key56"}, 8, 7},
+                     // Empty scan succeeding a non-empty one
+                     {{"key65", "key70"}, 0, 0},
+                     // A non-empty scan with range smaller than count
+                     {{"key85", "key87"}, 2, 2},
+                     // Scan range completely outside the DB
+                     {{"key991", "key999"}, 0, 0}},
+                    property_bag, ro, scan_opts, key_counts, db, cfh);
+
+  // Scans that overlap with part of key range, with overlap less than count
+  ValidateMultiScan({{{"key18", "key25"}, 2, 1}, {{"key38", "key43"}, 3, 4}},
+                    property_bag, ro, scan_opts, key_counts, db, cfh);
+
+  // Scans that overlap with part of key range, with overlap same as count
+  ValidateMultiScan({{{"key15", "key26"}, 5, 4}, {{"key38", "key46"}, 6, 7}},
+                    property_bag, ro, scan_opts, key_counts, db, cfh);
+
+  // Scans that overlap with part of key range, with overlap greater than count
+  ValidateMultiScan({{{"key10", "key26"}, 8, 8},
+                     // Cross block boundary
+                     {{"key38", "key49"}, 7, 9}},
+                    property_bag, ro, scan_opts, key_counts, db, cfh);
+
+  // Scan bigger than one contiguous range of keys, with overlap greater than
+  // count
+  ValidateMultiScan({{{"key75", "key991"}, 8, 9}}, property_bag, ro, scan_opts,
+                    key_counts, db, cfh);
+
+  // Scan bigger than one contiguous range of keys, with overlap less than count
+  property_bag["count"] = std::to_string(25);
+  ValidateMultiScan({{{"key75", "key991"}, 20, 20}}, property_bag, ro,
+                    scan_opts, key_counts, db, cfh);
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options_));
+}
+
+// Verify that external file ingestion fails if we try to ingest an SST file
+// without the UDI and a UDI factory is configured in BlockBasedTableOptions
+// and fail_if_no_udi_on_open is true in BlockBasedTableOptions.
+TEST_P(UserDefinedIndexTest, IngestFailTest) {
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  auto kvs = generateKVs(/*key_count*/ 100);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+  table_options.fail_if_no_udi_on_open = true;
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<DB> db;
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_NOK(s);
+
+  ASSERT_OK(db->SetOptions(
+      cfh, {{"block_based_table_factory", "{fail_if_no_udi_on_open=false;}"}}));
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options_));
+}
+
+TEST_P(UserDefinedIndexTest, IngestEmptyUDI) {
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+  std::string ingest_file2 = dbname + "dummy.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  auto kvs = generateKVs(/*key_count*/ 100);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
+  ASSERT_OK(writer->Open(ingest_file2));
+  ASSERT_OK(writer->Put("dummy", "val"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  table_options.fail_if_no_udi_on_open = true;
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<DB> db;
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
+
+  std::vector<IngestExternalFileArg> ifa;
+  ifa.emplace_back();
+  ifa[0].column_family = cfh;
+  ifa[0].external_files.emplace_back(ingest_file);
+  ifa[0].external_files.emplace_back(ingest_file2);
+  s = db->IngestExternalFiles(ifa);
+  ASSERT_OK(s);
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options_));
+}
+
+TEST_P(UserDefinedIndexTest, MultiScanFailureTest) {
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  // Use bigger value, so that prefetch size limit will be effective
+  auto kvs = generateKVs(/*key_count*/ 100, /* value_size */ 1024);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  std::vector<std::string> key_ranges({"key03", "key05", "key12", "key14"});
+  ReadOptions ro;
+  ro.table_index_factory = user_defined_index_factory.get();
+  Slice ub;
+  ro.iterate_upper_bound = &ub;
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(5);
+  MultiScanArgs scan_options(comparator_);
+  if (is_reverse_comparator_) {
+    std::reverse(key_ranges.begin(), key_ranges.end());
+  }
+  scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
+  scan_options.insert(key_ranges[2], key_ranges[3], property_bag);
+  scan_options.max_prefetch_size = 3500;
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Prepare(scan_options);
+  int count = 0;
+  ub = key_ranges[1];
+  iter->Seek(key_ranges[0]);
+  while (iter->status().ok() && iter->Valid()) {
+    ASSERT_GE(comparator_->Compare(iter->key(), key_ranges[0]), 0);
+    ASSERT_LT(comparator_->Compare(iter->key(), key_ranges[1]), 0);
+    count++;
+    iter->Next();
+  }
+  ASSERT_OK(iter->status()) << iter->status().ToString();
+  ASSERT_EQ(count, 2);
+
+  ub = key_ranges[3];
+  iter->Seek(key_ranges[2]);
+  // This should fail due to reaching max_prefetch_size limit
+  ASSERT_EQ(iter->status(), Status::Incomplete());
+  iter.reset();
+
+  // Empty range multiscan error
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
+  iter->Prepare(scan_options);
+  ASSERT_EQ(iter->status(), Status::InvalidArgument("Empty MultiScanArgs"));
+
+  // Check no seek key error
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(key_ranges[0], key_ranges[2], property_bag);
+  iter->Prepare(scan_options);
+  iter->SeekToFirst();
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument("No seek key for MultiScan"));
+
+  // Seek is not allowed to seen a key that is not following the prepare order
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  scan_options.max_prefetch_size = 0;
+  iter->Prepare(scan_options);
+  ub = key_ranges[3];
+  iter->Seek(key_ranges[2]);
+  ASSERT_EQ(
+      iter->status(),
+      Status::InvalidArgument(
+          "Seek target does not match the start of the next prepared range at "
+          "index 0"));
+  ASSERT_FALSE(iter->Valid());
+  iter.reset();
+
+  // limit is equal to start error
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  (*scan_options).clear();
+  scan_options.insert(key_ranges[0], key_ranges[0], property_bag);
+  iter->Prepare(scan_options);
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument(
+                "Scan start key is large or equal than limit at index 0"));
+  iter.reset();
+
+  // overlapping ranges error
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  (*scan_options).clear();
+  scan_options.insert(key_ranges[0], key_ranges[2], property_bag);
+  scan_options.insert(key_ranges[1], key_ranges[3], property_bag);
+  iter->Prepare(scan_options);
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument("Overlapping ranges at index 1"));
+  iter.reset();
+
+  // Validate an error is returned if upper bound is not set to the same value
+  // as limit
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
+  iter->Prepare(scan_options);
+  ub = "";
+  iter->Seek(key_ranges[0]);
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument(
+                "Upper bound is not set to the same limit value of the next "
+                "prepared range at index 0"));
+  ASSERT_FALSE(iter->Valid());
+
+  // Validate an error is returned when seek more keys than prepared
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
+  iter->Prepare(scan_options);
+  ub = key_ranges[1];
+  iter->Seek(key_ranges[0]);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  iter->Seek(key_ranges[2]);
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument(
+                "Seek called after exhausting all of the scan ranges"));
+  ASSERT_FALSE(iter->Valid());
+  iter.reset();
+
+  // Check error is returned if upper bound is not set and limit is set
+  ro.iterate_upper_bound = nullptr;
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
+  iter->Prepare(scan_options);
+  iter->Seek(key_ranges[0]);
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument(
+                "Upper bound is not set to the same limit value of the next "
+                "prepared range at index 0"));
+  ASSERT_FALSE(iter->Valid());
+  iter.reset();
+
+  // Upper bound is allowed to be empty, if limit is not set
+  ro.iterate_upper_bound = nullptr;
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(key_ranges[0], property_bag);
+  iter->Prepare(scan_options);
+  iter->Seek(key_ranges[0]);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options_));
+}
+
+TEST_P(UserDefinedIndexTest, ConfigTest) {
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  auto kvs = generateKVs(/*key_count*/ 100);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  table_options.user_defined_index_factory.reset();
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  // Set up the user-defined index factory
+  ObjectLibrary::Default().get()->AddFactory<UserDefinedIndexFactory>(
+      "test_index", [](const std::string& /* uri */,
+                       std::unique_ptr<UserDefinedIndexFactory>* guard,
+                       std::string* /* errmsg */) {
+        auto factory = new TestUserDefinedIndexFactory();
+        guard->reset(factory);
+        return guard->get();
+      });
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      ConfigOptions(), options_,
+      "block_based_table_factory={user_defined_index_factory=test_index;}",
+      &options_));
+
+  std::unique_ptr<DB> db;
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  ReadOptions ro;
+  Slice ub;
+  ro.iterate_upper_bound = &ub;
+  ro.table_index_factory = user_defined_index_factory.get();
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  MultiScanArgs scan_opts(options_.comparator);
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(25);
+
+  std::vector<std::string> boundaries = {"key10", "key50"};
+  if (is_reverse_comparator_) {
+    std::reverse(boundaries.begin(), boundaries.end());
+  }
+
+  scan_opts.insert(boundaries[0], boundaries[1], std::optional(property_bag));
+  iter->Prepare(scan_opts);
+  // Test that UDI is used to help fetch the number of keys
+  ub = boundaries[1];
+  int key_count = 0;
+  for (iter->Seek(scan_opts.GetScanRanges()[0].range.start.value());
+       iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  // Number of blocks prepared is based on UDI, it would be slightly higher than
+  // the limit
+  // The index may undercount by 2 blocks
+  ASSERT_EQ(key_count, 29);
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options_));
+}
+
+TEST_P(UserDefinedIndexTest, RangeDelete) {
+  BlockBasedTableOptions table_options;
+  options_.num_levels = 50;
+  options_.compaction_style = kCompactionStyleUniversal;
+  options_.disable_auto_compactions = true;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  auto create_ingestion_data_file = [&](const std::string& filename) {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options_));
+    ASSERT_OK(writer->Open(filename));
+    auto kvs = generateKVs(100);
+
+    for (const auto& kv : kvs) {
+      ASSERT_OK(writer->Put(kv.first, kv.second));
+    }
+    ASSERT_OK(writer->Finish());
+    writer.reset();
+  };
+
+  // Create first ingestion file with data
+  create_ingestion_data_file(ingest_file + "_0");
+
+  // Create second ingestion file with range delete only that covers the first
+  // file to delete all of its keys.
+  {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options_));
+    ASSERT_OK(writer->Open(ingest_file + "_1"));
+    if (is_reverse_comparator_) {
+      ASSERT_OK(writer->DeleteRange("keyz", "key"));
+    } else {
+      ASSERT_OK(writer->DeleteRange("key", "keyz"));
+    }
+    ASSERT_OK(writer->Finish());
+    writer.reset();
+  }
+
+  // Create the second ingestion file with data
+  create_ingestion_data_file(ingest_file + "_2");
+
+  std::unique_ptr<DB> db;
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  // ingest first data file key00~key99
+  s = db->IngestExternalFile(cfh, {ingest_file + "_0"}, ifo);
+  ASSERT_OK(s);
+  // ingest delete range (key-keyz) and new data file (key00-key99) together
+  s = db->IngestExternalFile(cfh, {ingest_file + "_1", ingest_file + "_2"},
+                             ifo);
+  ASSERT_OK(s);
+
+  std::vector<Slice> range = {
+      Slice("key10"),
+      Slice("key25"),
+      Slice("key80"),
+      Slice("key95"),
+  };
+
+  if (is_reverse_comparator_) {
+    std::reverse(range.begin(), range.end());
+  }
+
+  Slice ub("");
+  ReadOptions ro;
+  ro.iterate_upper_bound = &ub;
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+
+  MultiScanArgs scan_opts(options_.comparator);
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(9);
+
+  std::vector<std::vector<char>> decoded_ranges;
+  for (size_t i = 0; i < range.size() / 2; i++) {
+    scan_opts.insert(range[i * 2], range[i * 2 + 1],
+                     std::optional(property_bag));
+  }
+  iter->Prepare(scan_opts);
+
+  for (size_t i = 0; i < range.size() / 2; i++) {
+    // Update upper bound before each seek
+    ub = range[2 * i + 1];
+    auto key_count = 0;
+    for (iter->Seek(range[i * 2]); iter->Valid(); iter->Next()) {
+      key_count++;
+    }
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(key_count, 15);
+  }
+
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options_));
+}
+
+TEST_P(UserDefinedIndexTest, QueryCrossTwoFiles) {
+  BlockBasedTableOptions table_options;
+  options_.num_levels = 50;
+  options_.compaction_style = kCompactionStyleUniversal;
+  options_.disable_auto_compactions = true;
+  options_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(4);
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  auto create_ingestion_data_file = [&](const std::string& filename,
+                                        const std::string& value) {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options_));
+    ASSERT_OK(writer->Open(filename));
+    auto kvs = generateKVWithValue(100, value);
+
+    for (const auto& kv : kvs) {
+      ASSERT_OK(writer->Put(kv.first, kv.second));
+    }
+    ASSERT_OK(writer->Finish());
+    writer.reset();
+  };
+
+  // Create first ingestion file with data
+  create_ingestion_data_file(ingest_file + "_0", "old");
+
+  std::unique_ptr<DB> db;
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  // ingest data file key00~key99
+  s = db->IngestExternalFile(cfh, {ingest_file + "_0"}, ifo);
+  ASSERT_OK(s);
+
+  // Compact the file with SST partitioner, so that files are split into
+  // multiple ones
+  s = db->CompactRange(
+      {.exclusive_manual_compaction = true,
+       .bottommost_level_compaction = BottommostLevelCompaction::kForce},
+      cfh, nullptr, nullptr);
+  ASSERT_OK(s);
+
+  std::vector<Slice> range = {
+      // Each range span across 2 files
+      Slice("key16"),
+      Slice("key24"),
+      Slice("key26"),
+      Slice("key34"),
+  };
+
+  if (is_reverse_comparator_) {
+    std::reverse(range.begin(), range.end());
+  }
+
+  Slice ub("");
+  ReadOptions ro;
+  ro.iterate_upper_bound = &ub;
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+
+  MultiScanArgs scan_opts(options_.comparator);
+  std::unordered_map<std::string, std::string> property_bag;
+  auto read_key_per_range_limit = 2;
+  property_bag["count"] = std::to_string(read_key_per_range_limit);
+
+  for (size_t i = 0; i < range.size() / 2; i++) {
+    scan_opts.insert(range[i * 2], range[i * 2 + 1],
+                     std::optional(property_bag));
+  }
+  iter->Prepare(scan_opts);
+
+  for (size_t i = 0; i < range.size() / 2; i++) {
+    // Update upper bound before each seek
+    ub = range[2 * i + 1];
+    auto key_count = 0;
+    for (iter->Seek(range[i * 2]); iter->Valid(); iter->Next()) {
+      key_count++;
+      ASSERT_EQ(iter->value(), "old");
+      if (key_count >= read_key_per_range_limit) {
+        break;
+      }
+    }
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(key_count, read_key_per_range_limit);
+  }
+
+  // Create another ingestion file with range delete only that covers the first
+  // file to delete all of its keys.
+  {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options_));
+    ASSERT_OK(writer->Open(ingest_file + "_1"));
+    if (is_reverse_comparator_) {
+      ASSERT_OK(writer->DeleteRange("keyz", "key"));
+    } else {
+      ASSERT_OK(writer->DeleteRange("key", "keyz"));
+    }
+    ASSERT_OK(writer->Finish());
+    writer.reset();
+  }
+  s = db->IngestExternalFile(cfh, {ingest_file + "_1"}, ifo);
+  ASSERT_OK(s);
+
+  // ingest new data
+  create_ingestion_data_file(ingest_file + "_2", "new");
+  s = db->IngestExternalFile(cfh, {ingest_file + "_2"}, ifo);
+  ASSERT_OK(s);
+
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  ASSERT_OK(iter->status());
+
+  iter->Prepare(scan_opts);
+
+  for (size_t i = 0; i < range.size() / 2; i++) {
+    // Update upper bound before each seek
+    ub = range[2 * i + 1];
+    auto key_count = 0;
+    for (iter->Seek(range[i * 2]); iter->Valid(); iter->Next()) {
+      key_count++;
+      ASSERT_EQ(iter->value(), "new");
+      if (key_count >= read_key_per_range_limit) {
+        break;
+      }
+    }
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(key_count, read_key_per_range_limit);
+  }
+
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options_));
+}
+
+INSTANTIATE_TEST_CASE_P(UserDefinedIndexTest, UserDefinedIndexTest,
+                        ::testing::Values(BytewiseComparator(),
+                                          ReverseBytewiseComparator()));
+
+struct UserDefinedIndexStressTestParam {
+  const Comparator* comparator;
+  bool enable_udi;
+  bool enable_compaction_with_sst_partitioner;
+
+  using UserDefinedIndexStressTestTuple =
+      std::tuple<const Comparator*, bool, bool>;
+
+  UserDefinedIndexStressTestParam(const UserDefinedIndexStressTestTuple& tuple)
+      : comparator(std::get<0>(tuple)),
+        enable_udi(std::get<1>(tuple)),
+        enable_compaction_with_sst_partitioner(std::get<2>(tuple)) {}
+};
+
+std::ostream& operator<<(std::ostream& os,
+                         const UserDefinedIndexStressTestParam& param) {
+  return os << "UserDefinedIndexStressTestParam{comparator="
+            << (param.comparator ? param.comparator->Name() : "nullptr")
+            << ", enable_udi=" << param.enable_udi
+            << ", enable_compaction_with_sst_partitioner="
+            << param.enable_compaction_with_sst_partitioner << "}";
+}
+
+struct DataRange {
+  size_t start;  // inclusive
+  size_t end;    // exclusive
+  std::string value;
+  bool is_range_delete;
+  bool skipped;
+  size_t scan_key_count_limit;
+  std::string start_key;
+  std::string end_key;
+
+  // print the range in human readable format
+  std::string ToString() const {
+    std::ostringstream oss;
+    oss << "[" << start << ", " << end << "), value: " << value
+        << ", is_range_delete: " << is_range_delete << ", skipped: " << skipped
+        << ", scan_key_count_limit: " << scan_key_count_limit
+        << ", start_key: " << start_key << ", end_key: " << end_key;
+    return oss.str();
+  }
+};
+class UserDefinedIndexStressTest
+    : public UserDefinedIndexTestBase,
+      public testing::WithParamInterface<
+          UserDefinedIndexStressTestParam::UserDefinedIndexStressTestTuple> {
+ public:
+  void SetUp() override {
+    rand_seed_ = static_cast<uint32_t>(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(
+            std::chrono::system_clock::now().time_since_epoch())
+            .count());
+
+    std::cout << "Random seed: " << rand_seed_ << std::endl;
+
+    rnd = Random(rand_seed_);
+    UserDefinedIndexStressTestParam param = GetParam();
+    comparator_ = param.comparator;
+    enable_udi_ = param.enable_udi;
+    enable_compaction_with_sst_partitioner_ =
+        param.enable_compaction_with_sst_partitioner;
+    options_.comparator = comparator_;
+    is_reverse_comparator_ = comparator_ == ReverseBytewiseComparator();
+    options_.compaction_style = kCompactionStyleUniversal;
+
+    // Set up custom flush block policy that flushes every 3 keys
+    table_options_.flush_block_policy_factory =
+        std::make_shared<CustomFlushBlockPolicyFactory>();
+
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options_));
+  }
+
+  void TearDown() override {
+    ASSERT_OK(db_->DestroyColumnFamilyHandle(ingest_cfh_));
+    ASSERT_OK(db_->DestroyColumnFamilyHandle(regular_cfh_));
+
+    ASSERT_OK(db_->Close());
+    ASSERT_OK(DestroyDB(dbname_, options_));
+  }
+
+ protected:
+  static constexpr auto kKeyRange = 100;
+  bool enable_udi_{};
+  bool enable_compaction_with_sst_partitioner_{};
+  uint32_t rand_seed_{};
+  std::shared_ptr<UserDefinedIndexFactory> user_defined_index_factory_;
+  BlockBasedTableOptions table_options_;
+  const Comparator* comparator_{};
+  bool is_reverse_comparator_{};
+  Random rnd{0};
+  ColumnFamilyHandle* ingest_cfh_ = nullptr;
+  ColumnFamilyHandle* regular_cfh_ = nullptr;
+  std::unique_ptr<DB> db_;
+  std::vector<std::vector<DataRange>> ranges_in_levels_;
+  std::string dbname_;
+
+  void SetupDB(const std::string& dbname) {
+    options_.create_if_missing = true;
+    options_.disable_auto_compactions = true;
+    Status s = DB::Open(options_, dbname, &db_);
+    ASSERT_OK(s);
+    ASSERT_TRUE(db_ != nullptr);
+    if (enable_compaction_with_sst_partitioner_) {
+      // Use a SST partitioner to create multiple files, use the first 4 bytes
+      // of key to partition the file, The key is formatted with 2 digit
+      // following "key" string, e.g. key01, key99
+      options_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(4);
+    }
+
+    ASSERT_OK(db_->CreateColumnFamily(options_, "regular_cf", &regular_cfh_));
+
+    if (enable_udi_) {
+      // Set up the user-defined index factory
+      user_defined_index_factory_ =
+          std::make_shared<TestUserDefinedIndexFactory>();
+      table_options_.user_defined_index_factory = user_defined_index_factory_;
+    }
+
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options_));
+    ASSERT_OK(db_->CreateColumnFamily(options_, "ingest_cf", &ingest_cfh_));
+  }
+
+  template <typename T>
+  std::string FormatKey(T i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    return "key" + ss.str();
+  }
+
+  std::vector<DataRange> GenerateKeyRanges(size_t range_count,
+                                           int skip_range_count,
+                                           const std::string& value) {
+    std::set<size_t> boundaries;
+    // generate n + 1 number of unique boundaries to form n contiguoes ranges
+    while (boundaries.size() < range_count + 1) {
+      boundaries.insert(rnd.Uniform(kKeyRange));
+    }
+    std::vector<size_t> sorted_boundaries(boundaries.begin(), boundaries.end());
+    if (is_reverse_comparator_) {
+      std::reverse(sorted_boundaries.begin(), sorted_boundaries.end());
+    }
+    auto ranges = std::vector<DataRange>();
+    std::optional<size_t> prev_bound;
+    for (auto it = sorted_boundaries.begin(); it != sorted_boundaries.end();
+         it++) {
+      if (prev_bound.has_value()) {
+        ranges.push_back({.start = prev_bound.value(),
+                          .end = *it,
+                          .value = value,
+                          .is_range_delete = rnd.OneIn(6),
+                          .skipped = false,
+                          .scan_key_count_limit = rnd.Uniform(10) + 1,
+                          .start_key = FormatKey(prev_bound.value()),
+                          .end_key = FormatKey(*it)});
+      }
+      prev_bound = *it;
+    }
+    // skipped some of them
+    for (int j = 0; j < skip_range_count; j++) {
+      ranges[rnd.Uniform(static_cast<uint32_t>(range_count))].skipped = true;
+    }
+
+    if (kVerbose) {
+      for (auto const& range : ranges) {
+        std::cout << range.ToString() << std::endl;
+      }
+    }
+
+    return ranges;
+  }
+
+  void CreateSstFileWithRanges(const std::string& ingest_file,
+                               const std::vector<DataRange>& ranges,
+                               bool& data_added) {
+    std::unique_ptr<SstFileWriter> writer;
+
+    data_added = false;
+
+    std::vector<DataRange> ranges_in_file;
+
+    for (auto const& range : ranges) {
+      assert(range.start != range.end);
+      if (range.skipped) {
+        continue;
+      }
+
+      if (writer == nullptr) {
+        // lazy create writer until there is data to be written to avoid
+        // unchecked status error
+        writer = std::make_unique<SstFileWriter>(EnvOptions(), options_);
+        ASSERT_OK(writer->Open(ingest_file));
+      }
+
+      ranges_in_file.push_back(range);
+
+      data_added = true;
+
+      if (range.is_range_delete) {
+        ASSERT_OK(writer->DeleteRange(range.start_key, range.end_key));
+      } else {
+        for (size_t i = range.start; i != range.end;) {
+          auto key = FormatKey(i);
+          range.start < range.end ? i++ : i--;
+          ASSERT_OK(writer->Put(key, range.value));
+        }
+      }
+    }
+    if (kVerbose) {
+      std::cout << "Ingested file: " + ingest_file + "; Range: {" << std::endl;
+      for (const auto& range : ranges_in_file) {
+        std::cout << "    " << range.ToString() << "," << std::endl;
+      }
+      std::cout << "}" << std::endl;
+    }
+    if (data_added) {
+      ASSERT_OK(writer->Finish());
+    }
+  }
+
+  void RangeScan(std::unique_ptr<Iterator>& iter,
+                 const std::vector<DataRange>& ranges, Slice& upper_bound,
+                 std::vector<std::pair<std::string, std::string>>& result,
+                 bool use_multi_scan) {
+    ASSERT_NE(iter, nullptr);
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!ranges.empty());
+
+    MultiScanArgs scan_opts(options_.comparator);
+    std::unordered_map<std::string, std::string> property_bag;
+    if (use_multi_scan) {
+      for (auto const& range : ranges) {
+        if (range.skipped) {
+          continue;
+        }
+        property_bag["count"] = std::to_string(range.scan_key_count_limit);
+        scan_opts.insert(range.start_key, range.end_key, property_bag);
+        // print range start end key
+        if (kVerbose) {
+          std::cout << "range start " << range.start_key << " end "
+                    << range.end_key << std::endl;
+        }
+      }
+      iter->Prepare(scan_opts);
+      ASSERT_OK(iter->status());
+    }
+
+    for (auto const& range : ranges) {
+      if (range.skipped) {
+        continue;
+      }
+      size_t scan_key_count = 0;
+      if (kVerbose) {
+        std::cout << "seek key " << range.start_key << std::endl;
+      }
+      upper_bound = range.end_key;
+      for (iter->Seek(range.start_key);
+           iter->Valid() && scan_key_count < range.scan_key_count_limit;
+           iter->Next()) {
+        if (kVerbose) {
+          std::cout << "key " << iter->key().ToString() << " value "
+                    << iter->value().ToString() << std::endl;
+        }
+        result.emplace_back(iter->key().ToString(), iter->value().ToString());
+        scan_key_count++;
+      }
+      ASSERT_OK(iter->status());
+    }
+  }
+
+  void AddDataToRegularCF() {
+    for (auto const& ranges_in_level : ranges_in_levels_) {
+      for (auto const& range : ranges_in_level) {
+        if (!range.skipped) {
+          for (auto i = range.start; i != range.end;
+               range.start < range.end ? i++ : i--) {
+            if (range.is_range_delete) {
+              ASSERT_OK(
+                  db_->Delete(WriteOptions(), regular_cfh_, FormatKey(i)));
+            } else {
+              ASSERT_OK(db_->Put(WriteOptions(), regular_cfh_, FormatKey(i),
+                                 range.value));
+            }
+          }
+        }
+      }
+    }
+    ASSERT_OK(db_->Flush(FlushOptions(), regular_cfh_));
+  }
+
+  void ValidateQueryResult() {
+    // Query both CF with same range scan and validate result are same
+    for (auto i = 0; i < 200; i++) {
+      if (kVerbose) {
+        std::cout << "iteration " << i << std::endl;
+      }
+      SCOPED_TRACE("Iteration " + std::to_string(i));
+      // randomly generate 1 to 3 ranges
+      auto ranges = GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "");
+
+      // Query regular CF
+      std::vector<std::pair<std::string, std::string>> expected_result;
+      Slice upper_bound("");
+      ReadOptions ro;
+      ro.iterate_upper_bound = &upper_bound;
+
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ro, regular_cfh_));
+      ASSERT_NO_FATAL_FAILURE(
+          RangeScan(iter, ranges, upper_bound, expected_result, false));
+      ASSERT_OK(iter->status());
+
+      // Query ingest CF
+      iter.reset(db_->NewIterator(ro, ingest_cfh_));
+      std::vector<std::pair<std::string, std::string>> ingest_cf_result;
+      ASSERT_NO_FATAL_FAILURE(
+          RangeScan(iter, ranges, upper_bound, ingest_cf_result, false));
+
+      ASSERT_EQ(expected_result, ingest_cf_result);
+      ASSERT_OK(iter->status());
+
+      // Query ingest CF with UDI if it is enabled
+      if (enable_udi_) {
+        ro.table_index_factory = user_defined_index_factory_.get();
+      }
+
+      iter.reset(db_->NewIterator(ro, ingest_cfh_));
+      std::vector<std::pair<std::string, std::string>>
+          ingest_cf_multi_scan_result;
+      ASSERT_NO_FATAL_FAILURE(RangeScan(iter, ranges, upper_bound,
+                                        ingest_cf_multi_scan_result, true));
+      ASSERT_EQ(expected_result, ingest_cf_multi_scan_result);
+      ASSERT_OK(iter->status());
+    }
+  }
+
+  void IngestFilesInOneLevel(const std::vector<DataRange>& ranges_in_level,
+                             const std::string& ingest_file_name_prefix,
+                             size_t& ingest_file_count,
+                             const IngestExternalFileOptions& ifo,
+                             bool combine_ranges = false) {
+    // Generate SST file and bulk load them one level at a time
+    std::vector<std::string> ingest_files;
+    if (combine_ranges) {
+      size_t i = 0;
+      while (i < ranges_in_level.size()) {
+        // if combine ranges, generate 1 SST file that combines muliple ranges
+        // together
+        // Randomly combine ranges to SST file.
+        size_t batch_end_idx =
+            std::min(i + rnd.Uniform(3) + 2, ranges_in_level.size());
+        bool data_added = false;
+        ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges(
+            ingest_file_name_prefix + std::to_string(ingest_file_count),
+            {ranges_in_level.begin() + i,
+             ranges_in_level.begin() + batch_end_idx},
+            data_added));
+        if (data_added) {
+          ingest_files.push_back(ingest_file_name_prefix +
+                                 std::to_string(ingest_file_count));
+          ingest_file_count++;
+        }
+        i = batch_end_idx;
+      }
+    } else {
+      for (auto const& range : ranges_in_level) {
+        if (!range.skipped) {
+          bool data_added = false;
+          ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges(
+              ingest_file_name_prefix + std::to_string(ingest_file_count),
+              {range}, data_added));
+          ASSERT_TRUE(data_added);
+          ingest_files.push_back(ingest_file_name_prefix +
+                                 std::to_string(ingest_file_count));
+          ingest_file_count++;
+        }
+      }
+    }
+
+    ASSERT_OK(db_->IngestExternalFile(ingest_cfh_, ingest_files, ifo));
+  }
+
+  void IngestDataToCF() {
+    IngestExternalFileOptions ifo;
+    ifo.snapshot_consistency = false;
+    auto ingest_file_name_prefix = dbname_ + "ingest_file_";
+    size_t ingest_file_count = 0;
+    for (auto const& ranges_in_level : ranges_in_levels_) {
+      ASSERT_NO_FATAL_FAILURE(IngestFilesInOneLevel(
+          ranges_in_level, ingest_file_name_prefix, ingest_file_count, ifo));
+    }
+
+    ASSERT_GE(ingest_file_count, 0);
+  }
+
+  void CompactIngestedCF() {
+    auto s = db_->CompactRange(
+        {.exclusive_manual_compaction = true,
+         .bottommost_level_compaction = BottommostLevelCompaction::kForce},
+        ingest_cfh_, nullptr, nullptr);
+    ASSERT_OK(s);
+  }
+};
+
+TEST_P(UserDefinedIndexStressTest, PartialDeleteRange) {
+  // Create 2 column families. One use normal put/del, the other uses sst
+  // ingest Randomly generate multiple non overlapping range for multiple
+  // levels Range scan same range between the 2 CF and validate the result is
+  // same
+  SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
+  dbname_ =
+      test::PerThreadDBPath("UserDefinedIndexStressTest_PartialDeleteRange");
+  SCOPED_TRACE("dbname: " + dbname_);
+  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
+
+  if (enable_udi_) {
+    // Skip UDI for now.
+    // The issue is that with UDI enabled, prepare might not prepare enough keys
+    // at lower level due to range delete from upper level.
+    // E.g. consider a LSM tree:
+    // L1: Data         [0-1]
+    // L2: Delete Range [0-6]
+    // L3: Data         [0-9]
+    // When multiscan queries range [0-9) with UDI count as 3, the L3 file
+    // will only prepare range [0-3). However, this range is masked out by upper
+    // layer delete range from [0-6] from L2. This causes query to only return
+    // [0,1], while [0,1,7] is the right result. Until prepare is able to
+    // preparing additional block supported, UDI is skipped.
+    return;
+  }
+
+  for (int i = 0; i < 5; i++) {
+    ranges_in_levels_.push_back(
+        GenerateKeyRanges(rnd.Uniform(3) + 4, 2,
+                          "L" + std::to_string(options_.num_levels - 1 - i)));
+  }
+
+  ASSERT_NO_FATAL_FAILURE(IngestDataToCF());
+
+  if (enable_compaction_with_sst_partitioner_) {
+    ASSERT_NO_FATAL_FAILURE(CompactIngestedCF());
+  }
+
+  ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF());
+
+  ASSERT_NO_FATAL_FAILURE(ValidateQueryResult());
+}
+
+TEST_P(UserDefinedIndexStressTest, DeleteRangeMixedWithDataFile) {
+  // Create 2 column families. One use normal put/del, the other uses sst
+  // ingest.
+  // Test the case where there are 3 levels, the middle level is a delete
+  // range file that span across the entire key space. The top and bottom level
+  // file have multiple files and each one has both data and delete range. Scan
+  // same range between the 2 CF and validate the result is same
+  SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
+  dbname_ = test::PerThreadDBPath(
+      "UserDefinedIndexStressTest_DeleteRangeMixedWithDataFile");
+  SCOPED_TRACE("dbname: " + dbname_);
+  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
+
+  // Test 3 levels.
+  // Bottom level is mixed data with delete range.
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 6, 2, "L6"));
+  // Middle level delete range across entire key space.
+  if (is_reverse_comparator_) {
+    ranges_in_levels_.push_back({{.start = 100,
+                                  .end = 0,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "keyz",
+                                  .end_key = "key"}});
+  } else {
+    ranges_in_levels_.push_back({{.start = 0,
+                                  .end = 100,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "key",
+                                  .end_key = "keyz"}});
+  }
+
+  // Top level is mixed data with delete range.
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 6, 2, "L4"));
+
+  IngestExternalFileOptions ifo;
+  ifo.snapshot_consistency = false;
+  auto ingest_file_name_prefix = dbname_ + "ingest_file_";
+  size_t ingest_file_count = 0;
+  auto first_level = true;
+  for (auto const& ranges_in_level : ranges_in_levels_) {
+    ASSERT_NO_FATAL_FAILURE(
+        IngestFilesInOneLevel(ranges_in_level, ingest_file_name_prefix,
+                              ingest_file_count, ifo, /*combine_ranges=*/true));
+    if (first_level) {
+      first_level = false;
+      if (enable_compaction_with_sst_partitioner_) {
+        // When compaction is enabled, do a compaction at the first level
+        ASSERT_NO_FATAL_FAILURE(CompactIngestedCF());
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF());
+
+  ASSERT_NO_FATAL_FAILURE(ValidateQueryResult());
+}
+
+TEST_P(UserDefinedIndexStressTest, DeleteRange) {
+  // Create 2 column families. One use normal put/del, the other uses sst
+  // ingest.
+  // Test the case where there are 3 levels, the middle level is a delete
+  // range file that span across the entire key space. Range scan same range
+  // between the 2 CF and validate the result is same
+  SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
+  dbname_ = test::PerThreadDBPath("UserDefinedIndexStressTest_DeleteRange");
+  SCOPED_TRACE("dbname: " + dbname_);
+  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
+
+  // Test 3 levels.
+  // bottom level constains multiple files, each could have data or delete
+  // ranges or both.
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6"));
+  // middle level delete range across entire key space
+  if (is_reverse_comparator_) {
+    ranges_in_levels_.push_back({{.start = 100,
+                                  .end = 0,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "keyz",
+                                  .end_key = "key"}});
+  } else {
+    ranges_in_levels_.push_back({{.start = 0,
+                                  .end = 100,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "key",
+                                  .end_key = "keyz"}});
+  }
+  // Top level constains multiple files, each could have data or delete
+  // ranges or both.
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4"));
+
+  IngestExternalFileOptions ifo;
+  ifo.snapshot_consistency = false;
+  auto ingest_file_name_prefix = dbname_ + "ingest_file_";
+  size_t ingest_file_count = 0;
+  auto first_level = true;
+  for (auto const& ranges_in_level : ranges_in_levels_) {
+    ASSERT_NO_FATAL_FAILURE(IngestFilesInOneLevel(
+        ranges_in_level, ingest_file_name_prefix, ingest_file_count, ifo));
+    if (first_level) {
+      first_level = false;
+      if (enable_compaction_with_sst_partitioner_) {
+        // When compaction is enabled, do a compaction at the first level
+        ASSERT_NO_FATAL_FAILURE(CompactIngestedCF());
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF());
+
+  ASSERT_NO_FATAL_FAILURE(ValidateQueryResult());
+}
+
+TEST_P(UserDefinedIndexStressTest, AtomicReplaceBulkLoad) {
+  // Create 2 column families. One use normal put/del, the other uses SST
+  // ingest. The SST ingest uses atomic range replace.
+  SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
+  dbname_ =
+      test::PerThreadDBPath("UserDefinedIndexStressTest_AtomicReplaceBulkLoad");
+  SCOPED_TRACE("dbname: " + dbname_);
+  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
+
+  // Test 3 levels.
+  // bottom level constains multiple files, each could have data or delete
+  // ranges or both.
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6"));
+  // middle level delete range across entire key space
+  if (is_reverse_comparator_) {
+    ranges_in_levels_.push_back({{.start = 100,
+                                  .end = 0,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "keyz",
+                                  .end_key = "key"}});
+  } else {
+    ranges_in_levels_.push_back({{.start = 0,
+                                  .end = 100,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "key",
+                                  .end_key = "keyz"}});
+  }
+  // Top level constains multiple files, each could have data or delete
+  // ranges or both.
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4"));
+
+  IngestExternalFileOptions ifo;
+  ifo.snapshot_consistency = false;
+  auto ingest_file_name_prefix = dbname_ + "ingest_file_";
+  size_t ingest_file_count = 0;
+  auto first_level = true;
+  for (auto const& ranges_in_level : ranges_in_levels_) {
+    ASSERT_NO_FATAL_FAILURE(IngestFilesInOneLevel(
+        ranges_in_level, ingest_file_name_prefix, ingest_file_count, ifo));
+    if (first_level) {
+      first_level = false;
+      if (enable_compaction_with_sst_partitioner_) {
+        // When compaction is enabled, do a compaction at the first level
+        ASSERT_NO_FATAL_FAILURE(CompactIngestedCF());
+      }
+    }
+  }
+
+  // Ingest the a new file with atomic replace with full key space, this layer
+  // is exactly same as the one at the top level
+  bool data_added;
+  ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges(
+      ingest_file_name_prefix + std::to_string(++ingest_file_count),
+      ranges_in_levels_[2], data_added));
+
+  IngestExternalFileArg ingest_arg;
+  ingest_arg.column_family = ingest_cfh_;
+  ingest_arg.options = ifo;
+  ingest_arg.external_files.push_back(ingest_file_name_prefix +
+                                      std::to_string(ingest_file_count));
+  ingest_arg.atomic_replace_range = RangeOpt(nullptr, nullptr);
+
+  ASSERT_OK(db_->IngestExternalFiles(
+      std::vector<IngestExternalFileArg>({ingest_arg})));
+
+  ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF());
+
+  ASSERT_NO_FATAL_FAILURE(ValidateQueryResult());
+}
+
+INSTANTIATE_TEST_CASE_P(
+    UserDefinedIndexStressTest, UserDefinedIndexStressTest,
+    testing::Combine(testing::Values(BytewiseComparator(),
+                                     ReverseBytewiseComparator()),
+                     testing::Bool(), testing::Bool()));
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  // Opt-in this whole test file
+  ROCKSDB_NAMESPACE::TEST_AllowUnsupportedFormatVersion() = true;
 
-int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/table/unique_id.cc b/table/unique_id.cc
index 8bfa8bcfd383..6da691082770 100644
--- a/table/unique_id.cc
+++ b/table/unique_id.cc
@@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE {
 
 std::string EncodeSessionId(uint64_t upper, uint64_t lower) {
   std::string db_session_id(20U, '\0');
-  char *buf = db_session_id.data();
+  char* buf = db_session_id.data();
   // Preserving `lower` is slightly tricky. 36^12 is slightly more than
   // 62 bits, so we use 12 chars plus the bottom two bits of one more.
   // (A tiny fraction of 20 digit strings go unused.)
@@ -26,8 +26,8 @@ std::string EncodeSessionId(uint64_t upper, uint64_t lower) {
   return db_session_id;
 }
 
-Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
-                       uint64_t *lower) {
+Status DecodeSessionId(const std::string& db_session_id, uint64_t* upper,
+                       uint64_t* lower) {
   const size_t len = db_session_id.size();
   if (len == 0) {
     return Status::NotSupported("Missing db_session_id");
@@ -41,7 +41,7 @@ Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
     return Status::NotSupported("Too long db_session_id");
   }
   uint64_t a = 0, b = 0;
-  const char *buf = &db_session_id.front();
+  const char* buf = &db_session_id.front();
   bool success = ParseBaseChars<36>(&buf, len - 12U, &a);
   if (!success) {
     return Status::NotSupported("Bad digit in db_session_id");
@@ -56,8 +56,8 @@ Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
   return Status::OK();
 }
 
-Status GetSstInternalUniqueId(const std::string &db_id,
-                              const std::string &db_session_id,
+Status GetSstInternalUniqueId(const std::string& db_id,
+                              const std::string& db_session_id,
                               uint64_t file_number, UniqueIdPtr out,
                               bool force) {
   if (!force) {
@@ -160,11 +160,11 @@ std::string EncodeUniqueIdBytes(UniqueIdPtr in) {
   return ret;
 }
 
-Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out) {
+Status DecodeUniqueIdBytes(const std::string& unique_id, UniqueIdPtr out) {
   if (unique_id.size() != (out.extended ? 24 : 16)) {
     return Status::NotSupported("Not a valid unique_id");
   }
-  const char *buf = &unique_id.front();
+  const char* buf = &unique_id.front();
   out.ptr[0] = DecodeFixed64(&buf[0]);
   out.ptr[1] = DecodeFixed64(&buf[8]);
   if (out.extended) {
@@ -174,8 +174,8 @@ Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out) {
 }
 
 template <typename ID>
-Status GetUniqueIdFromTablePropertiesHelper(const TableProperties &props,
-                                            std::string *out_id) {
+Status GetUniqueIdFromTablePropertiesHelper(const TableProperties& props,
+                                            std::string* out_id) {
   ID tmp{};
   Status s = GetSstInternalUniqueId(props.db_id, props.db_session_id,
                                     props.orig_file_number, &tmp);
@@ -188,23 +188,27 @@ Status GetUniqueIdFromTablePropertiesHelper(const TableProperties &props,
   return s;
 }
 
-Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props,
-                                              std::string *out_id) {
+Status GetExtendedUniqueIdFromTableProperties(const TableProperties& props,
+                                              std::string* out_id) {
   return GetUniqueIdFromTablePropertiesHelper<UniqueId64x3>(props, out_id);
 }
 
-Status GetUniqueIdFromTableProperties(const TableProperties &props,
-                                      std::string *out_id) {
+Status GetUniqueIdFromTableProperties(const TableProperties& props,
+                                      std::string* out_id) {
   return GetUniqueIdFromTablePropertiesHelper<UniqueId64x2>(props, out_id);
 }
 
-std::string UniqueIdToHumanString(const std::string &id) {
-  // Not so efficient, but that's OK
-  std::string str = Slice(id).ToString(/*hex*/ true);
-  for (size_t i = 16; i < str.size(); i += 17) {
-    str.insert(i, "-");
+std::string UniqueIdToHumanString(const std::string& id) {
+  std::string hex = Slice(id).ToString(/*hex*/ true);
+  std::string result;
+  result.reserve(hex.size() + hex.size() / 16);
+  for (size_t i = 0; i < hex.size(); i++) {
+    if (i > 0 && i % 16 == 0) {
+      result.push_back('-');
+    }
+    result.push_back(hex[i]);
   }
-  return str;
+  return result;
 }
 
 std::string InternalUniqueIdToHumanString(UniqueIdPtr in) {
diff --git a/table/unique_id_impl.h b/table/unique_id_impl.h
index 6e3dc62c794d..47d10c9712be 100644
--- a/table/unique_id_impl.h
+++ b/table/unique_id_impl.h
@@ -26,14 +26,14 @@ constexpr UniqueId64x3 kNullUniqueId64x3 = {};
 
 // Dynamic pointer wrapper for one of the two above
 struct UniqueIdPtr {
-  uint64_t *ptr = nullptr;
+  uint64_t* ptr = nullptr;
   bool extended = false;
 
-  /*implicit*/ UniqueIdPtr(UniqueId64x2 *id) {
+  /*implicit*/ UniqueIdPtr(UniqueId64x2* id) {
     ptr = (*id).data();
     extended = false;
   }
-  /*implicit*/ UniqueIdPtr(UniqueId64x3 *id) {
+  /*implicit*/ UniqueIdPtr(UniqueId64x3* id) {
     ptr = (*id).data();
     extended = true;
   }
@@ -45,8 +45,8 @@ struct UniqueIdPtr {
 // unique id, so can be manipulated in more ways but very carefully.
 // These must be long term stable to ensure GetUniqueIdFromTableProperties
 // is long term stable.
-Status GetSstInternalUniqueId(const std::string &db_id,
-                              const std::string &db_session_id,
+Status GetSstInternalUniqueId(const std::string& db_id,
+                              const std::string& db_session_id,
                               uint64_t file_number, UniqueIdPtr out,
                               bool force = false);
 
@@ -66,7 +66,7 @@ void ExternalUniqueIdToInternal(UniqueIdPtr in_out);
 std::string EncodeUniqueIdBytes(UniqueIdPtr in);
 
 // Reverse of EncodeUniqueIdBytes.
-Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out);
+Status DecodeUniqueIdBytes(const std::string& unique_id, UniqueIdPtr out);
 
 // For presenting internal IDs for debugging purposes. Visually distinct from
 // UniqueIdToHumanString for external IDs.
@@ -87,7 +87,7 @@ std::string EncodeSessionId(uint64_t upper, uint64_t lower);
 // Reverse of EncodeSessionId. Returns NotSupported on error rather than
 // Corruption because non-standard session IDs should be allowed with degraded
 // functionality.
-Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
-                       uint64_t *lower);
+Status DecodeSessionId(const std::string& db_session_id, uint64_t* upper,
+                       uint64_t* lower);
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/test_util/sync_point.cc b/test_util/sync_point.cc
index bec02d4f67a3..2b9ab2f69625 100644
--- a/test_util/sync_point.cc
+++ b/test_util/sync_point.cc
@@ -79,4 +79,8 @@ void SetupSyncPointsToMockDirectIO() {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
 #endif
 }
+
+#ifndef NDEBUG
+std::atomic<int> g_throw_on_testable_assertion_failure{0};
+#endif  // NDEBUG
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/test_util/sync_point.h b/test_util/sync_point.h
index 6022073e573a..081e90cb1231 100644
--- a/test_util/sync_point.h
+++ b/test_util/sync_point.h
@@ -6,10 +6,9 @@
 
 #include <assert.h>
 
+#include <atomic>
 #include <functional>
-#include <mutex>
 #include <string>
-#include <thread>
 #include <vector>
 
 #include "rocksdb/rocksdb_namespace.h"
@@ -180,3 +179,37 @@ void SetupSyncPointsToMockDirectIO();
     }                                               \
   }
 #endif  // NDEBUG
+
+// An alternative to assert() that is more test-friendly than using
+// ASSERT_DEATH. Relies on exception propagation.
+#ifdef NDEBUG
+#define testable_assert(cond)
+#else
+namespace ROCKSDB_NAMESPACE {
+// Intentionally not based on std::exception to reduce places where this
+// would be caught
+struct TestableAssertionFailure {};
+// Tracks whether to throw on testable_assert failure instead of aborting.
+// This is an atomic counter for re-entrancy / thread-safety.
+extern std::atomic<int> g_throw_on_testable_assertion_failure;
+}  // namespace ROCKSDB_NAMESPACE
+#define testable_assert(cond)                                          \
+  do {                                                                 \
+    if (ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.load( \
+            std::memory_order_relaxed) > 0) {                          \
+      if (cond) {                                                      \
+      } else                                                           \
+        throw ROCKSDB_NAMESPACE::TestableAssertionFailure();           \
+    } else {                                                           \
+      assert(cond);                                                    \
+    }                                                                  \
+  } while (0)  // require ; in caller
+#define ASSERT_TESTABLE_FAILURE(expr)                                   \
+  do {                                                                  \
+    ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.fetch_add( \
+        1, std::memory_order_relaxed);                                  \
+    ASSERT_THROW(expr, ROCKSDB_NAMESPACE::TestableAssertionFailure);    \
+    ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.fetch_sub( \
+        1, std::memory_order_relaxed);                                  \
+  } while (0)  // require ; in caller
+#endif
diff --git a/test_util/testutil.cc b/test_util/testutil.cc
index 35884a7b3789..f9f9e0bf680a 100644
--- a/test_util/testutil.cc
+++ b/test_util/testutil.cc
@@ -29,6 +29,7 @@
 #include "test_util/mock_time_env.h"
 #include "test_util/sync_point.h"
 #include "util/random.h"
+#include "util/string_util.h"
 
 #ifndef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
 void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
@@ -44,7 +45,7 @@ const std::set<uint32_t> kFooterFormatVersionsToTest{
     6U,
     // In case any interesting future changes
     kDefaultFormatVersion,
-    kLatestFormatVersion,
+    kLatestBbtFormatVersion,
 };
 const ReadOptionsNoIo kReadOptionsNoIo;
 
@@ -91,9 +92,9 @@ bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode) {
   return test_mode != UserDefinedTimestampTestMode::kStripUserDefinedTimestamp;
 }
 
-Slice CompressibleString(Random* rnd, double compressed_fraction, int len,
+Slice CompressibleString(Random* rnd, double compressed_to_fraction, int len,
                          std::string* dst) {
-  int raw = static_cast<int>(len * compressed_fraction);
+  int raw = static_cast<int>(len * compressed_to_fraction);
   if (raw < 1) {
     raw = 1;
   }
@@ -311,7 +312,6 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
   db_opt->track_and_verify_wals = rnd->Uniform(2);
   db_opt->verify_sst_unique_id_in_manifest = rnd->Uniform(2);
   db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
-  db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2);
   db_opt->use_adaptive_mutex = rnd->Uniform(2);
   db_opt->use_fsync = rnd->Uniform(2);
   db_opt->recycle_log_file_num = rnd->Uniform(2);
@@ -386,7 +386,6 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options,
   cf_opt->level0_stop_writes_trigger = rnd->Uniform(100);
   cf_opt->max_bytes_for_level_multiplier = rnd->Uniform(100);
   cf_opt->max_write_buffer_number = rnd->Uniform(100);
-  cf_opt->max_write_buffer_number_to_maintain = rnd->Uniform(100);
   cf_opt->max_write_buffer_size_to_maintain = rnd->Uniform(10000);
   cf_opt->min_write_buffer_number_to_merge = rnd->Uniform(100);
   cf_opt->num_levels = rnd->Uniform(100);
diff --git a/test_util/testutil.h b/test_util/testutil.h
index 2d693b5f201f..c07b0139a4d4 100644
--- a/test_util/testutil.h
+++ b/test_util/testutil.h
@@ -23,6 +23,7 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/table.h"
 #include "table/internal_iterator.h"
+#include "util/defer.h"
 #include "util/mutexlock.h"
 
 #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
@@ -71,9 +72,16 @@ bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode);
 // Store in *dst a string of length "len" that will compress to
 // "N*compressed_fraction" bytes and return a Slice that references
 // the generated data.
-Slice CompressibleString(Random* rnd, double compressed_fraction, int len,
+Slice CompressibleString(Random* rnd, double compressed_to_fraction, int len,
                          std::string* dst);
 
+inline std::string CompressibleString(Random* rnd,
+                                      double compressed_to_fraction, int len) {
+  std::string dst;
+  CompressibleString(rnd, compressed_to_fraction, len, &dst);
+  return dst;
+}
+
 #ifndef NDEBUG
 // An internal comparator that just forward comparing results from the
 // user comparator in it. Can be used to test entities that have no dependency
@@ -359,6 +367,11 @@ class StringSource : public FSRandomAccessFile {
 
   void set_total_reads(int tr) { total_reads_ = tr; }
 
+  IOStatus GetFileSize(uint64_t* file_size) override {
+    *file_size = contents_.size();
+    return IOStatus::OK();
+  }
+
  private:
   std::string contents_;
   uint64_t uniq_id_;
@@ -731,6 +744,149 @@ class StringFS : public FileSystemWrapper {
   std::unordered_map<std::string, std::string> files_;
 };
 
+// A compressor that essentially implements a custom compression algorithm
+// by leveraging an existing compression algorithm and putting a custom header
+// on it to detect any attempts to decompress it with the wrong compression
+// type or dictionary.
+template <CompressionType kCompression>
+struct CompressorCustomAlg : public CompressorWrapper {
+  static bool Supported() { return LZ4_Supported(); }
+
+  explicit CompressorCustomAlg(
+      std::unique_ptr<Compressor> wrapped =
+          GetBuiltinV2CompressionManager()->GetCompressor({}, kLZ4Compression))
+      : CompressorWrapper(std::move(wrapped)),
+        dictionary_hash_(GetSliceHash(wrapped_->GetSerializedDict())) {
+    static_assert(kCompression > kLastBuiltinCompression);
+  }
+
+  const char* Name() const override { return "CompressorCustomAlg"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kCompression;
+  }
+
+  std::unique_ptr<Compressor> Clone() const override {
+    return std::make_unique<CompressorCustomAlg>(wrapped_->Clone());
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* working_area) override {
+    size_t allowed_output_size = *compressed_output_size;
+    Status s = wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                       compressed_output_size,
+                                       out_compression_type, working_area);
+    if (s.ok() && *out_compression_type != kNoCompression) {
+      assert(*out_compression_type == kLZ4Compression);
+      if (*compressed_output_size + 5 > allowed_output_size) {
+        *out_compression_type = kNoCompression;
+        return Status::OK();
+      }
+      // Generate & insert header
+      std::memmove(compressed_output + 5, compressed_output,
+                   *compressed_output_size);
+      compressed_output[0] = lossless_cast<char>(kCompression);
+      EncodeFixed32(&compressed_output[1], dictionary_hash_);
+      *compressed_output_size += 5;
+      *out_compression_type = kCompression;
+    }
+    return s;
+  }
+
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const override {
+    auto clone =
+        wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_config));
+    return std::make_unique<CompressorCustomAlg>(std::move(clone));
+  }
+
+ protected:
+  uint32_t dictionary_hash_;
+};
+
+// A decompressor suitable for all the instantiable CompressorCustomAlg
+// implementations. Can be configured to check that it is only used to
+// decompress certain types using SetAllowedTypes().
+struct DecompressorCustomAlg : public DecompressorWrapper {
+  using TypeSet = SmallEnumSet<CompressionType, kDisableCompressionOption>;
+
+  DecompressorCustomAlg(std::shared_ptr<Decompressor> wrapped =
+                            GetBuiltinV2CompressionManager()->GetDecompressor())
+      : DecompressorWrapper(std::move(wrapped)),
+        dictionary_hash_(GetSliceHash(wrapped_->GetSerializedDict())),
+        allowed_types_(TypeSet::All()) {}
+
+  const char* Name() const override { return "DecompressorCustomAlg"; }
+
+  Status MaybeCloneForDict(const Slice& serialized_dict,
+                           std::unique_ptr<Decompressor>* out) override {
+    Status s = wrapped_->MaybeCloneForDict(serialized_dict, out);
+    if (s.ok()) {
+      assert(*out != nullptr);
+      auto clone = std::make_unique<DecompressorCustomAlg>(std::move(*out));
+      clone->SetAllowedTypes(allowed_types_);
+      *out = std::move(clone);
+      assert(out->get()->GetSerializedDict() == serialized_dict);
+    } else {
+      assert(*out == nullptr);
+    }
+    return s;
+  }
+
+  Status ExtractUncompressedSize(Args& args) override {
+    if (args.compression_type >= kFirstCustomCompression &&
+        args.compression_type <= kLastCustomCompression) {
+      assert(args.compressed_data.size() > 0);
+      assert(args.compressed_data[0] ==
+             lossless_cast<char>(args.compression_type));
+      assert(DecodeFixed32(args.compressed_data.data() + 1) ==
+             dictionary_hash_);
+      // Strip off our header because ExtractUncompressedSize() is also going
+      // to strip off the uncompressed size data.
+      args.compressed_data.remove_prefix(5);
+      // It's ok to modify other parts of args if we restore to original
+      SaveAndRestore<CompressionType> save_compression_type(
+          &args.compression_type, kLZ4Compression);
+      return wrapped_->ExtractUncompressedSize(args);
+    } else {
+      // Also support built-in compressions
+      return wrapped_->ExtractUncompressedSize(args);
+    }
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    if (args.compression_type >= kFirstCustomCompression &&
+        args.compression_type <= kLastCustomCompression) {
+      // Also allowed to copy args and modify
+      Args modified_args = args;
+      modified_args.compression_type = kLZ4Compression;
+      return wrapped_->DecompressBlock(modified_args, uncompressed_output);
+    } else {
+      // Also support built-in compressions
+      return wrapped_->DecompressBlock(args, uncompressed_output);
+    }
+  }
+
+  void SetAllowedTypes(const CompressionType* types_begin,
+                       const CompressionType* types_end) {
+    TypeSet allowed_types;
+    for (auto type = types_begin; type != types_end; ++type) {
+      allowed_types.Add(*type);
+    }
+    allowed_types_ = std::move(allowed_types);
+  }
+
+  void SetAllowedTypes(TypeSet allowed_types) {
+    allowed_types_ = std::move(allowed_types);
+  }
+
+ protected:
+  uint32_t dictionary_hash_;
+  SmallEnumSet<CompressionType, kDisableCompressionOption> allowed_types_;
+};
+
 // Randomly initialize the given DBOptions
 void RandomInitDBOptions(DBOptions* db_opt, Random* rnd);
 
diff --git a/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc b/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc
index b19c9f2a8115..f3c10c469daf 100644
--- a/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc
+++ b/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc
@@ -477,6 +477,38 @@ GTEST_DECLARE_bool_(death_test_use_fork);
 
 namespace internal {
 
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+                                         const char* rhs_expression,
+                                         RawType lhs_value, RawType rhs_value) {
+  const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
+
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  ::std::stringstream lhs_ss;
+  lhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << lhs_value;
+
+  ::std::stringstream rhs_ss;
+  rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << rhs_value;
+
+  return EqFailure(lhs_expression, rhs_expression,
+                   StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
+                   false);
+}
+
+template
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+                                         const char* rhs_expression,
+                                         float lhs_value, float rhs_value);
+template
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+                                         const char* rhs_expression,
+                                         double lhs_value, double rhs_value);
+
 // The value of GetTestTypeId() as seen from within the Google Test
 // library.  This is solely for testing GetTestTypeId().
 GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
diff --git a/third-party/gtest-1.8.1/fused-src/gtest/gtest.h b/third-party/gtest-1.8.1/fused-src/gtest/gtest.h
index 2d82d8e4d0b1..f6e3fabed005 100644
--- a/third-party/gtest-1.8.1/fused-src/gtest/gtest.h
+++ b/third-party/gtest-1.8.1/fused-src/gtest/gtest.h
@@ -3973,7 +3973,7 @@ const char* StringFromGTestEnv(const char* flag, const char* default_val);
 #include <ctype.h>
 #include <float.h>
 #include <string.h>
-#include <iomanip>
+// #include <iomanip> // Not included in newer versions of gtest
 #include <limits>
 #include <map>
 #include <set>
@@ -21451,27 +21451,7 @@ template <typename RawType>
 AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
                                          const char* rhs_expression,
                                          RawType lhs_value,
-                                         RawType rhs_value) {
-  const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
-
-  if (lhs.AlmostEquals(rhs)) {
-    return AssertionSuccess();
-  }
-
-  ::std::stringstream lhs_ss;
-  lhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-         << lhs_value;
-
-  ::std::stringstream rhs_ss;
-  rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-         << rhs_value;
-
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   StringStreamToString(&lhs_ss),
-                   StringStreamToString(&rhs_ss),
-                   false);
-}
+                                         RawType rhs_value);
 
 // Helper function for implementing ASSERT_NEAR.
 //
diff --git a/tools/blob_dump.cc b/tools/blob_dump.cc
index 23b5f8f7903a..520b194ee1a2 100644
--- a/tools/blob_dump.cc
+++ b/tools/blob_dump.cc
@@ -27,12 +27,10 @@ int main(int argc, char** argv) {
       {"file", required_argument, nullptr, 'f'},
       {"show_key", optional_argument, nullptr, 'k'},
       {"show_blob", optional_argument, nullptr, 'b'},
-      {"show_uncompressed_blob", optional_argument, nullptr, 'r'},
       {"show_summary", optional_argument, nullptr, 's'},
   };
   DisplayType show_key = DisplayType::kRaw;
   DisplayType show_blob = DisplayType::kNone;
-  DisplayType show_uncompressed_blob = DisplayType::kNone;
   bool show_summary = false;
   std::string file;
   while (true) {
@@ -47,7 +45,6 @@ int main(int argc, char** argv) {
                 "Usage: blob_dump --file=filename "
                 "[--show_key[=none|raw|hex|detail]] "
                 "[--show_blob[=none|raw|hex|detail]] "
-                "[--show_uncompressed_blob[=none|raw|hex|detail]] "
                 "[--show_summary]\n");
         return 0;
       case 'f':
@@ -73,17 +70,6 @@ int main(int argc, char** argv) {
           show_blob = DisplayType::kHex;
         }
         break;
-      case 'r':
-        if (optarg) {
-          if (display_types.count(arg_str) == 0) {
-            fprintf(stderr, "Unrecognized blob display type.\n");
-            return -1;
-          }
-          show_uncompressed_blob = display_types.at(arg_str);
-        } else {
-          show_uncompressed_blob = DisplayType::kHex;
-        }
-        break;
       case 's':
         show_summary = true;
         break;
@@ -93,8 +79,7 @@ int main(int argc, char** argv) {
     }
   }
   BlobDumpTool tool;
-  Status s =
-      tool.Run(file, show_key, show_blob, show_uncompressed_blob, show_summary);
+  Status s = tool.Run(file, show_key, show_blob, show_summary);
   if (!s.ok()) {
     fprintf(stderr, "Failed: %s\n", s.ToString().c_str());
     return -1;
diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
index 77a6d1b2bb3b..146e1d5c174e 100644
--- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
+++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
@@ -26,6 +26,7 @@ int main() {
 #include "test_util/testutil.h"
 #include "tools/block_cache_analyzer/block_cache_trace_analyzer.h"
 #include "trace_replay/block_cache_tracer.h"
+#include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -88,7 +89,7 @@ class BlockCacheTracerTest : public testing::Test {
       case 4:
         return TableReaderCaller::kUserIterator;
     }
-    // This cannot happend.
+    // This cannot happen.
     assert(false);
     return TableReaderCaller::kMaxBlockCacheLookupCaller;
   }
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index b137fcc2a922..44c513caf2f5 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -11,6 +11,8 @@
 # Return value 0 means all regression tests pass. 1 if not pass.
 #
 # Environment options:
+#  SANITY_CHECK=1 - Do a syntax check and git checkout test as a sanity check
+#    that the script hasn't been broken by e.g. adding a new release wrongly.
 #  SHORT_TEST=1 - Test only the oldest branch for each kind of test. This is
 #    a good choice for PR validation as it is relatively fast and will find
 #    most issues.
@@ -135,7 +137,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb")
+declare -a db_forward_with_options_refs=("10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb" "10.8.fb" "10.9.fb" "10.10.fb" "10.11.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
@@ -143,7 +145,7 @@ declare -a db_forward_no_options_refs=() # N/A at the moment
 # To check for SST ingestion backward compatibility (new version reading
 # data from old) (ldb ingest_extern_sst added in 5.16.x, back-ported to
 # 5.14.x, 5.15.x)
-declare -a ext_backward_only_refs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb" "6.6.fb" "6.7.fb" "6.8.fb" "6.9.fb" "6.10.fb" "6.11.fb" "6.12.fb" "6.13.fb" "6.14.fb" "6.15.fb" "6.16.fb" "6.17.fb" "6.18.fb" "6.19.fb" "6.20.fb" "6.21.fb" "6.22.fb" "6.23.fb" "6.24.fb" "6.25.fb" "6.26.fb" "6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb" "7.10.fb" "8.0.fb" "8.1.fb" "8.2.fb" "8.3.fb" "8.4.fb" "8.5.fb")
+declare -a ext_backward_only_refs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb" "6.6.fb" "6.7.fb" "6.8.fb" "6.9.fb" "6.10.fb" "6.11.fb" "6.12.fb" "6.13.fb" "6.14.fb" "6.15.fb" "6.16.fb" "6.17.fb" "6.18.fb" "6.19.fb" "6.20.fb" "6.21.fb" "6.22.fb" "6.23.fb" "6.24.fb" "6.25.fb" "6.26.fb" "6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb" "7.10.fb" "8.0.fb" "8.1.fb" "8.2.fb" "8.3.fb" "8.4.fb" "8.5.fb" "8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb")
 # To check for SST ingestion forward compatibility (old version reading
 # data from new) as well as backward compatibility
 declare -a ext_forward_refs=("${db_forward_no_options_refs[@]}" "${db_forward_with_options_refs[@]}")
@@ -157,8 +159,9 @@ declare -a bak_forward_refs=("${db_forward_no_options_refs[@]}" "${db_forward_wi
 
 # Branches (git refs) to check for DB backward compatibility (new version
 # reading data from old) (in addition to the "forward compatible" list)
-# NOTE: 2.7.fb.branch shows assertion violation in some configurations
-declare -a db_backward_only_refs=("2.2.fb.branch" "2.3.fb.branch" "2.4.fb.branch" "2.5.fb.branch" "2.6.fb.branch" "2.8.1.fb" "3.0.fb.branch" "3.1.fb" "3.2.fb" "3.3.fb" "3.4.fb" "3.5.fb" "3.6.fb" "3.7.fb" "3.8.fb" "3.9.fb" "4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "${bak_backward_only_refs[@]}")
+# NOTE: format_version < 2 support was removed, so we only test back to 4.6.fb
+# (when format_version=2 became the default)
+declare -a db_backward_only_refs=("4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "${bak_backward_only_refs[@]}")
 
 if [ "$SHORT_TEST" ]; then
   # Use only the first (if exists) of each list
@@ -195,10 +198,14 @@ if [ "$SHORT_TEST" == "" ]; then
   done
 fi
 
+invoke_make()
+{
+    [ "$SANITY_CHECK" ] || make "$@"
+}
 generate_db()
 {
     set +e
-    bash "$script_copy_dir"/generate_random_db.sh "$1" "$2"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/generate_random_db.sh "$1" "$2"
     if [ $? -ne 0 ]; then
         echo ==== Error loading data from $2 to $1 ====
         exit 1
@@ -209,7 +216,7 @@ generate_db()
 compare_db()
 {
     set +e
-    bash "$script_copy_dir"/verify_random_db.sh "$1" "$2" "$3" "$4" "$5"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/verify_random_db.sh "$1" "$2" "$3" "$4" "$5"
     if [ $? -ne 0 ]; then
         echo ==== Read different content from $1 and $2 or error happened. ====
         exit 1
@@ -217,10 +224,21 @@ compare_db()
     set -e
 }
 
+compact_db()
+{
+    set +e
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/compact_db.sh "$1" "$2" "$3"
+    if [ $? -ne 0 ]; then
+        echo ==== Error compacting DB at $1 ====
+        exit 1
+    fi
+    set -e
+}
+
 write_external_sst()
 {
     set +e
-    bash "$script_copy_dir"/write_external_sst.sh "$1" "$2" "$3"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/write_external_sst.sh "$1" "$2" "$3"
     if [ $? -ne 0 ]; then
         echo ==== Error writing external SST file using data from $1 to $3 ====
         exit 1
@@ -231,7 +249,7 @@ write_external_sst()
 ingest_external_sst()
 {
     set +e
-    bash "$script_copy_dir"/ingest_external_sst.sh "$1" "$2"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/ingest_external_sst.sh "$1" "$2"
     if [ $? -ne 0 ]; then
         echo ==== Error ingesting external SST in $2 to DB at $1 ====
         exit 1
@@ -242,7 +260,7 @@ ingest_external_sst()
 backup_db()
 {
     set +e
-    bash "$script_copy_dir"/backup_db.sh "$1" "$2"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/backup_db.sh "$1" "$2"
     if [ $? -ne 0 ]; then
         echo ==== Error backing up DB $1 to $2 ====
         exit 1
@@ -253,7 +271,7 @@ backup_db()
 restore_db()
 {
     set +e
-    bash "$script_copy_dir"/restore_db.sh "$1" "$2"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/restore_db.sh "$1" "$2"
     if [ $? -ne 0 ]; then
         echo ==== Error restoring from $1 to $2 ====
         exit 1
@@ -297,8 +315,8 @@ current_checkout_name="$current_checkout_name ($current_checkout_hash)"
 echo "== Building $current_checkout_name debug"
 git checkout -B $tmp_branch $current_checkout_hash
 force_no_fbcode
-make clean
-DISABLE_WARNING_AS_ERROR=1 make ldb -j$J
+invoke_make clean
+DISABLE_WARNING_AS_ERROR=1 invoke_make ldb -j$J
 
 echo "== Using $current_checkout_name, generate DB with extern SST and ingest"
 current_ext_test_dir=$ext_test_dir"/current"
@@ -318,8 +336,8 @@ do
   echo "== Building $checkout_ref debug"
   git reset --hard $tmp_origin/$checkout_ref
   force_no_fbcode
-  make clean
-  DISABLE_WARNING_AS_ERROR=1 make ldb -j$J
+  invoke_make clean
+  DISABLE_WARNING_AS_ERROR=1 invoke_make ldb -j$J
 
   # We currently assume DB backward compatibility for every branch listed
   echo "== Use $checkout_ref to generate a DB ..."
@@ -349,6 +367,13 @@ do
   then
     echo "== Use $checkout_ref to open DB generated using $current_checkout_name..."
     compare_db $db_test_dir/$checkout_ref $current_db_test_dir forward_${checkout_ref}_dump.txt 0
+
+    echo "== Use $checkout_ref to compact a copy of DB generated using $current_checkout_name..."
+    [ "$SANITY_CHECK" ] || cp -a $current_db_test_dir ${current_db_test_dir}_copy_for_${checkout_ref}
+    compact_db ${current_db_test_dir}_copy_for_${checkout_ref} 0
+
+    echo "== After compaction, re-verify DB copy originally from $current_checkout_name..."
+    compare_db ${current_db_test_dir}_copy_for_${checkout_ref} $current_db_test_dir forward_${checkout_ref}_dump_after_compact.txt 0
   fi
 
   if member_of_array "$checkout_ref" "${db_forward_with_options_refs[@]}"
@@ -376,15 +401,21 @@ done
 echo "== Building $current_checkout_name debug (again, final)"
 git reset --hard $current_checkout_hash
 force_no_fbcode
-make clean
-DISABLE_WARNING_AS_ERROR=1 make ldb -j$J
+invoke_make clean
+DISABLE_WARNING_AS_ERROR=1 invoke_make ldb -j$J
 
 for checkout_ref in "${checkout_refs[@]}"
 do
-  # We currently assume DB backward compatibility for every branch listed
+  # We assume DB backward compatibility for every branch listed
   echo "== Use $current_checkout_name to open DB generated using $checkout_ref..."
   compare_db $db_test_dir/$checkout_ref $current_db_test_dir db_dump.txt 1 0
 
+  echo "== Use $current_checkout_name to compact DB generated using $checkout_ref..."
+  compact_db $db_test_dir/$checkout_ref 1 0
+
+  echo "== After compaction, re-verify DB originally from $checkout_ref..."
+  compare_db $db_test_dir/$checkout_ref $current_db_test_dir db_dump_after_compact.txt 1 0
+
   if member_of_array "$checkout_ref" "${ext_backward_only_refs[@]}" ||
     member_of_array "$checkout_ref" "${ext_forward_refs[@]}"
   then
@@ -404,4 +435,8 @@ do
   fi
 done
 
-echo ==== Compatibility Test PASSED ====
+if [ "$SANITY_CHECK" ]; then
+  echo "==== check_format_compatible.sh sanity check PASSED ===="
+else
+  echo ==== Compatibility Test PASSED ====
+fi
diff --git a/tools/compact_db.sh b/tools/compact_db.sh
new file mode 100755
index 000000000000..8bcd95c0e906
--- /dev/null
+++ b/tools/compact_db.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# A shell script to compact DB generated by generate_random_db.sh.
+# ./ldb needs to be available to be executed.
+#
+# Usage: <SCRIPT> <DB Path> [if_try_load_options] [if_ignore_unknown_options]
+
+if [ "$#" -lt 1 ]; then
+  echo "usage: $BASH_SOURCE <db_directory> [if_try_load_options] [if_ignore_unknown_options]"
+  exit 1
+fi
+
+db_dir=$1
+try_load_options=${2:-"1"}
+ignore_unknown_options=${3:-"0"}
+extra_params=
+
+if [ "$try_load_options" = "0" ]; then
+  extra_params=" --try_load_options=false"
+elif [ "$try_load_options" = "1" ]; then
+  extra_params=" --try_load_options=true"
+fi
+
+if [ "$ignore_unknown_options" = "1" ]; then
+  extra_params="$extra_params --ignore_unknown_options"
+fi
+
+set -e
+echo == Compacting DB at $db_dir
+./ldb compact --db=$db_dir $extra_params
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 9155be672e7b..5098953cd993 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -47,6 +47,7 @@
 #include "options/cf_options.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
+#include "rocksdb/advanced_compression.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
@@ -83,6 +84,7 @@
 #include "util/gflags_compat.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
+#include "util/simple_mixed_compressor.h"
 #include "util/stderr_logger.h"
 #include "util/string_util.h"
 #include "util/xxhash.h"
@@ -92,7 +94,6 @@
 #include "utilities/merge_operators/bytesxor.h"
 #include "utilities/merge_operators/sortlist.h"
 #include "utilities/persistent_cache/block_cache_tier.h"
-
 #ifdef MEMKIND
 #include "memory/memkind_kmem_allocator.h"
 #endif
@@ -129,6 +130,7 @@ DEFINE_string(
     "compact1,"
     "waitforcompaction,"
     "multireadrandom,"
+    "multiscan,"
     "mixgraph,"
     "readseq,"
     "readtorowcache,"
@@ -158,6 +160,7 @@ DEFINE_string(
     "readrandomoperands,"
     "backup,"
     "restore,"
+    "openandcompact,"
     "approximatememtablestats",
 
     "Comma-separated list of operations to run in the specified"
@@ -229,6 +232,9 @@ DEFINE_string(
     "\tcompact1  -- compact L1 into L2\n"
     "\twaitforcompaction - pause until compaction is (probably) done\n"
     "\tflush - flush the memtable\n"
+    "\topenandcompact -- Open DB and compact all files to bottommost level, "
+    "writing output to separate directory without modifying source DB. "
+    "Designed for remote compaction service testing\n"
     "\tstats       -- Print DB stats\n"
     "\tresetstats  -- Reset DB stats\n"
     "\tlevelstats  -- Print the number of files and bytes per level\n"
@@ -333,6 +339,13 @@ DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
 
 DEFINE_int64(batch_size, 1, "Batch size");
 
+DEFINE_int64(multiscan_size, 10,
+             "MultiScan size - number of multiscans of size `batch_size`");
+
+DEFINE_int64(
+    multiscan_stride, 100,
+    "The amount of keys between two successive Scan operations in multiscan");
+
 static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
   return true;
 }
@@ -434,6 +447,14 @@ DEFINE_int64(db_write_buffer_size,
              ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
              "Number of bytes to buffer in all memtables before compacting");
 
+DEFINE_int64(max_manifest_file_size,
+             ROCKSDB_NAMESPACE::Options().max_manifest_file_size,
+             "Max manifest file size (or minimum max with auto-tuning)");
+
+DEFINE_int32(max_manifest_space_amp_pct,
+             ROCKSDB_NAMESPACE::Options().max_manifest_space_amp_pct,
+             "Max manifest space amp percentage for auto-tuning");
+
 DEFINE_bool(cost_write_buffer_to_cache, false,
             "The usage of memtable is costed to the block cache");
 
@@ -459,20 +480,6 @@ DEFINE_int32(min_write_buffer_number_to_merge,
              " writing less data to storage if there are duplicate records "
              " in each of these individual write buffers.");
 
-DEFINE_int32(max_write_buffer_number_to_maintain,
-             ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
-             "The total maximum number of write buffers to maintain in memory "
-             "including copies of buffers that have already been flushed. "
-             "Unlike max_write_buffer_number, this parameter does not affect "
-             "flushing. This controls the minimum amount of write history "
-             "that will be available in memory for conflict checking when "
-             "Transactions are used. If this value is too low, some "
-             "transactions may fail at commit time due to not being able to "
-             "determine whether there were any write conflicts. Setting this "
-             "value to 0 will cause write buffers to be freed immediately "
-             "after they are flushed.  If this value is set to -1, "
-             "'max_write_buffer_number' will be used.");
-
 DEFINE_int64(max_write_buffer_size_to_maintain,
              ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
              "The total maximum size of write buffers to maintain in memory "
@@ -581,7 +588,7 @@ DEFINE_double(cache_high_pri_pool_ratio, 0.0,
 DEFINE_double(cache_low_pri_pool_ratio, 0.0,
               "Ratio of block cache reserve for low pri blocks.");
 
-DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
+DEFINE_string(cache_type, "hyper_clock_cache", "Type of block cache.");
 
 DEFINE_bool(use_compressed_secondary_cache, false,
             "Use the CompressedSecondaryCache as the secondary cache.");
@@ -610,20 +617,15 @@ static enum ROCKSDB_NAMESPACE::CompressionType
     FLAGS_compressed_secondary_cache_compression_type_e =
         ROCKSDB_NAMESPACE::kLZ4Compression;
 
+DEFINE_string(compression_manager, "none",
+              "Set the compression manager type to mixed(roundrobin) or other "
+              "type. None for BuilInCompressor");
 DEFINE_int32(compressed_secondary_cache_compression_level,
              ROCKSDB_NAMESPACE::CompressionOptions().level,
              "Compression level. The meaning of this value is library-"
              "dependent. If unset, we try to use the default for the library "
              "specified in `--compressed_secondary_cache_compression_type`");
 
-DEFINE_uint32(
-    compressed_secondary_cache_compress_format_version, 2,
-    "compress_format_version can have two values: "
-    "compress_format_version == 1 -- decompressed size is not included"
-    " in the block header."
-    "compress_format_version == 2 -- decompressed size is included"
-    " in the block header in varint32 format.");
-
 DEFINE_bool(use_tiered_cache, false,
             "If use_compressed_secondary_cache is true and "
             "use_tiered_volatile_cache is true, then allocate a tiered cache "
@@ -720,6 +722,16 @@ DEFINE_bool(block_align,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
             "Align data blocks on page size");
 
+DEFINE_uint64(
+    super_block_alignment_size,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().super_block_alignment_size,
+    "Configure super block size");
+
+DEFINE_uint64(super_block_alignment_space_overhead_ratio,
+              ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+                  .super_block_alignment_space_overhead_ratio,
+              "Configure space overhead for super block alignment");
+
 DEFINE_int64(prepopulate_block_cache, 0,
              "Pre-populate hot/warm blocks in block cache. 0 to disable and 1 "
              "to insert during flush");
@@ -1022,6 +1034,14 @@ DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");
 
 DEFINE_uint64(fifo_age_for_warm, 0, "age_for_warm for FIFO compaction.");
 
+DEFINE_uint64(fifo_compaction_max_data_files_size_mb, 0,
+              "Combined SST + blob file size limit for FIFO compaction "
+              "trimming. 0 means use max_table_files_size (SST-only).");
+
+DEFINE_bool(fifo_compaction_use_kv_ratio_compaction, false,
+            "Enable capacity-derived intra-L0 compaction for FIFO with "
+            "BlobDB. Requires fifo_compaction_max_data_files_size_mb > 0.");
+
 // Stacked BlobDB Options
 DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance.");
 
@@ -1030,15 +1050,6 @@ DEFINE_bool(
     ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
     "[Stacked BlobDB] Enable BlobDB garbage collection.");
 
-DEFINE_double(
-    blob_db_gc_cutoff,
-    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
-    "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
-
-DEFINE_bool(blob_db_is_fifo,
-            ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
-            "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB.");
-
 DEFINE_uint64(blob_db_max_db_size,
               ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
               "[Stacked BlobDB] Max size limit of the directory where blob "
@@ -1053,26 +1064,10 @@ DEFINE_uint64(
     ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
     "[Stacked BlobDB] TTL bucket size to use when creating blob files.");
 
-DEFINE_uint64(
-    blob_db_min_blob_size,
-    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
-    "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
-    "smaller than this will be inlined with the key in the LSM tree.");
-
-DEFINE_uint64(blob_db_bytes_per_sync,
-              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
-              "[Stacked BlobDB] Bytes to sync blob file at.");
-
 DEFINE_uint64(blob_db_file_size,
               ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
               "[Stacked BlobDB] Target size of each blob file.");
 
-DEFINE_string(
-    blob_db_compression_type, "snappy",
-    "[Stacked BlobDB] Algorithm to use to compress blobs in blob files.");
-static enum ROCKSDB_NAMESPACE::CompressionType
-    FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;
-
 // Integrated BlobDB options
 DEFINE_bool(
     enable_blob_files,
@@ -1284,6 +1279,9 @@ DEFINE_bool(
 DEFINE_bool(paranoid_memory_checks, false,
             "Sets CF option paranoid_memory_checks");
 
+DEFINE_bool(memtable_veirfy_per_key_checksum_on_seek, false,
+            "Sets CF option memtable_veirfy_per_key_checksum_on_seek");
+
 DEFINE_bool(
     auto_refresh_iterator_with_snapshot, false,
     "When set to true, RocksDB iterator will automatically refresh itself "
@@ -1293,6 +1291,26 @@ DEFINE_bool(explicit_snapshot, false,
             "When set to true iterators will be initialized with explicit "
             "snapshot");
 
+DEFINE_uint32(memtable_op_scan_flush_trigger,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                  .memtable_op_scan_flush_trigger,
+              "Setting for CF option memtable_op_scan_flush_trigger.");
+
+DEFINE_bool(verify_compression, false,
+            "See BlockBasedTableOptions::verify_compression");
+
+ROCKSDB_NAMESPACE::ToolHooks* hooks_ = nullptr;
+[[noreturn]] void db_bench_exit(int status) {
+  if (hooks_ == nullptr) {
+    exit(status);
+  }
+
+  hooks_->Exit(status);
+
+  // We should exit here but in case they don't we exit anyway.
+  exit(-1);
+};
+
 static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
     const char* ctype) {
   assert(ctype);
@@ -1315,7 +1333,7 @@ static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
     return ROCKSDB_NAMESPACE::kZSTD;
   } else {
     fprintf(stderr, "Cannot parse compression type '%s'\n", ctype);
-    exit(1);
+    db_bench_exit(1);
   }
 }
 
@@ -1335,7 +1353,7 @@ static enum ROCKSDB_NAMESPACE::TieredAdmissionPolicy StringToAdmissionPolicy(
     return ROCKSDB_NAMESPACE::kAdmPolicyAllowAll;
   } else {
     fprintf(stderr, "Cannot parse admission policy %s\n", policy);
-    exit(1);
+    db_bench_exit(1);
   }
 }
 
@@ -1733,6 +1751,10 @@ DEFINE_uint64(stats_history_buffer_size,
 DEFINE_bool(avoid_flush_during_recovery,
             ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery,
             "If true, avoids flushing the recovered WAL data where possible.");
+
+DEFINE_bool(avoid_flush_during_shutdown,
+            ROCKSDB_NAMESPACE::Options().avoid_flush_during_shutdown,
+            "If true, avoids flushing the recovered WAL data where possible.");
 DEFINE_int64(multiread_stride, 0,
              "Stride length for the keys in a MultiGet batch");
 DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");
@@ -1746,6 +1768,9 @@ DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
 DEFINE_bool(use_hash_search, false,
             "if use kHashSearch instead of kBinarySearch. "
             "This is valid if only we use BlockTable");
+DEFINE_string(index_block_search_type, "binary_search",
+              "Search algorithm for reading index blocks: binary_search or "
+              "interpolation_search.");
 DEFINE_string(merge_operator, "",
               "The merge operator to use with the database."
               "If a new merge operator is specified, be sure to use fresh"
@@ -1817,6 +1842,39 @@ DEFINE_bool(track_and_verify_wals_in_manifest, false,
 
 DEFINE_bool(track_and_verify_wals, false, "See Options.track_and_verify_wals");
 
+DEFINE_int32(same_value_percentage, 0,
+             "Percentage of time value will be same i.e good for compression "
+             "of the block");
+
+DEFINE_bool(universal_reduce_file_locking,
+            ROCKSDB_NAMESPACE::Options()
+                .compaction_options_universal.reduce_file_locking,
+            "See Options().compaction_options_universal.reduce_file_locking");
+
+DEFINE_uint64(
+    multiscan_coalesce_threshold,
+    ROCKSDB_NAMESPACE::MultiScanArgs(ROCKSDB_NAMESPACE::BytewiseComparator())
+        .io_coalesce_threshold,
+    "Configures io coalescing threshold for multiscans");
+
+DEFINE_bool(
+    multiscan_use_async_io,
+    ROCKSDB_NAMESPACE::MultiScanArgs(ROCKSDB_NAMESPACE::BytewiseComparator())
+        .use_async_io,
+    "Sets MultiScanArgs::use_async_io");
+
+DEFINE_bool(openandcompact_allow_resumption, false,
+            "Whether to keep existing progress and enable resume compaction in "
+            "OpenAndCompact benchmark");
+
+DEFINE_bool(openandcompact_test_cancel_on_odd, false,
+            "During OpenAndCompact[Xn], odd runs gets cancelled "
+            "after specified `openandcompact_cancel_after_millseconds`");
+
+DEFINE_uint32(openandcompact_cancel_after_millseconds, 1,
+              "Time to wait before cancelling compaction in odd runs when "
+              "openandcompact_test_cancel_on_odd is true");
+
 namespace ROCKSDB_NAMESPACE {
 namespace {
 static Status CreateMemTableRepFactory(
@@ -1861,7 +1919,7 @@ static enum DistributionType StringToDistributionType(const char* ctype) {
   }
 
   fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
-  exit(1);
+  db_bench_exit(1);
 }
 
 class BaseDistribution {
@@ -1937,9 +1995,10 @@ class RandomGenerator {
   std::string data_;
   unsigned int pos_;
   std::unique_ptr<BaseDistribution> dist_;
+  Random rnd;
 
  public:
-  RandomGenerator() {
+  RandomGenerator() : rnd(301) {
     auto max_value_size = FLAGS_value_size_max;
     switch (FLAGS_value_size_distribution_type_e) {
       case kUniform:
@@ -1958,7 +2017,6 @@ class RandomGenerator {
     // We use a limited amount of data over and over again and ensure
     // that it is larger than the compression window (32KB), and also
     // large enough to serve all typical value sizes we want to write.
-    Random rnd(301);
     std::string piece;
     while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
       // Add a short fragment that is as compressible as specified
@@ -1971,11 +2029,15 @@ class RandomGenerator {
 
   Slice Generate(unsigned int len) {
     assert(len <= data_.size());
-    if (pos_ + len > data_.size()) {
-      pos_ = 0;
+    if (rnd.PercentTrue(FLAGS_same_value_percentage)) {
+      return Slice(data_.data(), len);
+    } else {
+      if (pos_ + len > data_.size()) {
+        pos_ = 0;
+      }
+      pos_ += len;
+      return Slice(data_.data() + pos_ - len, len);
     }
-    pos_ += len;
-    return Slice(data_.data() + pos_ - len, len);
   }
 
   Slice Generate() {
@@ -1996,6 +2058,7 @@ static void AppendWithSpace(std::string* str, Slice msg) {
 
 struct DBWithColumnFamilies {
   std::vector<ColumnFamilyHandle*> cfh;
+  std::unique_ptr<DB> db_owner;
   DB* db;
   OptimisticTransactionDB* opt_txn_db;
   std::atomic<size_t> num_created;  // Need to be updated after all the
@@ -2025,13 +2088,9 @@ struct DBWithColumnFamilies {
     std::for_each(cfh.begin(), cfh.end(),
                   [](ColumnFamilyHandle* cfhi) { delete cfhi; });
     cfh.clear();
-    if (opt_txn_db) {
-      delete opt_txn_db;
-      opt_txn_db = nullptr;
-    } else {
-      delete db;
-      db = nullptr;
-    }
+    db_owner.reset();
+    db = nullptr;
+    opt_txn_db = nullptr;
   }
 
   ColumnFamilyHandle* GetCfh(int64_t rand_num) {
@@ -2176,7 +2235,8 @@ enum OperationType : unsigned char {
   kUncompress,
   kCrc,
   kHash,
-  kOthers
+  kOthers,
+  kMultiScan
 };
 
 static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
@@ -2185,7 +2245,7 @@ static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
                            {kMerge, "merge"},       {kUpdate, "update"},
                            {kCompress, "compress"}, {kCompress, "uncompress"},
                            {kCrc, "crc"},           {kHash, "hash"},
-                           {kOthers, "op"}};
+                           {kOthers, "op"},         {kMultiScan, "multiscan"}};
 
 class CombinedStats;
 class Stats {
@@ -2565,24 +2625,33 @@ class CombinedStats {
     const char* name = bench_name.c_str();
     int num_runs = static_cast<int>(throughput_ops_.size());
 
+    double avg_ops_per_sec = CalcAvg(throughput_ops_);
+    double avg_millis_per_op =
+        (avg_ops_per_sec > 0) ? (1000.0 / avg_ops_per_sec) : 0;
+
+    printf("\n");
+
     if (throughput_mbs_.size() == throughput_ops_.size()) {
       // \xC2\xB1 is +/- character in UTF-8
       fprintf(stdout,
-              "%s [AVG    %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
+              "%s [AVG    %d runs] : %d (\xC2\xB1 %d) ops/sec; %.3f ms/op; "
+              "%6.1f (\xC2\xB1 "
               "%.1f) MB/sec\n"
               "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
               name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
               static_cast<int>(CalcConfidence95(throughput_ops_)),
-              CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_), name,
-              num_runs, static_cast<int>(CalcMedian(throughput_ops_)),
+              avg_millis_per_op, CalcAvg(throughput_mbs_),
+              CalcConfidence95(throughput_mbs_), name, num_runs,
+              static_cast<int>(CalcMedian(throughput_ops_)),
               CalcMedian(throughput_mbs_));
     } else {
       fprintf(stdout,
-              "%s [AVG    %d runs] : %d (\xC2\xB1 %d) ops/sec\n"
+              "%s [AVG    %d runs] : %d (\xC2\xB1 %d) ops/sec; %.3f ms/op\n"
               "%s [MEDIAN %d runs] : %d ops/sec\n",
               name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
-              static_cast<int>(CalcConfidence95(throughput_ops_)), name,
-              num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
+              static_cast<int>(CalcConfidence95(throughput_ops_)),
+              avg_millis_per_op, name, num_runs,
+              static_cast<int>(CalcMedian(throughput_ops_)));
     }
   }
 
@@ -2741,6 +2810,8 @@ class Duration {
   uint64_t start_at_;
 };
 
+// Global run counter for cancel/resume-OpenAndCompact() testing
+static std::atomic<int> openandcompact_run_counter{0};
 class Benchmark {
  private:
   std::shared_ptr<Cache> cache_;
@@ -2835,12 +2906,18 @@ class Benchmark {
     return true;
   }
 
-  inline bool CompressSlice(const CompressionInfo& compression_info,
-                            const Slice& input, std::string* compressed) {
-    constexpr uint32_t compress_format_version = 2;
-
-    return CompressData(input, compression_info, compress_format_version,
-                        compressed);
+  std::unique_ptr<Compressor> GetCompressor() {
+    CompressionOptions opts;
+    opts.level = FLAGS_compression_level;
+    // TODO: inter-operate with FLAGS_compression_manager
+    auto compressor = GetBuiltinV2CompressionManager()->GetCompressor(
+        opts, FLAGS_compression_type_e);
+    if (compressor &&
+        compressor->GetPreferredCompressionType() != FLAGS_compression_type_e) {
+      // For benchmarking, don't fall back on a different compression type
+      compressor.reset();
+    }
+    return compressor;
   }
 
   void PrintHeader(const Options& options) {
@@ -2882,16 +2959,23 @@ class Benchmark {
       fprintf(stderr, "Running in NUMA enabled mode.\n");
 #ifndef NUMA
       fprintf(stderr, "NUMA is not defined in the system.\n");
-      exit(1);
+      db_bench_exit(1);
 #else
       if (numa_available() == -1) {
         fprintf(stderr, "NUMA is not supported by the system.\n");
-        exit(1);
+        db_bench_exit(1);
       }
 #endif
     }
-
-    auto compression = CompressionTypeToString(FLAGS_compression_type_e);
+    // mixed compression  manager expect compression type to be expliciltiy
+    // configured through Options to be zstd
+    auto compression = std::string("zstd");
+    if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
+      compression = CompressionTypeToString(FLAGS_compression_type_e);
+    } else {
+      fprintf(stdout, "Compression manager: %s\n",
+              FLAGS_compression_manager.c_str());
+    }
     fprintf(stdout, "Compression: %s\n", compression.c_str());
     fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
             FLAGS_sample_for_compression);
@@ -2919,19 +3003,30 @@ class Benchmark {
       // The test string should not be too small.
       const int len = FLAGS_block_size;
       std::string input_str(len, 'y');
-      std::string compressed;
-      CompressionOptions opts;
-      CompressionContext context(FLAGS_compression_type_e, opts);
-      CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                           FLAGS_compression_type_e,
-                           FLAGS_sample_for_compression);
-      bool result = CompressSlice(info, Slice(input_str), &compressed);
-
-      if (!result) {
-        fprintf(stdout, "WARNING: %s compression is not enabled\n",
-                compression);
-      } else if (compressed.size() >= input_str.size()) {
-        fprintf(stdout, "WARNING: %s compression is not effective\n",
+      auto compressor = GetCompressor();
+      if (compressor) {
+        GrowableBuffer compressed;
+        compressed.ResetForSize(input_str.size());
+        CompressionType actual_type = kNoCompression;
+        auto working_area = compressor->ObtainWorkingArea();
+        Status s = compressor->CompressBlock(
+            Slice(input_str), compressed.data(), &compressed.MutableSize(),
+            &actual_type, &working_area);
+        if (!s.ok()) {
+          fprintf(stdout, "WARNING: compression test run failure: %s\n",
+                  s.ToString().c_str());
+        } else if (actual_type == kNoCompression) {
+          fprintf(stdout,
+                  "WARNING: %s compression is not effective or declined\n",
+                  compression);
+        } else if (actual_type != FLAGS_compression_type_e) {
+          fprintf(
+              stdout,
+              "WARNING: using %s compression in place of %s (unsupported?)\n",
+              CompressionTypeToString(actual_type).c_str(), compression);
+        }
+      } else {
+        fprintf(stdout, "WARNING: %s compression is not available\n",
                 compression);
       }
     }
@@ -3082,14 +3177,14 @@ class Benchmark {
       JemallocAllocatorOptions jemalloc_options;
       if (!NewJemallocNodumpAllocator(jemalloc_options, &allocator).ok()) {
         fprintf(stderr, "JemallocNodumpAllocator not supported.\n");
-        exit(1);
+        db_bench_exit(1);
       }
     } else if (FLAGS_use_cache_memkind_kmem_allocator) {
 #ifdef MEMKIND
       allocator = std::make_shared<MemkindKmemAllocator>();
 #else
       fprintf(stderr, "Memkind library is not linked with the binary.\n");
-      exit(1);
+      db_bench_exit(1);
 #endif
     }
 
@@ -3120,8 +3215,6 @@ class Benchmark {
           FLAGS_compressed_secondary_cache_compression_type_e;
       secondary_cache_opts.compression_opts.level =
           FLAGS_compressed_secondary_cache_compression_level;
-      secondary_cache_opts.compress_format_version =
-          FLAGS_compressed_secondary_cache_compress_format_version;
       if (FLAGS_use_tiered_cache) {
         use_tiered_cache = true;
         adm_policy = StringToAdmissionPolicy(FLAGS_tiered_adm_policy.c_str());
@@ -3133,7 +3226,7 @@ class Benchmark {
             stderr,
             "Cannot specify both --secondary_cache_uri and "
             "--use_compressed_secondary_cache when using a non-tiered cache\n");
-        exit(1);
+        db_bench_exit(1);
       }
       Status s = SecondaryCache::CreateFromString(
           ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
@@ -3141,7 +3234,7 @@ class Benchmark {
         fprintf(stderr,
                 "No secondary cache registered matching string: %s status=%s\n",
                 FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
     }
 
@@ -3152,21 +3245,21 @@ class Benchmark {
       if (block_cache == nullptr) {
         fprintf(stderr, "No  cache registered matching string: %s status=%s\n",
                 FLAGS_cache_uri.c_str(), s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
     } else if (FLAGS_cache_type == "clock_cache") {
       fprintf(stderr, "Old clock cache implementation has been removed.\n");
-      exit(1);
+      db_bench_exit(1);
     } else if (EndsWith(FLAGS_cache_type, "hyper_clock_cache")) {
       size_t estimated_entry_charge;
-      if (FLAGS_cache_type == "fixed_hyper_clock_cache" ||
-          FLAGS_cache_type == "hyper_clock_cache") {
+      if (FLAGS_cache_type == "fixed_hyper_clock_cache") {
         estimated_entry_charge = FLAGS_block_size;
-      } else if (FLAGS_cache_type == "auto_hyper_clock_cache") {
+      } else if (FLAGS_cache_type == "auto_hyper_clock_cache" ||
+                 FLAGS_cache_type == "hyper_clock_cache") {
         estimated_entry_charge = 0;
       } else {
         fprintf(stderr, "Cache type not supported.");
-        exit(1);
+        db_bench_exit(1);
       }
       HyperClockCacheOptions opts(FLAGS_cache_size, estimated_entry_charge,
                                   FLAGS_cache_numshardbits);
@@ -3222,12 +3315,12 @@ class Benchmark {
       }
     } else {
       fprintf(stderr, "Cache type not supported.");
-      exit(1);
+      db_bench_exit(1);
     }
 
     if (!block_cache) {
       fprintf(stderr, "Unable to allocate block cache\n");
-      exit(1);
+      db_bench_exit(1);
     }
     return block_cache;
   }
@@ -3275,7 +3368,7 @@ class Benchmark {
 
     if (FLAGS_prefix_size > FLAGS_key_size) {
       fprintf(stderr, "prefix size is larger than key size");
-      exit(1);
+      db_bench_exit(1);
     }
 
     std::vector<std::string> files;
@@ -3316,8 +3409,8 @@ class Benchmark {
 
   void DeleteDBs() {
     db_.DeleteDBs();
-    for (const DBWithColumnFamilies& dbwcf : multi_dbs_) {
-      delete dbwcf.db;
+    for (auto& dbwcf : multi_dbs_) {
+      dbwcf.DeleteDBs();
     }
   }
 
@@ -3422,11 +3515,13 @@ class Benchmark {
 
   void VerifyDBFromDB(std::string& truth_db_name) {
     DBWithColumnFamilies truth_db;
-    auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
+    auto s =
+        DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db_owner);
     if (!s.ok()) {
       fprintf(stderr, "open error: %s\n", s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
+    truth_db.db = truth_db.db_owner.get();
     ReadOptions ro;
     ro.total_order_seek = true;
     std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
@@ -3455,7 +3550,7 @@ class Benchmark {
 
   void ErrorExit() {
     DeleteDBs();
-    exit(1);
+    db_bench_exit(1);
   }
 
   void Run(ToolHooks& hooks) {
@@ -3621,6 +3716,12 @@ class Benchmark {
         fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                 entries_per_batch_);
         method = &Benchmark::MultiReadRandom;
+      } else if (name == "multiscan") {
+        fprintf(stderr, "multiscan_stride = %" PRIi64 "\n",
+                FLAGS_multiscan_stride);
+        fprintf(stderr, "multiscan_size = %" PRIi64 "\n", FLAGS_multiscan_size);
+        fprintf(stderr, "seek_nexts = %" PRIi32 "\n", FLAGS_seek_nexts);
+        method = &Benchmark::MultiScan;
       } else if (name == "multireadwhilewriting") {
         fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                 entries_per_batch_);
@@ -3685,7 +3786,7 @@ class Benchmark {
         if (FLAGS_merge_operator.empty()) {
           fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
                   name.c_str());
-          exit(1);
+          db_bench_exit(1);
         }
         method = &Benchmark::MergeRandom;
       } else if (name == "randomwithverify") {
@@ -3781,6 +3882,9 @@ class Benchmark {
         method = &Benchmark::Backup;
       } else if (name == "restore") {
         method = &Benchmark::Restore;
+      } else if (name == "openandcompact") {
+        fresh_db = false;
+        method = &Benchmark::OpenAndCompact;
       } else if (!name.empty()) {  // No error message for empty name
         fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
         ErrorExit();
@@ -3798,7 +3902,7 @@ class Benchmark {
           }
           Options options = open_options_;
           for (size_t i = 0; i < multi_dbs_.size(); i++) {
-            delete multi_dbs_[i].db;
+            multi_dbs_[i].DeleteDBs();
             if (!open_options_.wal_dir.empty()) {
               options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
             }
@@ -4129,7 +4233,7 @@ class Benchmark {
       thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
     }
     if (ptr == nullptr) {
-      exit(1);  // Disable unused variable warning.
+      db_bench_exit(1);  // Disable unused variable warning.
     }
   }
 
@@ -4138,25 +4242,37 @@ class Benchmark {
     Slice input = gen.Generate(FLAGS_block_size);
     int64_t bytes = 0;
     int64_t produced = 0;
-    bool ok = true;
-    std::string compressed;
-    CompressionOptions opts;
-    opts.level = FLAGS_compression_level;
-    CompressionContext context(FLAGS_compression_type_e, opts);
-    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                         FLAGS_compression_type_e,
-                         FLAGS_sample_for_compression);
+    Status s;
+
+    auto compressor = GetCompressor();
+    if (!compressor) {
+      thread->stats.AddMessage("(compression type not supported)");
+      return;
+    }
+    auto working_area = compressor->ObtainWorkingArea();
+
+    GrowableBuffer compressed;
     // Compress 1G
-    while (ok && bytes < int64_t(1) << 30) {
-      compressed.clear();
-      ok = CompressSlice(info, input, &compressed);
+    while (bytes < int64_t(1) << 30) {
+      compressed.ResetForSize(input.size());
+      CompressionType actual_type = kNoCompression;
+      s = compressor->CompressBlock(input, compressed.data(),
+                                    &compressed.MutableSize(), &actual_type,
+                                    &working_area);
+      if (UNLIKELY(!s.ok())) {
+        break;
+      }
+      if (UNLIKELY(actual_type == kNoCompression)) {
+        s = Status::Aborted("Unable to compress smaller than input");
+        break;
+      }
       produced += compressed.size();
       bytes += input.size();
       thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
     }
 
-    if (!ok) {
-      thread->stats.AddMessage("(compression failure)");
+    if (!s.ok()) {
+      thread->stats.AddMessage("(compression failure: " + s.ToString() + ")");
     } else {
       char buf[340];
       snprintf(buf, sizeof(buf), "(output: %.1f%%)",
@@ -4169,37 +4285,59 @@ class Benchmark {
   void Uncompress(ThreadState* thread) {
     RandomGenerator gen;
     Slice input = gen.Generate(FLAGS_block_size);
-    std::string compressed;
-
-    CompressionOptions compression_opts;
-    compression_opts.level = FLAGS_compression_level;
-    CompressionContext compression_ctx(FLAGS_compression_type_e,
-                                       compression_opts);
-    CompressionInfo compression_info(
-        compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
-        FLAGS_compression_type_e, FLAGS_sample_for_compression);
-    UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
-    UncompressionInfo uncompression_info(uncompression_ctx,
-                                         UncompressionDict::GetEmptyDict(),
-                                         FLAGS_compression_type_e);
-
-    bool ok = CompressSlice(compression_info, input, &compressed);
-    int64_t bytes = 0;
-    size_t uncompressed_size = 0;
-    while (ok && bytes < 1024 * 1048576) {
-      constexpr uint32_t compress_format_version = 2;
 
-      CacheAllocationPtr uncompressed = UncompressData(
-          uncompression_info, compressed.data(), compressed.size(),
-          &uncompressed_size, compress_format_version);
+    auto compressor = GetCompressor();
+    if (!compressor) {
+      thread->stats.AddMessage("(compression type not supported)");
+      return;
+    }
+
+    // Compress the input first
+    GrowableBuffer compressed;
+    compressed.ResetForSize(input.size());
+    CompressionType actual_type = kNoCompression;
+    Status s = compressor->CompressBlock(
+        input, compressed.data(), &compressed.MutableSize(), &actual_type,
+        /*working_area=*/nullptr);
+    if (!s.ok()) {
+      thread->stats.AddMessage("(compression failure: " + s.ToString() + ")");
+      return;
+    }
+    if (actual_type != FLAGS_compression_type_e) {
+      thread->stats.AddMessage("(failed to compress smaller than input)");
+      return;
+    }
+
+    // TODO: inter-operate with FLAGS_compression_manager
+    auto decompressor =
+        GetBuiltinV2CompressionManager()->GetDecompressorOptimizeFor(
+            actual_type);
+    auto decomp_working_area = decompressor->ObtainWorkingArea(actual_type);
 
-      ok = uncompressed.get() != nullptr;
+    int64_t bytes = 0;
+    while (bytes < 1024 * 1048576) {
+      Decompressor::Args args;
+      args.compression_type = actual_type;
+      args.compressed_data = compressed.AsSlice();
+      args.working_area = &decomp_working_area;
+
+      s = decompressor->ExtractUncompressedSize(args);
+      if (UNLIKELY(!s.ok())) {
+        break;
+      }
+
+      CacheAllocationPtr uncompressed = AllocateBlock(args.uncompressed_size,
+                                                      /*allocator=*/nullptr);
+      s = decompressor->DecompressBlock(args, uncompressed.get());
+      if (UNLIKELY(!s.ok())) {
+        break;
+      }
       bytes += input.size();
       thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
     }
 
-    if (!ok) {
-      thread->stats.AddMessage("(compression failure)");
+    if (!s.ok()) {
+      thread->stats.AddMessage("(decompression failure: " + s.ToString() + ")");
     } else {
       thread->stats.AddBytes(bytes);
     }
@@ -4225,7 +4363,7 @@ class Benchmark {
       }
       fprintf(stderr, "Unable to load options file %s --- %s\n",
               FLAGS_options_file.c_str(), s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
     return false;
   }
@@ -4249,6 +4387,7 @@ class Benchmark {
     options.stats_history_buffer_size =
         static_cast<size_t>(FLAGS_stats_history_buffer_size);
     options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery;
+    options.avoid_flush_during_shutdown = FLAGS_avoid_flush_during_shutdown;
 
     options.compression_opts.level = FLAGS_compression_level;
     options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
@@ -4266,13 +4405,13 @@ class Benchmark {
       options.write_buffer_manager.reset(
           new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
     }
+    options.max_manifest_file_size = FLAGS_max_manifest_file_size;
+    options.max_manifest_space_amp_pct = FLAGS_max_manifest_space_amp_pct;
     options.arena_block_size = FLAGS_arena_block_size;
     options.write_buffer_size = FLAGS_write_buffer_size;
     options.max_write_buffer_number = FLAGS_max_write_buffer_number;
     options.min_write_buffer_number_to_merge =
         FLAGS_min_write_buffer_number_to_merge;
-    options.max_write_buffer_number_to_maintain =
-        FLAGS_max_write_buffer_number_to_maintain;
     options.max_write_buffer_size_to_maintain =
         FLAGS_max_write_buffer_size_to_maintain;
     options.max_background_jobs = FLAGS_max_background_jobs;
@@ -4293,12 +4432,16 @@ class Benchmark {
         FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
         FLAGS_fifo_compaction_allow_compaction);
     options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm;
+    options.compaction_options_fifo.max_data_files_size =
+        FLAGS_fifo_compaction_max_data_files_size_mb * 1024 * 1024;
+    options.compaction_options_fifo.use_kv_ratio_compaction =
+        FLAGS_fifo_compaction_use_kv_ratio_compaction;
     options.prefix_extractor = prefix_extractor_;
     if (FLAGS_use_uint64_comparator) {
       options.comparator = test::Uint64Comparator();
       if (FLAGS_key_size != 8) {
         fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
-        exit(1);
+        db_bench_exit(1);
       }
     }
     if (FLAGS_use_stderr_info_logger) {
@@ -4332,14 +4475,14 @@ class Benchmark {
     if (!s.ok()) {
       fprintf(stderr, "Could not create memtable factory: %s\n",
               s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     } else if ((FLAGS_prefix_size == 0) &&
                (options.memtable_factory->IsInstanceOf("prefix_hash") ||
                 options.memtable_factory->IsInstanceOf("hash_linkedlist"))) {
       fprintf(stderr,
               "prefix_size should be non-zero if PrefixHash or "
               "HashLinkedList memtablerep is used\n");
-      exit(1);
+      db_bench_exit(1);
     }
     if (FLAGS_use_plain_table) {
       if (!options.memtable_factory->IsInstanceOf("prefix_hash") &&
@@ -4362,12 +4505,12 @@ class Benchmark {
     } else if (FLAGS_use_cuckoo_table) {
       if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
         fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
-        exit(1);
+        db_bench_exit(1);
       }
 
       if (!FLAGS_mmap_read) {
         fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
-        exit(1);
+        db_bench_exit(1);
       }
 
       ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
@@ -4383,12 +4526,24 @@ class Benchmark {
         if (FLAGS_prefix_size == 0) {
           fprintf(stderr,
                   "prefix_size not assigned when enable use_hash_search \n");
-          exit(1);
+          db_bench_exit(1);
         }
         block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
       } else {
         block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
       }
+
+      if (FLAGS_index_block_search_type == "binary_search") {
+        block_based_options.index_block_search_type =
+            BlockBasedTableOptions::kBinary;
+      } else if (FLAGS_index_block_search_type == "interpolation_search") {
+        block_based_options.index_block_search_type =
+            BlockBasedTableOptions::kInterpolation;
+      } else {
+        fprintf(stderr, "Unknown index_block_search_type: %s\n",
+                FLAGS_index_block_search_type.c_str());
+        db_bench_exit(1);
+      }
       block_based_options.decouple_partitioned_filters =
           FLAGS_decouple_partitioned_filters;
       if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
@@ -4482,6 +4637,7 @@ class Benchmark {
       block_based_options.block_restart_interval = FLAGS_block_restart_interval;
       block_based_options.index_block_restart_interval =
           FLAGS_index_block_restart_interval;
+      TEST_AllowUnsupportedFormatVersion() = true;
       block_based_options.format_version =
           static_cast<uint32_t>(FLAGS_format_version);
       block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
@@ -4495,6 +4651,7 @@ class Benchmark {
           FLAGS_initial_auto_readahead_size;
       block_based_options.num_file_reads_for_auto_readahead =
           FLAGS_num_file_reads_for_auto_readahead;
+      block_based_options.verify_compression = FLAGS_verify_compression;
       BlockBasedTableOptions::PrepopulateBlockCache prepopulate_block_cache =
           block_based_options.prepopulate_block_cache;
       switch (FLAGS_prepopulate_block_cache) {
@@ -4549,7 +4706,7 @@ class Benchmark {
         if (!rc_status.ok()) {
           fprintf(stderr, "Error initializing read cache, %s\n",
                   rc_status.ToString().c_str());
-          exit(1);
+          db_bench_exit(1);
         }
       }
 
@@ -4569,7 +4726,7 @@ class Benchmark {
                 stderr,
                 "Unable to create a standalone blob cache if blob_cache_size "
                 "<= 0.\n");
-            exit(1);
+            db_bench_exit(1);
           }
         }
         switch (FLAGS_prepopulate_blob_cache) {
@@ -4581,7 +4738,7 @@ class Benchmark {
             break;
           default:
             fprintf(stderr, "Unknown prepopulate blob cache mode\n");
-            exit(1);
+            db_bench_exit(1);
         }
 
         fprintf(stdout,
@@ -4609,7 +4766,7 @@ class Benchmark {
         fprintf(stderr, "Insufficient number of fanouts specified %d\n",
                 static_cast<int>(
                     FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
-        exit(1);
+        db_bench_exit(1);
       }
       options.max_bytes_for_level_multiplier_additional =
           FLAGS_max_bytes_for_level_multiplier_additional_v;
@@ -4620,6 +4777,34 @@ class Benchmark {
     options.level0_slowdown_writes_trigger =
         FLAGS_level0_slowdown_writes_trigger;
     options.compression = FLAGS_compression_type_e;
+    std::shared_ptr<CompressionManagerWrapper> mgr = nullptr;
+    if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
+      mgr =
+          std::make_shared<RoundRobinManager>(GetBuiltinV2CompressionManager());
+    } else if (!strcasecmp(FLAGS_compression_manager.c_str(),
+                           "costpredictor")) {
+      mgr = CreateCostAwareCompressionManager();
+    } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) {
+      mgr = CreateAutoSkipCompressionManager();
+    } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
+      options.compression = FLAGS_compression_type_e;
+    } else {
+      // compression manager is not supported
+      // exit with error
+      fprintf(stderr, "Requested compression manager not supported");
+      ErrorExit();
+    }
+    if (FLAGS_compression_type_e == kNoCompression &&
+        strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
+      fprintf(stderr,
+              "Compression type must not be no Compression when using "
+              "compression manager");
+      ErrorExit();
+    }
+    if (mgr != nullptr) {
+      options.compression_manager = mgr;
+    }
+
     if (FLAGS_simulate_hybrid_fs_file != "") {
       options.last_level_temperature = Temperature::kWarm;
     }
@@ -4680,7 +4865,7 @@ class Benchmark {
       if (!s.ok()) {
         fprintf(stderr, "invalid merge operator[%s]: %s\n",
                 FLAGS_merge_operator.c_str(), s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
     }
     options.max_successive_merges = FLAGS_max_successive_merges;
@@ -4723,7 +4908,7 @@ class Benchmark {
     if (FLAGS_user_timestamp_size > 0) {
       if (FLAGS_user_timestamp_size != 8) {
         fprintf(stderr, "Only 64 bits timestamps are supported.\n");
-        exit(1);
+        db_bench_exit(1);
       }
       options.comparator = test::BytewiseComparatorWithU64TsWrapper();
     }
@@ -4751,18 +4936,24 @@ class Benchmark {
 
     if (FLAGS_readonly && FLAGS_transaction_db) {
       fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
-      exit(1);
+      db_bench_exit(1);
     }
     if (FLAGS_use_secondary_db &&
         (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
       fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n");
-      exit(1);
+      db_bench_exit(1);
     }
     options.memtable_protection_bytes_per_key =
         FLAGS_memtable_protection_bytes_per_key;
     options.block_protection_bytes_per_key =
         FLAGS_block_protection_bytes_per_key;
     options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
+    options.memtable_veirfy_per_key_checksum_on_seek =
+        FLAGS_memtable_veirfy_per_key_checksum_on_seek;
+    options.memtable_op_scan_flush_trigger =
+        FLAGS_memtable_op_scan_flush_trigger;
+    options.compaction_options_universal.reduce_file_locking =
+        FLAGS_universal_reduce_file_locking;
   }
 
   void InitializeOptionsGeneral(Options* opts, ToolHooks& hooks) {
@@ -4931,7 +5122,7 @@ class Benchmark {
         }
         if (sum != 100) {
           fprintf(stderr, "column_family_distribution items must sum to 100\n");
-          exit(1);
+          db_bench_exit(1);
         }
         if (cfh_idx_to_prob.size() != num_hot) {
           fprintf(stderr,
@@ -4939,16 +5130,20 @@ class Benchmark {
                   " column_family_distribution items; expected "
                   "%" ROCKSDB_PRIszt "\n",
                   cfh_idx_to_prob.size(), num_hot);
-          exit(1);
+          db_bench_exit(1);
         }
       }
       if (FLAGS_readonly) {
         s = hooks.OpenForReadOnly(options, db_name, column_families, &db->cfh,
-                                  &db->db);
+                                  &db->db_owner);
+        if (s.ok()) {
+          db->db = db->db_owner.get();
+        }
       } else if (FLAGS_optimistic_transaction_db) {
         s = hooks.OpenOptimisticTransactionDB(options, db_name, column_families,
                                               &db->cfh, &db->opt_txn_db);
         if (s.ok()) {
+          db->db_owner.reset(db->opt_txn_db);
           db->db = db->opt_txn_db->GetBaseDB();
         }
       } else if (FLAGS_transaction_db) {
@@ -4962,20 +5157,29 @@ class Benchmark {
         s = hooks.OpenTransactionDB(options, txn_db_options, db_name,
                                     column_families, &db->cfh, &ptr);
         if (s.ok()) {
+          db->db_owner.reset(ptr);
           db->db = ptr;
         }
       } else {
-        s = hooks.Open(options, db_name, column_families, &db->cfh, &db->db);
+        s = hooks.Open(options, db_name, column_families, &db->cfh,
+                       &db->db_owner);
+        if (s.ok()) {
+          db->db = db->db_owner.get();
+        }
       }
       db->cfh.resize(FLAGS_num_column_families);
       db->num_created = num_hot;
       db->num_hot = num_hot;
       db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
     } else if (FLAGS_readonly) {
-      s = hooks.OpenForReadOnly(options, db_name, &db->db, false);
+      s = hooks.OpenForReadOnly(options, db_name, &db->db_owner, false);
+      if (s.ok()) {
+        db->db = db->db_owner.get();
+      }
     } else if (FLAGS_optimistic_transaction_db) {
       s = hooks.OpenOptimisticTransactionDB(options, db_name, &db->opt_txn_db);
       if (s.ok()) {
+        db->db_owner.reset(db->opt_txn_db);
         db->db = db->opt_txn_db->GetBaseDB();
       }
     } else if (FLAGS_transaction_db) {
@@ -4991,23 +5195,20 @@ class Benchmark {
         s = hooks.OpenTransactionDB(options, txn_db_options, db_name, &ptr);
       }
       if (s.ok()) {
+        db->db_owner.reset(ptr);
         db->db = ptr;
       }
     } else if (FLAGS_use_blob_db) {
       // Stacked BlobDB
       blob_db::BlobDBOptions blob_db_options;
       blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
-      blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
-      blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
       blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
       blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
-      blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
-      blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
       blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
-      blob_db_options.compression = FLAGS_blob_db_compression_type_e;
       blob_db::BlobDB* ptr = nullptr;
       s = hooks.Open(options, blob_db_options, db_name, &ptr);
       if (s.ok()) {
+        db->db_owner.reset(ptr);
         db->db = ptr;
       }
     } else if (FLAGS_use_secondary_db) {
@@ -5018,7 +5219,10 @@ class Benchmark {
         FLAGS_secondary_path = default_secondary_path;
       }
       s = hooks.OpenAsSecondary(options, db_name, FLAGS_secondary_path,
-                                &db->db);
+                                &db->db_owner);
+      if (s.ok()) {
+        db->db = db->db_owner.get();
+      }
       if (s.ok() && FLAGS_secondary_update_interval > 0) {
         secondary_update_thread_.reset(new port::Thread(
             [this](int interval, DBWithColumnFamilies* _db) {
@@ -5038,13 +5242,16 @@ class Benchmark {
             FLAGS_secondary_update_interval, db));
       }
     } else if (FLAGS_open_as_follower) {
-      std::unique_ptr<DB> dbptr;
-      s = hooks.OpenAsFollower(options, db_name, FLAGS_leader_path, &dbptr);
+      s = hooks.OpenAsFollower(options, db_name, FLAGS_leader_path,
+                               &db->db_owner);
       if (s.ok()) {
-        db->db = dbptr.release();
+        db->db = db->db_owner.get();
       }
     } else {
-      s = hooks.Open(options, db_name, &db->db);
+      s = hooks.Open(options, db_name, &db->db_owner);
+      if (s.ok()) {
+        db->db = db->db_owner.get();
+      }
     }
     if (FLAGS_report_open_timing) {
       std::cout << "OpenDb:     "
@@ -5053,7 +5260,7 @@ class Benchmark {
     }
     if (!s.ok()) {
       fprintf(stderr, "open error: %s\n", s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
   }
 
@@ -5076,6 +5283,206 @@ class Benchmark {
     DoWrite(thread, UNIQUE_RANDOM);
   }
 
+  void OpenAndCompact(ThreadState* thread) {
+    if (thread->tid != 0) {
+      return;
+    }
+
+    int current_run = ++openandcompact_run_counter;
+    bool is_odd_run = (current_run % 2 == 1);
+
+    if (FLAGS_openandcompact_test_cancel_on_odd) {
+      const char* even_description = FLAGS_openandcompact_allow_resumption
+                                         ? "even - resume"
+                                         : "even - normal";
+      fprintf(stdout, "\n--- Run %d (%s) ---\n", current_run,
+              is_odd_run ? "odd - will cancel" : even_description);
+    }
+
+    Status create_status =
+        db_.db->GetEnv()->CreateDirIfMissing(FLAGS_secondary_path);
+    if (!create_status.ok()) {
+      fprintf(stderr, "Failed to create secondary path: %s\n",
+              create_status.ToString().c_str());
+      return;
+    }
+
+    std::string options_file;
+    Status options_status =
+        GetLatestOptionsFileName(FLAGS_db, db_.db->GetEnv(), &options_file);
+    if (!options_status.ok()) {
+      fprintf(stderr, "FAILED: Cannot find OPTIONS file in %s: %s\n",
+              FLAGS_db.c_str(), options_status.ToString().c_str());
+      return;
+    }
+
+    uint64_t options_file_number;
+    FileType type;
+    if (!ParseFileName(options_file, &options_file_number, &type) ||
+        type != kOptionsFile) {
+      fprintf(stderr, "FAILED: Cannot parse OPTIONS file number from %s\n",
+              options_file.c_str());
+      return;
+    }
+
+    CompactionServiceInput compaction_input;
+    compaction_input.cf_name = kDefaultColumnFamilyName;
+
+    std::vector<std::string> input_file_names;
+    ColumnFamilyMetaData cf_meta;
+    db_.db->GetColumnFamilyMetaData(&cf_meta);
+
+    uint64_t total_input_keys = 0;
+    uint64_t total_input_files = 0;
+
+    // Collect files from all levels for full compaction
+    for (const auto& level : cf_meta.levels) {
+      for (const auto& file : level.files) {
+        input_file_names.push_back(file.name);
+        total_input_keys += file.num_entries;
+        total_input_files++;
+      }
+    }
+
+    // Set output level to configured bottom level (num_levels - 1)
+    compaction_input.output_level = FLAGS_num_levels - 1;
+    compaction_input.db_id = "db_bench_openandcompact";
+    compaction_input.options_file_number = options_file_number;
+
+    compaction_input.input_files = input_file_names;
+
+    std::string input_string;
+    Status serialize_status = compaction_input.Write(&input_string);
+    if (!serialize_status.ok()) {
+      fprintf(stderr, "FAILED: Cannot serialize compaction input: %s\n",
+              serialize_status.ToString().c_str());
+      return;
+    }
+
+    fprintf(stdout, "\nInput files: %" PRIu64 " files, %" PRIu64 " keys\n",
+            total_input_files, total_input_keys);
+
+    std::string output_directory =
+        FLAGS_secondary_path + "/openandcompact_" + std::to_string(thread->tid);
+
+    // Always clean up in odd run, depending on
+    // !FLAGS_openandcompact_allow_resumption in even run
+    bool should_cleanup = is_odd_run || !FLAGS_openandcompact_allow_resumption;
+
+    if (should_cleanup) {
+      std::vector<std::string> children;
+      Status list_status = FLAGS_env->GetChildren(output_directory, &children);
+      if (list_status.ok()) {
+        for (const auto& child : children) {
+          if (child != "." && child != "..") {
+            std::string child_path = output_directory + "/" + child;
+            Status del_status = FLAGS_env->DeleteFile(child_path);
+            if (!del_status.ok()) {
+              fprintf(stderr, "Warning: Failed to delete file %s: %s\n",
+                      child_path.c_str(), del_status.ToString().c_str());
+            }
+          }
+        }
+        Status del_dir_status = FLAGS_env->DeleteDir(output_directory);
+        if (!del_dir_status.ok()) {
+          fprintf(stderr, "Warning: Failed to delete directory %s: %s\n",
+                  output_directory.c_str(), del_dir_status.ToString().c_str());
+        }
+      }
+    }
+
+    Status create_output_status =
+        FLAGS_env->CreateDirIfMissing(output_directory);
+    if (!create_output_status.ok()) {
+      fprintf(stderr, "Failed to create output directory %s: %s\n",
+              output_directory.c_str(),
+              create_output_status.ToString().c_str());
+      return;
+    }
+
+    std::string result_string;
+
+    CompactionServiceOptionsOverride options_override;
+    options_override.env = FLAGS_env;
+    BlockBasedTableOptions table_options;
+    options_override.table_factory.reset(
+        NewBlockBasedTableFactory(table_options));
+
+    OpenAndCompactOptions options;
+    std::atomic<bool> should_cancel{false};
+    options.canceled = &should_cancel;
+    options.allow_resumption = FLAGS_openandcompact_allow_resumption;
+
+    Status s;
+    uint64_t start_time = FLAGS_env->NowMicros();
+    uint64_t end_time = start_time;
+
+    if (FLAGS_openandcompact_test_cancel_on_odd && is_odd_run) {
+      std::thread compaction_thread([&]() {
+        s = DB::OpenAndCompact(options, FLAGS_db, output_directory,
+                               input_string, &result_string, options_override);
+        end_time = FLAGS_env->NowMicros();
+      });
+
+      std::thread cancellation_timer([&]() {
+        std::this_thread::sleep_for(std::chrono::milliseconds(
+            FLAGS_openandcompact_cancel_after_millseconds));
+        should_cancel.store(true);
+      });
+
+      compaction_thread.join();
+      cancellation_timer.join();
+    } else {
+      // Normal synchronous operation for even runs or when test_cancel_on_odd
+      // is false
+      s = DB::OpenAndCompact(options, FLAGS_db, output_directory, input_string,
+                             &result_string, options_override);
+      end_time = FLAGS_env->NowMicros();
+    }
+
+    uint64_t latency_micros = end_time - start_time;
+    double latency_seconds = latency_micros / 1000000.0;
+
+    fprintf(stdout,
+            "OpenAndCompact() API call : %.3f micros/op %.3f seconds/op\n",
+            (double)latency_micros, latency_seconds);
+
+    fprintf(stdout, "OpenAndCompact status: %s\n", s.ToString().c_str());
+
+    if (FLAGS_openandcompact_test_cancel_on_odd && is_odd_run) {
+      if (!s.IsManualCompactionPaused()) {
+        fprintf(stdout, "Fail to cancel compaction");
+      }
+      return;
+    } else if (!s.ok()) {
+      fprintf(stderr, "OpenAndCompact failed: %s\n", s.ToString().c_str());
+      return;
+    }
+
+    CompactionServiceResult compaction_result;
+    Status parse_status =
+        CompactionServiceResult::Read(result_string, &compaction_result);
+    if (parse_status.ok()) {
+      uint64_t total_output_size = 0;
+      for (const auto& output_file : compaction_result.output_files) {
+        total_output_size += output_file.file_size;
+      }
+
+      uint64_t num_output_files = compaction_result.output_files.size();
+      uint64_t avg_output_file_size =
+          num_output_files > 0 ? total_output_size / num_output_files : 0;
+
+      fprintf(stdout,
+              "Output: %" PRIu64 " files, average size: %" PRIu64
+              " bytes (%.2f MB)\n",
+              num_output_files, avg_output_file_size,
+              avg_output_file_size / (1024.0 * 1024.0));
+    } else {
+      fprintf(stderr, "Failed to parse compaction result: %s\n",
+              parse_status.ToString().c_str());
+    }
+  }
+
   class KeyGenerator {
    public:
     KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
@@ -5652,7 +6059,7 @@ class Benchmark {
         if (sorted_runs[i].size() < num_levels - 1) {
           fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                   num_levels);
-          exit(1);
+          db_bench_exit(1);
         }
       }
       for (size_t i = 0; i < num_db; i++) {
@@ -5707,7 +6114,7 @@ class Benchmark {
         if (sorted_runs[i].size() < num_levels) {
           fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                   num_levels);
-          exit(1);
+          db_bench_exit(1);
         }
       }
       for (size_t i = 0; i < num_db; i++) {
@@ -6317,6 +6724,75 @@ class Benchmark {
     thread->stats.AddMessage(msg);
   }
 
+  void MultiScan(ThreadState* thread) {
+    const int64_t scan_size = FLAGS_seek_nexts ? FLAGS_seek_nexts : 50;
+    const int64_t readahead =
+        FLAGS_readahead_size ? FLAGS_readahead_size : 1024 * 24;
+    const int64_t multiscan_size = FLAGS_multiscan_size;
+    auto count_hist = std::make_shared<HistogramImpl>();
+    ReadOptions options = read_options_;
+
+    int64_t multiscans_done = 0;
+
+    options.async_io = true;
+    options.readahead_size = readahead;
+
+    Duration duration(FLAGS_duration, reads_);
+    int64_t num_keys = 1;
+    while (!duration.Done(num_keys)) {
+      DB* db = SelectDB(thread);
+      MultiScanArgs opts(open_options_.comparator);
+      opts.io_coalesce_threshold = FLAGS_multiscan_coalesce_threshold;
+      opts.use_async_io = FLAGS_multiscan_use_async_io;
+      std::vector<std::unique_ptr<const char[]>> guards;
+      opts.reserve(multiscan_size);
+      // We create 1 random start, and then multiscan will start from that
+      // random start point And create a set of scans of `scan_size` in size
+      // with `multiscan_stride` space between each scan.
+      uint64_t range = static_cast<uint64_t>(FLAGS_num) -
+                       ((scan_size + FLAGS_multiscan_stride) * multiscan_size);
+      uint64_t start_key = thread->rand.Uniform(range);
+      for (int64_t i = 0; i < multiscan_size; i++) {
+        std::unique_ptr<const char[]> skey_guard;
+        Slice skey = AllocateKey(&skey_guard);
+        guards.push_back(std::move(skey_guard));
+        std::unique_ptr<const char[]> ekey_guard;
+        Slice ekey = AllocateKey(&ekey_guard);
+        guards.push_back(std::move(ekey_guard));
+
+        GenerateKeyFromInt(start_key, FLAGS_num, &skey);
+        uint64_t end_key = start_key + scan_size;
+        GenerateKeyFromInt(end_key, FLAGS_num, &ekey);
+
+        opts.insert(skey, ekey);
+        start_key += scan_size + FLAGS_multiscan_stride;
+      }
+
+      auto iter =
+          db->NewMultiScan(read_options_, db->DefaultColumnFamily(), opts);
+      int64_t keys = 0;
+      for (auto rng : *iter) {
+        for ([[maybe_unused]] auto it : rng) {
+          keys++;
+        }
+        assert(keys > 0);
+      }
+      num_keys = std::max<int64_t>(1, keys);
+
+      if (thread->shared->read_rate_limiter.get() != nullptr) {
+        thread->shared->read_rate_limiter->Request(
+            1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
+      }
+
+      thread->stats.FinishedOps(nullptr, db, 1, kMultiScan);
+      multiscans_done += 1;
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(multscans:%" PRIu64 ")", multiscans_done);
+    thread->stats.AddMessage(msg);
+  }
+
   void ApproximateMemtableStats(ThreadState* thread) {
     const size_t batch_size = entries_per_batch_;
     std::unique_ptr<const char[]> skey_guard;
@@ -7016,7 +7492,7 @@ class Benchmark {
       thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
       if (!s.ok()) {
         fprintf(stderr, "del error: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
       i += entries_per_batch_;
     }
@@ -7132,7 +7608,7 @@ class Benchmark {
 
       if (!s.ok()) {
         fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
       bytes += key.size() + val.size() + user_timestamp_size_;
       thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
@@ -7159,7 +7635,7 @@ class Benchmark {
                                &expanded_keys[offset]);
             if (!db->Delete(write_options_, expanded_keys[offset]).ok()) {
               fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
-              exit(1);
+              db_bench_exit(1);
             }
           }
         } else {
@@ -7170,7 +7646,7 @@ class Benchmark {
                                begin_key, end_key)
                    .ok()) {
             fprintf(stderr, "deleterange error: %s\n", s.ToString().c_str());
-            exit(1);
+            db_bench_exit(1);
           }
         }
         thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
@@ -7404,7 +7880,7 @@ class Benchmark {
         Status s = PutMany(db, write_options_, key, gen.Generate());
         if (!s.ok()) {
           fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
-          exit(1);
+          db_bench_exit(1);
         }
         put_weight--;
         puts_done++;
@@ -7413,7 +7889,7 @@ class Benchmark {
         Status s = DeleteMany(db, write_options_, key);
         if (!s.ok()) {
           fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
-          exit(1);
+          db_bench_exit(1);
         }
         delete_weight--;
         deletes_done++;
@@ -7557,7 +8033,7 @@ class Benchmark {
       }
       if (!s.ok()) {
         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
       bytes += key.size() + val.size() + user_timestamp_size_;
       thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
@@ -7604,7 +8080,7 @@ class Benchmark {
       } else if (!status.IsNotFound()) {
         fprintf(stderr, "Get returned an error: %s\n",
                 status.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
 
       Slice value =
@@ -7742,7 +8218,7 @@ class Benchmark {
 
       if (!s.ok()) {
         fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
       bytes += key.size() + val.size();
       thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
@@ -7784,7 +8260,7 @@ class Benchmark {
         Status s = db->Merge(write_options_, key, gen.Generate());
         if (!s.ok()) {
           fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
-          exit(1);
+          db_bench_exit(1);
         }
         num_merges++;
         thread->stats.FinishedOps(nullptr, db, 1, kMerge);
@@ -7976,7 +8452,7 @@ class Benchmark {
     Status s = db->VerifyChecksum(ro);
     if (!s.ok()) {
       fprintf(stderr, "VerifyChecksum() failed: %s\n", s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
   }
 
@@ -7993,7 +8469,7 @@ class Benchmark {
     if (!s.ok()) {
       fprintf(stderr, "VerifyFileChecksums() failed: %s\n",
               s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
   }
 
@@ -8117,7 +8593,7 @@ class Benchmark {
       }
       if (!s.ok()) {
         fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
     }
 
@@ -8155,7 +8631,7 @@ class Benchmark {
 
       if (!s.ok()) {
         fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
 
       thread->stats.FinishedOps(nullptr, db, 1, kOthers);
@@ -8462,7 +8938,7 @@ class Benchmark {
 
       if (!s.ok()) {
         fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
     } else {
       for (const auto& db_with_cfh : multi_dbs_) {
@@ -8476,7 +8952,7 @@ class Benchmark {
 
         if (!s.ok()) {
           fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
-          exit(1);
+          db_bench_exit(1);
         }
       }
     }
@@ -8592,7 +9068,7 @@ class Benchmark {
           "Encountered an error creating a TraceReader from the trace file. "
           "Error: %s\n",
           s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
     std::unique_ptr<Replayer> replayer;
     s = db_with_cfh->db->NewDefaultReplayer(db_with_cfh->cfh,
@@ -8602,7 +9078,7 @@ class Benchmark {
               "Encountered an error creating a default Replayer. "
               "Error: %s\n",
               s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
     s = replayer->Prepare();
     if (!s.ok()) {
@@ -8668,6 +9144,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ConfigOptions config_options;
   static bool initialized = false;
+  hooks_ = &hooks;
   if (!initialized) {
     SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                     " [OPTIONS]...");
@@ -8680,7 +9157,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
   if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
     fprintf(stderr,
             "Cannot provide both --statistics and --statistics_string.\n");
-    exit(1);
+    db_bench_exit(1);
   }
   if (!FLAGS_statistics_string.empty()) {
     Status s = Statistics::CreateFromString(config_options,
@@ -8689,7 +9166,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
       fprintf(stderr,
               "No Statistics registered matching string: %s status=%s\n",
               FLAGS_statistics_string.c_str(), s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
   }
   if (FLAGS_statistics) {
@@ -8721,14 +9198,10 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
   FLAGS_compressed_secondary_cache_compression_type_e = StringToCompressionType(
       FLAGS_compressed_secondary_cache_compression_type.c_str());
 
-  // Stacked BlobDB
-  FLAGS_blob_db_compression_type_e =
-      StringToCompressionType(FLAGS_blob_db_compression_type.c_str());
-
   int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
   if (env_opts > 1) {
     fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n");
-    exit(1);
+    db_bench_exit(1);
   }
 
   if (env_opts == 1) {
@@ -8736,7 +9209,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
                                   &FLAGS_env, &env_guard);
     if (!s.ok()) {
       fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
   } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") {
     //**TODO: Make the simulate fs something that can be loaded
@@ -8757,7 +9230,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
     std::string build_info;
     std::cout << GetRocksBuildInfoAsString(build_info, true) << std::endl;
     // Similar to --version, nothing else will be done when this flag is set
-    exit(0);
+    db_bench_exit(0);
   }
 
   if (!FLAGS_seed) {
@@ -8773,7 +9246,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
     fprintf(stderr,
             "`-use_existing_db` must be true for `-use_existing_keys` to be "
             "settable\n");
-    exit(1);
+    db_bench_exit(1);
   }
 
   FLAGS_value_size_distribution_type_e =
@@ -8812,7 +9285,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
 
   if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
     fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
-    exit(1);
+    db_bench_exit(1);
   }
 
   ROCKSDB_NAMESPACE::Benchmark benchmark;
diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc
index e2546ff1c173..1b68e5dbfebf 100644
--- a/tools/db_bench_tool_test.cc
+++ b/tools/db_bench_tool_test.cc
@@ -253,7 +253,6 @@ const std::string options_file_content = R"OPTIONS_FILE(
   level0_slowdown_writes_trigger=50
   level0_file_num_compaction_trigger=10
   expanded_compaction_factor=25
-  max_write_buffer_number_to_maintain=0
   max_write_buffer_size_to_maintain=0
   verify_checksums_in_compaction=true
   merge_operator=nullptr
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 1f618c82321b..ff0b1c998404 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -5,12 +5,83 @@
 import math
 import os
 import random
+import shlex
 import shutil
 import subprocess
 import sys
 import tempfile
 import time
 
+per_iteration_random_seed_override = 0
+remain_argv = None
+is_remote_db = False
+
+
+def get_random_seed(override):
+    if override == 0:
+        return random.randint(1, 2**64)
+    else:
+        return override
+
+
+def quote_arg_for_display(arg):
+    """
+    Quote only the value after '=' for shell display.
+    This makes the printed command safe to copy/paste into a Unix shell.
+    Note: shlex is Unix-focused; Non-Unix shell users may need to adjust quoting after copying.
+    """
+    if "=" not in arg:
+        return arg
+    flag, value = arg.split("=", 1)
+    return f"{flag}={shlex.quote(value)}"
+
+
+def early_argument_parsing_before_main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--initial_random_seed_override",
+        default=0,
+        type=int,
+        help="Random seed used for initialize the test parameters at the beginning of stress test run",
+    )
+    # sometimes the failure appeared after a few iteration, to reproduce the error, we have to wait for the test to run
+    # multiple iterations to reach the iteration that fails the test. By overriding the seed used within each iteration,
+    # we could skip all the previous iterations.
+    parser.add_argument(
+        "--per_iteration_random_seed_override",
+        default=0,
+        type=int,
+        help="Random seed used for initialize the test parameters in each iteration of the stress test run",
+    )
+
+    global remain_args
+    args, remain_args = parser.parse_known_args()
+    init_random_seed = get_random_seed(args.initial_random_seed_override)
+    global per_iteration_random_seed_override
+    per_iteration_random_seed_override = args.per_iteration_random_seed_override
+    global is_remote_db
+    # Set is_remote_db if remain_args has a non-empty --env_uri= or --fs_uri= argument
+    for arg in remain_args:
+        parts = arg.split("=", 1)
+        if parts[0] in ["--env_uri", "--fs_uri"] and len(parts) > 1 and parts[1]:
+            is_remote_db = True
+            break
+
+    print(f"Start with random seed {init_random_seed}")
+    random.seed(init_random_seed)
+
+
+def apply_random_seed_per_iteration():
+    per_iteration_random_seed = get_random_seed(per_iteration_random_seed_override)
+    print(f"Use random seed for iteration {per_iteration_random_seed}")
+    random.seed(per_iteration_random_seed)
+
+
+# Random seed has to be setup before the rest of the script, so that the random
+# value selected in the global variable uses the random seed specified. More
+# arguments can also be parsed early.
+early_argument_parsing_before_main()
+
 # params overwrite priority:
 #   for default:
 #       default_params < {blackbox,whitebox}_default_params < args
@@ -61,7 +132,7 @@
     ),
     "compression_max_dict_bytes": lambda: 16384 * random.randint(0, 1),
     "compression_zstd_max_train_bytes": lambda: 65536 * random.randint(0, 1),
-    "compression_parallel_threads": lambda: random.choice([1] * 3 + [4, 8, 16]),
+    "compression_parallel_threads": lambda: random.choice([1, 1, 2, 3, 4, 5, 8, 9, 16]),
     "compression_max_dict_buffer_bytes": lambda: (1 << random.randint(0, 40)) - 1,
     "compression_use_zstd_dict_trainer": lambda: random.randint(0, 1),
     "compression_checksum": lambda: random.randint(0, 1),
@@ -80,6 +151,7 @@
     "destroy_db_initially": 0,
     "enable_pipelined_write": lambda: random.randint(0, 1),
     "enable_compaction_filter": lambda: random.choice([0, 0, 0, 1]),
+    "enable_compaction_on_deletion_trigger": lambda: random.choice([0, 0, 0, 1]),
     # `inplace_update_support` is incompatible with DB that has delete
     # range data in memtables.
     # Such data can result from any of the previous db stress runs
@@ -91,7 +163,6 @@
     # (see below `finalize_and_sanitize`).
     "inplace_update_support": random.choice([0] * 9 + [1]),
     "expected_values_dir": lambda: setup_expected_values_dir(),
-    "fail_if_options_file_error": lambda: random.randint(0, 1),
     "flush_one_in": lambda: random.choice([1000, 1000000]),
     "manual_wal_flush_one_in": lambda: random.choice([0, 1000]),
     "file_checksum_impl": lambda: random.choice(["none", "crc32c", "xxh64", "big"]),
@@ -103,18 +174,20 @@
     "get_current_wal_file_one_in": 0,
     # Temporarily disable hash index
     "index_type": lambda: random.choice([0, 0, 0, 2, 2, 3]),
+    "index_block_search_type": lambda: random.choice([0, 1]),
     "ingest_external_file_one_in": lambda: random.choice([1000, 1000000]),
     "test_ingest_standalone_range_deletion_one_in": lambda: random.choice([0, 5, 10]),
     "iterpercent": 10,
     "lock_wal_one_in": lambda: random.choice([10000, 1000000]),
     "mark_for_compaction_one_file_in": lambda: 10 * random.randint(0, 1),
-    "max_background_compactions": 20,
+    "max_background_compactions": lambda: random.choice([2, 20]),
+    "num_bottom_pri_threads": lambda: random.choice([0, 1, 20]),
     "max_bytes_for_level_base": 10485760,
     # max_key has to be the same across invocations for verification to work, hence no lambda
     "max_key": random.choice([100000, 25000000]),
     "max_sequential_skip_in_iterations": lambda: random.choice([1, 2, 8, 16]),
     "max_write_buffer_number": 3,
-    "mmap_read": lambda: random.randint(0, 1),
+    "mmap_read": lambda: random.choice([0, 0, 1]),
     # Setting `nooverwritepercent > 0` is only possible because we do not vary
     # the random seed, so the same keys are chosen by every run for disallowing
     # overwrites.
@@ -127,6 +200,7 @@
     "pause_background_one_in": lambda: random.choice([10000, 1000000]),
     "disable_file_deletions_one_in": lambda: random.choice([10000, 1000000]),
     "disable_manual_compaction_one_in": lambda: random.choice([10000, 1000000]),
+    "abort_and_resume_compactions_one_in": lambda: random.choice([10000, 1000000]),
     "prefix_size": lambda: random.choice([-1, 1, 5, 7, 8]),
     "prefixpercent": 5,
     "progress_reports": 0,
@@ -178,7 +252,7 @@
     "verify_checksum": 1,
     "write_buffer_size": lambda: random.choice([1024 * 1024, 4 * 1024 * 1024]),
     "writepercent": 35,
-    "format_version": lambda: random.choice([2, 3, 4, 5, 6, 6]),
+    "format_version": lambda: random.choice([2, 3, 4, 5, 6, 7, 7]),
     "index_block_restart_interval": lambda: random.choice(range(1, 16)),
     "use_multiget": lambda: random.randint(0, 1),
     "use_get_entity": lambda: random.choice([0] * 7 + [1]),
@@ -191,11 +265,14 @@
     "stats_dump_period_sec": lambda: random.choice([0, 10, 600]),
     "compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
     "fifo_allow_compaction": lambda: random.randint(0, 1),
+    "fifo_compaction_max_data_files_size_mb": lambda: random.choice([0, 100, 500]),
+    "fifo_compaction_use_kv_ratio_compaction": lambda: random.randint(0, 1),
     # Test small max_manifest_file_size in a smaller chance, as most of the
     # time we wnat manifest history to be preserved to help debug
     "max_manifest_file_size": lambda: random.choice(
-        [t * 16384 if t < 3 else 1024 * 1024 * 1024 for t in range(1, 30)]
+        [t * 2048 if t < 5 else 1024 * 1024 * 1024 for t in range(1, 30)]
     ),
+    "max_manifest_space_amp_pct": lambda: random.choice([0, 10, 100, 1000]),
     # Sync mode might make test runs slower so running it in a smaller chance
     "sync": lambda: random.choice([1 if t == 0 else 0 for t in range(0, 20)]),
     "bytes_per_sync": lambda: random.choice([0, 262144]),
@@ -307,7 +384,6 @@
     "index_shortening": lambda: random.choice([0, 1, 2]),
     "metadata_charge_policy": lambda: random.choice([0, 1]),
     "use_adaptive_mutex_lru": lambda: random.choice([0, 1]),
-    "compress_format_version": lambda: random.choice([1, 2]),
     "manifest_preallocation_size": lambda: random.choice([0, 5 * 1024]),
     "enable_checksum_handoff": lambda: random.choice([0, 1]),
     "max_total_wal_size": lambda: random.choice([0] * 4 + [64 * 1024 * 1024]),
@@ -322,17 +398,21 @@
     "enable_sst_partitioner_factory": lambda: random.choice([0, 1]),
     "enable_do_not_compress_roles": lambda: random.choice([0, 1]),
     "block_align": lambda: random.choice([0, 1]),
+    "super_block_alignment_size": lambda: random.choice(
+        [0, 128 * 1024, 512 * 1024, 2 * 1024 * 1024]
+    ),
+    "super_block_alignment_space_overhead_ratio": lambda: random.choice([0, 32, 4096]),
     "lowest_used_cache_tier": lambda: random.choice([0, 1, 2]),
     "enable_custom_split_merge": lambda: random.choice([0, 1]),
     "adm_policy": lambda: random.choice([0, 1, 2, 3]),
     "last_level_temperature": lambda: random.choice(
-        ["kUnknown", "kHot", "kWarm", "kCold"]
+        ["kUnknown", "kHot", "kWarm", "kCool", "kCold", "kIce"]
     ),
     "default_write_temperature": lambda: random.choice(
-        ["kUnknown", "kHot", "kWarm", "kCold"]
+        ["kUnknown", "kHot", "kWarm", "kCool", "kCold", "kIce"]
     ),
     "default_temperature": lambda: random.choice(
-        ["kUnknown", "kHot", "kWarm", "kCold"]
+        ["kUnknown", "kHot", "kWarm", "kCool", "kCold", "kIce"]
     ),
     # TODO(hx235): enable `enable_memtable_insert_with_hint_prefix_extractor`
     # after fixing the surfaced issue with delete range
@@ -342,20 +422,42 @@
     "use_timed_put_one_in": lambda: random.choice([0] * 7 + [1, 5, 10]),
     "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]),
     "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
+    "memtable_veirfy_per_key_checksum_on_seek": lambda: random.choice([0] * 7 + [1]),
     "allow_unprepared_value": lambda: random.choice([0, 1]),
-    # TODO(hx235): enable `track_and_verify_wals` again after resolving the issues
-    # it has with write fault injection and TXN
-    "track_and_verify_wals": 0,
-    "enable_remote_compaction": lambda: random.choice([0, 1]),
+    # TODO(hx235): enable `track_and_verify_wals` after stabalizing the stress test
+    "track_and_verify_wals": lambda: random.choice([0]),
+    "remote_compaction_worker_threads": lambda: random.choice([0, 8]),
+    "allow_resumption_one_in": lambda: random.choice([0, 1, 2, 20]),
+    # TODO(jaykorean): Change to lambda: random.choice([0, 1]) after addressing all remote compaction failures
+    "remote_compaction_failure_fall_back_to_local": 1,
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
+    "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
+    "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),
+    "ingest_wbwi_one_in": lambda: random.choice([0, 0, 100, 500]),
+    "universal_reduce_file_locking": lambda: random.randint(0, 1),
+    "compression_manager": lambda: random.choice(
+        ["mixed"] * 1
+        + ["none"] * 2
+        + ["autoskip"] * 2
+        + ["randommixed"] * 2
+        + ["custom"] * 3
+    ),
+    # fixed within a run for easier debugging
+    # actual frequency is lower after option sanitization
+    "use_multiscan": random.choice([1] + [0] * 3),
+    # By default, `statistics` use kExceptDetailedTimers level
+    "statistics": random.choice([0, 1]),
+    # TODO: re-enable after resolving "Req failed: Unknown error -14" errors
+    "multiscan_use_async_io": 0,  # random.randint(0, 1),
 }
+
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
 # If TEST_TMPDIR_EXPECTED is not specified, default value will be TEST_TMPDIR
+# except on remote filesystem
 _TEST_EXPECTED_DIR_ENV_VAR = "TEST_TMPDIR_EXPECTED"
 _DEBUG_LEVEL_ENV_VAR = "DEBUG_LEVEL"
 
 stress_cmd = "./db_stress"
-cleanup_cmd = None
 
 
 def is_release_mode():
@@ -369,15 +471,8 @@ def get_dbname(test_name):
         dbname = tempfile.mkdtemp(prefix=test_dir_name)
     else:
         dbname = test_tmpdir + "/" + test_dir_name
-        shutil.rmtree(dbname, True)
-        if cleanup_cmd is not None:
-            print("Running DB cleanup command - %s\n" % cleanup_cmd)
-            # Ignore failure
-            os.system(cleanup_cmd)
-        try:
-            os.mkdir(dbname)
-        except OSError:
-            pass
+        if not is_remote_db:
+            os.makedirs(dbname, exist_ok=True)
     return dbname
 
 
@@ -391,9 +486,7 @@ def setup_expected_values_dir():
     expected_dir_prefix = "rocksdb_crashtest_expected_"
     test_exp_tmpdir = os.environ.get(_TEST_EXPECTED_DIR_ENV_VAR)
 
-    # set the value to _TEST_DIR_ENV_VAR if _TEST_EXPECTED_DIR_ENV_VAR is not
-    # specified.
-    if test_exp_tmpdir is None or test_exp_tmpdir == "":
+    if not is_remote_db and (test_exp_tmpdir is None or test_exp_tmpdir == ""):
         test_exp_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
 
     if test_exp_tmpdir is None or test_exp_tmpdir == "":
@@ -417,9 +510,7 @@ def setup_multiops_txn_key_spaces_file():
     key_spaces_file_prefix = "rocksdb_crashtest_multiops_txn_key_spaces"
     test_exp_tmpdir = os.environ.get(_TEST_EXPECTED_DIR_ENV_VAR)
 
-    # set the value to _TEST_DIR_ENV_VAR if _TEST_EXPECTED_DIR_ENV_VAR is not
-    # specified.
-    if test_exp_tmpdir is None or test_exp_tmpdir == "":
+    if not is_remote_db and (test_exp_tmpdir is None or test_exp_tmpdir == ""):
         test_exp_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
 
     if test_exp_tmpdir is None or test_exp_tmpdir == "":
@@ -436,12 +527,17 @@ def setup_multiops_txn_key_spaces_file():
 
 
 def is_direct_io_supported(dbname):
-    with tempfile.NamedTemporaryFile(dir=dbname) as f:
-        try:
-            os.open(f.name, os.O_DIRECT)
-        except BaseException:
-            return False
-        return True
+    if is_remote_db:
+        return False
+    else:
+        # Note: db dir might be removed on check_mode change. Re-create it
+        os.makedirs(dbname, exist_ok=True)
+        with tempfile.NamedTemporaryFile(dir=dbname) as f:
+            try:
+                os.open(f.name, os.O_DIRECT)
+            except BaseException:
+                return False
+            return True
 
 
 blackbox_default_params = {
@@ -455,7 +551,7 @@ def is_direct_io_supported(dbname):
     # since we will be killing anyway, use large value for ops_per_thread
     "ops_per_thread": 100000000,
     "reopen": 0,
-    "set_options_one_in": 2000,
+    "set_options_one_in": 1000,
 }
 
 whitebox_default_params = {
@@ -468,6 +564,8 @@ def is_direct_io_supported(dbname):
     #
     # Second, we need to make sure disabling WAL works with `-reopen > 0`.
     "disable_wal": 0,
+    # TODO: Re-enable this once we fix WAL + Remote Compaction in Stress Test
+    "remote_compaction_worker_threads": 0,
     "duration": 10000,
     "log2_keys_per_lock": 10,
     "ops_per_thread": 200000,
@@ -515,6 +613,7 @@ def is_direct_io_supported(dbname):
     "ingest_external_file_one_in": 0,
     # `CfConsistencyStressTest::TestIterateAgainstExpected()` is not implemented.
     "verify_iterator_with_expected_state_one_in": 0,
+    "memtablerep": random.choice(["skip_list"] * 9 + ["vector"]),
 }
 
 # For pessimistic transaction db
@@ -522,11 +621,15 @@ def is_direct_io_supported(dbname):
     "use_txn": 1,
     "use_optimistic_txn": 0,
     # Avoid lambda to set it once for the entire test
+    # NOTE: often passed in from command line overriding this
     "txn_write_policy": random.randint(0, 2),
     "unordered_write": random.randint(0, 1),
+    "use_per_key_point_lock_mgr": lambda: random.choice([0, 1]),
     # TODO: there is such a thing as transactions with WAL disabled. We should
     # cover that case.
     "disable_wal": 0,
+    # TODO: Re-enable this once we fix WAL + Remote Compaction in Stress Test
+    "remote_compaction_worker_threads": 0,
     # OpenReadOnly after checkpoint is not currnetly compatible with WritePrepared txns
     "checkpoint_one_in": 0,
     # pipeline write is not currnetly compatible with WritePrepared txns
@@ -584,6 +687,8 @@ def is_direct_io_supported(dbname):
     "use_shared_block_and_blob_cache": lambda: random.randint(0, 1),
     "blob_cache_size": lambda: random.choice([1048576, 2097152, 4194304, 8388608]),
     "prepopulate_blob_cache": lambda: random.randint(0, 1),
+    # TODO Fix races when both Remote Compaction + BlobDB enabled
+    "remote_compaction_worker_threads": 0,
 }
 
 ts_params = {
@@ -593,6 +698,8 @@ def is_direct_io_supported(dbname):
     # Below flag is randomly picked once and kept consistent in following runs.
     "persist_user_defined_timestamps": random.choice([0, 1, 1]),
     "use_merge": 0,
+    # Causing failures and not yet compatible
+    "use_multiscan": 0,
     "use_full_merge_v1": 0,
     "use_txn": 0,
     "ingest_external_file_one_in": 0,
@@ -600,6 +707,9 @@ def is_direct_io_supported(dbname):
     "use_put_entity_one_in": 0,
     # TimedPut is not compatible with user-defined timestamps yet.
     "use_timed_put_one_in": 0,
+    # when test_best_efforts_recovery == true, disable_wal becomes 0.
+    # TODO: Re-enable this once we fix WAL + Remote Compaction in Stress Test
+    "remote_compaction_worker_threads": 0,
 }
 
 tiered_params = {
@@ -609,32 +719,40 @@ def is_direct_io_supported(dbname):
     "preclude_last_level_data_seconds": lambda: random.choice(
         [-1, -1, 10, 60, 1200, 86400]
     ),
-    "last_level_temperature": "kCold",
+    "last_level_temperature": lambda: random.choice(["kCold", "kIce"]),
     # For FIFO compaction (ignored otherwise)
     "file_temperature_age_thresholds": lambda: random.choice(
         [
+            "{{temperature=kWarm;age=10}:{temperature=kCool;age=30}:{temperature=kCold;age=100}:{temperature=kIce;age=300}}",
             "{{temperature=kWarm;age=30}:{temperature=kCold;age=300}}",
             "{{temperature=kCold;age=100}}",
         ]
     ),
+    "allow_trivial_copy_when_change_temperature": lambda: random.choice([0, 1]),
     # tiered storage doesn't support blob db yet
     "enable_blob_files": 0,
     "use_blob_db": 0,
     "default_write_temperature": lambda: random.choice(["kUnknown", "kHot", "kWarm"]),
 }
 
-multiops_txn_default_params = {
+multiops_txn_params = {
     "test_cf_consistency": 0,
     "test_batches_snapshots": 0,
     "test_multi_ops_txns": 1,
     "use_txn": 1,
+    # Avoid lambda to set it once for the entire test
+    # NOTE: often passed in from command line overriding this
+    "txn_write_policy": random.randint(0, 2),
     "two_write_queues": lambda: random.choice([0, 1]),
     # TODO: enable write-prepared
     "disable_wal": 0,
+    # TODO: Re-enable this once we fix WAL + Remote Compaction in Stress Test
+    "remote_compaction_worker_threads": 0,
     "use_only_the_last_commit_time_batch_for_recovery": lambda: random.choice([0, 1]),
     "clear_column_family_one_in": 0,
     "column_families": 1,
-    "enable_pipelined_write": lambda: random.choice([0, 1]),
+    # TODO re-enable pipelined write (lambda: random.choice([0, 1]))
+    "enable_pipelined_write": 0,
     # This test already acquires snapshots in reads
     "acquire_snapshot_one_in": 0,
     "backup_one_in": 0,
@@ -681,34 +799,9 @@ def is_direct_io_supported(dbname):
     "use_timed_put_one_in": 0,
     # AttributeGroup not yet supported
     "use_attribute_group": 0,
-}
-
-multiops_wc_txn_params = {
-    "txn_write_policy": 0,
-    # TODO re-enable pipelined write. Not well tested atm
-    "enable_pipelined_write": 0,
     "commit_bypass_memtable_one_in": random.choice([0] * 4 + [100]),
 }
 
-multiops_wp_txn_params = {
-    "txn_write_policy": 1,
-    "wp_snapshot_cache_bits": 1,
-    # try small wp_commit_cache_bits, e.g. 0 once we explore storing full
-    # commit sequence numbers in commit cache
-    "wp_commit_cache_bits": 10,
-    # pipeline write is not currnetly compatible with WritePrepared txns
-    "enable_pipelined_write": 0,
-    # OpenReadOnly after checkpoint is not currnetly compatible with WritePrepared txns
-    "checkpoint_one_in": 0,
-    # Required to be 1 in order to use commit-time-batch
-    "use_only_the_last_commit_time_batch_for_recovery": 1,
-    "clear_wp_commit_cache_one_in": 10,
-    "create_timestamped_snapshot_one_in": 0,
-    # sequence number can be advanced in SwitchMemtable::WriteRecoverableState() for WP.
-    # disable it for now until we find another way to test LockWAL().
-    "lock_wal_one_in": 0,
-}
-
 
 def finalize_and_sanitize(src_params):
     dest_params = {k: v() if callable(v) else v for (k, v) in src_params.items()}
@@ -722,6 +815,7 @@ def finalize_and_sanitize(src_params):
     if dest_params["mmap_read"] == 1:
         dest_params["use_direct_io_for_flush_and_compaction"] = 0
         dest_params["use_direct_reads"] = 0
+        dest_params["multiscan_use_async_io"] = 0
     if (
         dest_params["use_direct_io_for_flush_and_compaction"] == 1
         or dest_params["use_direct_reads"] == 1
@@ -736,6 +830,14 @@ def finalize_and_sanitize(src_params):
         else:
             dest_params["mock_direct_io"] = True
 
+    if dest_params.get("memtablerep") == "vector":
+        dest_params["inplace_update_support"] = 0
+
+    # only skip list memtable representation supports paranoid memory checks
+    if dest_params.get("memtablerep") != "skip_list":
+        dest_params["paranoid_memory_checks"] = 0
+        dest_params["memtable_veirfy_per_key_checksum_on_seek"] = 0
+
     if dest_params["test_batches_snapshots"] == 1:
         dest_params["enable_compaction_filter"] = 0
         dest_params["inplace_update_support"] = 0
@@ -744,6 +846,7 @@ def finalize_and_sanitize(src_params):
         dest_params["metadata_write_fault_one_in"] = 0
         dest_params["read_fault_one_in"] = 0
         dest_params["metadata_read_fault_one_in"] = 0
+        dest_params["use_multiscan"] = 0
         if dest_params["prefix_size"] < 0:
             dest_params["prefix_size"] = 1
 
@@ -752,12 +855,44 @@ def finalize_and_sanitize(src_params):
     if dest_params.get("best_efforts_recovery") == 1:
         dest_params["inplace_update_support"] = 0
 
+    # Remote Compaction Incompatible Tests and Features
+    if dest_params.get("remote_compaction_worker_threads", 0) > 0:
+        # TODO Fix races when both Remote Compaction + BlobDB enabled
+        dest_params["enable_blob_files"] = 0
+        dest_params["enable_blob_garbage_collection"] = 0
+        dest_params["allow_setting_blob_options_dynamically"] = 0
+        # TODO Fix - Remote worker shouldn't recover from WAL
+        dest_params["disable_wal"] = 1
+        # Disable Incompatible Ones
+        dest_params["inplace_update_support"] = 0
+        dest_params["checkpoint_one_in"] = 0
+        dest_params["use_timed_put_one_in"] = 0
+        dest_params["test_secondary"] = 0
+        dest_params["mmap_read"] = 0
+
+        # Disable database open fault injection to prevent test inefficiency described below.
+        # When fault injection occurs during DB open, the db will wait for compaction
+        # to finish to clean up the database before retrying without injected error.
+        # However remote compaction threads are not yet created at that point
+        # so the db has to wait for the timeout (currently 30 seconds) to fall back to
+        # local compaction in order for the compaction to finish.
+        #
+        # TODO: Consider moving compaction thread creation earlier in the startup sequence
+        # to allow db open fault injection testing without this performance penalty
+        dest_params["open_metadata_write_fault_one_in"] = 0
+        dest_params["open_metadata_read_fault_one_in"] = 0
+        dest_params["open_write_fault_one_in"] = 0
+        dest_params["open_read_fault_one_in"] = 0
+        dest_params["sync_fault_injection"] = 0
+    else:
+        dest_params["allow_resumption_one_in"] = 0
+
     # Multi-key operations are not currently compatible with transactions or
     # timestamp.
     if (
         dest_params.get("test_batches_snapshots") == 1
         or dest_params.get("use_txn") == 1
-        or dest_params.get("user_timestamp_size") > 0
+        or dest_params.get("user_timestamp_size", 0) > 0
     ):
         dest_params["ingest_external_file_one_in"] = 0
     if (
@@ -785,7 +920,7 @@ def finalize_and_sanitize(src_params):
     if (
         dest_params.get("sync_fault_injection") == 1
         or dest_params.get("disable_wal") == 1
-        or dest_params.get("manual_wal_flush_one_in") > 0
+        or dest_params.get("manual_wal_flush_one_in", 0) > 0
     ):
         # File ingestion does not guarantee prefix-recoverability when unsynced
         # data can be lost. Ingesting a file syncs data immediately that is
@@ -802,8 +937,9 @@ def finalize_and_sanitize(src_params):
     # Remove the following once write-prepared/write-unprepared with/without
     # unordered write supports timestamped snapshots
     if dest_params.get("create_timestamped_snapshot_one_in", 0) > 0:
-        dest_params["txn_write_policy"] = 0
         dest_params["unordered_write"] = 0
+        if dest_params.get("txn_write_policy", 0) != 0:
+            dest_params["create_timestamped_snapshot_one_in"] = 0
     # Only under WritePrepared txns, unordered_write would provide the same guarnatees as vanilla rocksdb
     # unordered_write is only enabled with --txn, and txn_params disables inplace_update_support, so
     # setting allow_concurrent_memtable_write=1 won't conflcit with inplace_update_support.
@@ -835,9 +971,20 @@ def finalize_and_sanitize(src_params):
         # Disable irrelevant tiering options
         dest_params["preclude_last_level_data_seconds"] = 0
         dest_params["last_level_temperature"] = "kUnknown"
+        # use_kv_ratio_compaction requires allow_compaction and
+        # max_data_files_size > 0
+        if dest_params.get("fifo_compaction_use_kv_ratio_compaction", 0) == 1:
+            if (
+                dest_params.get("fifo_allow_compaction", 0) != 1
+                or dest_params.get("fifo_compaction_max_data_files_size_mb", 0) == 0
+            ):
+                dest_params["fifo_compaction_use_kv_ratio_compaction"] = 0
     else:
         # Disable irrelevant tiering options
         dest_params["file_temperature_age_thresholds"] = ""
+        # Disable FIFO-specific options for non-FIFO compaction styles
+        dest_params["fifo_compaction_max_data_files_size_mb"] = 0
+        dest_params["fifo_compaction_use_kv_ratio_compaction"] = 0
     if dest_params["partition_filters"] == 1:
         if dest_params["index_type"] != 2:
             dest_params["partition_filters"] = 0
@@ -882,12 +1029,31 @@ def finalize_and_sanitize(src_params):
         dest_params["use_multi_cf_iterator"] = 0
         # only works with write committed policy
         dest_params["commit_bypass_memtable_one_in"] = 0
+        # not compatible with Remote Compaction yet
+        dest_params["remote_compaction_worker_threads"] = 0
     # TODO(hx235): enable test_multi_ops_txns with fault injection after stabilizing the CI
     if dest_params.get("test_multi_ops_txns") == 1:
         dest_params["write_fault_one_in"] = 0
         dest_params["metadata_write_fault_one_in"] = 0
         dest_params["read_fault_one_in"] = 0
         dest_params["metadata_read_fault_one_in"] = 0
+        if dest_params.get("txn_write_policy", 0) != 0:
+            # TODO: should any of this change for WUP (txn_write_policy==2)?
+            dest_params["wp_snapshot_cache_bits"] = 1
+            # try small wp_commit_cache_bits, e.g. 0 once we explore storing full
+            # commit sequence numbers in commit cache
+            dest_params["wp_commit_cache_bits"] = 10
+            # pipeline write is not currnetly compatible with WritePrepared txns
+            dest_params["enable_pipelined_write"] = 0
+            # OpenReadOnly after checkpoint is not currently compatible with WritePrepared txns
+            dest_params["checkpoint_one_in"] = 0
+            # Required to be 1 in order to use commit-time-batch
+            dest_params["use_only_the_last_commit_time_batch_for_recovery"] = 1
+            dest_params["clear_wp_commit_cache_one_in"] = 10
+            # sequence number can be advanced in SwitchMemtable::WriteRecoverableState() for WP.
+            # disable it for now until we find another way to test LockWAL().
+            dest_params["lock_wal_one_in"] = 0
+
     # Wide column stress tests require FullMergeV3
     if dest_params["use_put_entity_one_in"] != 0:
         dest_params["use_full_merge_v1"] = 0
@@ -945,8 +1111,6 @@ def finalize_and_sanitize(src_params):
         # disable atomic flush.
         if dest_params["test_best_efforts_recovery"] == 0:
             dest_params["disable_wal"] = 0
-    if dest_params.get("allow_concurrent_memtable_write", 1) == 1:
-        dest_params["memtablerep"] = "skip_list"
     if (
         dest_params.get("enable_compaction_filter", 0) == 1
         or dest_params.get("inplace_update_support", 0) == 1
@@ -965,28 +1129,68 @@ def finalize_and_sanitize(src_params):
         dest_params["check_multiget_consistency"] = 0
         dest_params["check_multiget_entity_consistency"] = 0
     if dest_params.get("disable_wal") == 0:
-        if dest_params.get("reopen") > 0 or (
-            dest_params.get("manual_wal_flush_one_in")
-            and dest_params.get("column_families") != 1
+        if (
+            dest_params.get("reopen", 0) > 0
+            or (
+                dest_params.get("manual_wal_flush_one_in")
+                and dest_params.get("column_families") != 1
+            )
+            or (
+                dest_params.get("use_txn") != 0
+                and dest_params.get("use_optimistic_txn") == 0
+            )
         ):
-            # Reopen with WAL currently requires persisting WAL data before closing for reopen.
+            # 1. Reopen with WAL currently requires persisting WAL data before closing for reopen.
             # Previous injected WAL write errors may not be cleared by the time of closing and ready
             # for persisting WAL.
             # To simplify, we disable any WAL write error injection.
             # TODO(hx235): support WAL write error injection with reopen
-            # TODO(hx235): support excluding WAL from metadata write fault injection so we don't
-            # have to disable metadata write fault injection to other file
             #
-            # WAL write failure can drop buffered WAL data. This can cause
+            # 2. WAL write failure can drop buffered WAL data. This can cause
             # inconsistency when one CF has a successful flush during auto
             # recovery. Disable the fault injection in this path for now until
             # we have a fix that allows auto recovery.
+            #
+            # 3. Pessimistic transactions use 2PC, which can't auto-recover from WAL write errors.
+            # This is because RocksDB cannot easily discard the corrupted WAL without risking the
+            # loss of uncommitted prepared data within the same WAL.
+            # Therefore disabling WAL write error injection in stress tests to prevent crashing
+            # since stress test does not support injecting errors that can' be auto-recovered.
+            #
+            # TODO(hx235): support excluding WAL from metadata write fault injection so we don't
+            # have to disable metadata write fault injection to other file
             dest_params["exclude_wal_from_write_fault_injection"] = 1
             dest_params["metadata_write_fault_one_in"] = 0
-    # Enabling block_align with compression is not supported
-    if dest_params.get("block_align") == 1:
-        dest_params["compression_type"] = "none"
-        dest_params["bottommost_compression_type"] = "none"
+
+            # TODO Fix - Remote worker shouldn't recover from WAL
+            dest_params["remote_compaction_worker_threads"] = 0
+    # Disabling block align if mixed manager is being used
+    if dest_params.get("compression_manager") == "custom":
+        if dest_params.get("block_align") == 1:
+            dest_params["block_align"] = 0
+        if dest_params["format_version"] < 7:
+            dest_params["format_version"] = 7
+    elif (
+        dest_params.get("compression_manager") == "mixed"
+        or dest_params.get("compression_manager") == "randommixed"
+    ):
+        dest_params["block_align"] = 0
+    elif dest_params.get("compression_manager") == "autoskip":
+        # ensuring the compression is being used
+        if dest_params.get("compression_type") == "none":
+            dest_params["compression_type"] = random.choice(
+                ["snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]
+            )
+        if dest_params.get("bottommost_compression_type") == "none":
+            dest_params["bottommost_compression_type"] = random.choice(
+                ["snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]
+            )
+        dest_params["block_align"] = 0
+    else:
+        # Enabling block_align with compression is not supported
+        if dest_params.get("block_align") == 1:
+            dest_params["compression_type"] = "none"
+            dest_params["bottommost_compression_type"] = "none"
     # If periodic_compaction_seconds is not set, daily_offpeak_time_utc doesn't do anything
     if dest_params.get("periodic_compaction_seconds") == 0:
         dest_params["daily_offpeak_time_utc"] = ""
@@ -995,7 +1199,7 @@ def finalize_and_sanitize(src_params):
     if dest_params.get("use_put_entity_one_in") == 1:
         dest_params["use_timed_put_one_in"] = 0
     elif (
-        dest_params.get("use_put_entity_one_in") > 1
+        dest_params.get("use_put_entity_one_in", 0) > 1
         and dest_params.get("use_timed_put_one_in") == 1
     ):
         dest_params["use_timed_put_one_in"] = 3
@@ -1020,16 +1224,45 @@ def finalize_and_sanitize(src_params):
     ):
         dest_params["enable_blob_files"] = 0
         dest_params["allow_setting_blob_options_dynamically"] = 0
-        dest_params["atomic_flush"] = 0
         dest_params["allow_concurrent_memtable_write"] = 0
         dest_params["use_put_entity_one_in"] = 0
         dest_params["use_get_entity"] = 0
         dest_params["use_multi_get_entity"] = 0
         dest_params["enable_pipelined_write"] = 0
         dest_params["use_attribute_group"] = 0
+    if (
+        dest_params.get("enable_pipelined_write", 0)
+        or dest_params.get("unordered_write", 0)
+        or dest_params.get("disable_wal", 0) == 0
+        or dest_params.get("user_timestamp_size", 0)
+    ):
+        dest_params["ingest_wbwi_one_in"] = 0
     # Continuous verification fails with secondaries inside NonBatchedOpsStressTest
     if dest_params.get("test_secondary") == 1:
         dest_params["continuous_verification_interval"] = 0
+    if dest_params.get("use_multiscan") == 1:
+        dest_params["async_io"] = 0
+        dest_params["delpercent"] += dest_params["delrangepercent"]
+        dest_params["delrangepercent"] = 0
+        dest_params["prefix_size"] = -1
+        dest_params["iterpercent"] += dest_params["prefixpercent"]
+        dest_params["prefixpercent"] = 0
+        dest_params["read_fault_one_in"] = 0
+        dest_params["memtable_prefix_bloom_size_ratio"] = 0
+        dest_params["max_sequential_skip_in_iterations"] = sys.maxsize
+        # This option ingests a delete range that might partially overlap with
+        # existing key range, which will cause a reseek that's currently not
+        # supported by multiscan
+        dest_params["test_ingest_standalone_range_deletion_one_in"] = 0
+        # LevelIterator multiscan currently relies on num_entries and num_range_deletions,
+        # which are not updated if skip_stats_update_on_db_open is true
+        dest_params["skip_stats_update_on_db_open"] = 0
+
+    # inplace update and key checksum verification during seek would cause race condition
+    # Therefore, when inplace_update_support is enabled, disable memtable_veirfy_per_key_checksum_on_seek
+    if dest_params["inplace_update_support"] == 1:
+        dest_params["memtable_veirfy_per_key_checksum_on_seek"] = 0
+
     return dest_params
 
 
@@ -1058,11 +1291,7 @@ def gen_cmd_params(args):
     if args.enable_ts:
         params.update(ts_params)
     if args.test_multiops_txn:
-        params.update(multiops_txn_default_params)
-        if args.write_policy == "write_committed":
-            params.update(multiops_wc_txn_params)
-        elif args.write_policy == "write_prepared":
-            params.update(multiops_wp_txn_params)
+        params.update(multiops_txn_params)
     if args.test_tiered_storage:
         params.update(tiered_params)
 
@@ -1111,11 +1340,9 @@ def gen_cmd(params, unknown_params):
                 "test_best_efforts_recovery",
                 "enable_ts",
                 "test_multiops_txn",
-                "write_policy",
                 "stress_cmd",
                 "test_tiered_storage",
                 "cleanup_cmd",
-                "skip_tmpdir_check",
                 "print_stderr_separately",
                 "verify_timeout",
             }
@@ -1128,7 +1355,10 @@ def gen_cmd(params, unknown_params):
 
 def execute_cmd(cmd, timeout=None, timeout_pstack=False):
     child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
-    print("Running db_stress with pid=%d: %s\n\n" % (child.pid, " ".join(cmd)))
+    print(
+        "Running db_stress with pid=%d: %s\n\n"
+        % (child.pid, " ".join(quote_arg_for_display(arg) for arg in cmd))
+    )
     pid = child.pid
 
     try:
@@ -1160,12 +1390,18 @@ def print_output_and_exit_on_error(stdout, stderr, print_stderr_separately=False
 
 
 def cleanup_after_success(dbname):
-    shutil.rmtree(dbname, True)
-    if cleanup_cmd is not None:
-        print("Running DB cleanup command - %s\n" % cleanup_cmd)
-        ret = os.system(cleanup_cmd)
-        if ret != 0:
-            print("WARNING: DB cleanup returned error %d\n" % ret)
+    # Use db_stress --destroy_db_and_exit, which simplifies remote DB cleanup
+    cleanup_cmd_parts = [stress_cmd, "--destroy_db_and_exit=1", "--db=" + dbname]
+    # Pass through relevant arguments for remote DB access
+    for arg in remain_args:
+        parts = arg.split("=", 1)
+        if parts[0] in ["--env_uri", "--fs_uri"]:
+            cleanup_cmd_parts.append(arg)
+    print("Running DB cleanup command - %s\n" % " ".join(cleanup_cmd_parts))
+    ret = subprocess.call(cleanup_cmd_parts)
+    if ret != 0:
+        print("ERROR: DB cleanup returned error %d\n" % ret)
+        sys.exit(2)
 
 
 # This script runs and kills db_stress multiple times. It checks consistency
@@ -1186,12 +1422,17 @@ def blackbox_crash_main(args, unknown_args):
     )
 
     while time.time() < exit_time:
+        apply_random_seed_per_iteration()
         cmd = gen_cmd(
             dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args
         )
 
         hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["interval"])
 
+        # Reset destroy_db_initially after each run (it may have been set by
+        # command line for first run only)
+        cmd_params["destroy_db_initially"] = 0
+
         if not hit_timeout:
             print("Exit Before Killing")
             print_output_and_exit_on_error(outs, errs, args.print_stderr_separately)
@@ -1248,6 +1489,7 @@ def whitebox_crash_main(args, unknown_args):
     succeeded = True
     hit_timeout = False
     while time.time() < exit_time:
+        apply_random_seed_per_iteration()
         if check_mode == 0:
             additional_opts = {
                 # use large ops per thread since we will kill it anyway
@@ -1333,7 +1575,7 @@ def whitebox_crash_main(args, unknown_args):
                 "`compaction_style` is changed in current run so `destroy_db_initially` is set to 1 as a short-term solution to avoid cycling through previous db of different compaction style."
                 + "\n"
             )
-            additional_opts["destroy_db_initially"] = 1
+            cmd_params["destroy_db_initially"] = 1
         prev_compaction_style = cur_compaction_style
 
         cmd = gen_cmd(
@@ -1358,6 +1600,11 @@ def whitebox_crash_main(args, unknown_args):
         hit_timeout, retncode, stdoutdata, stderrdata = execute_cmd(
             cmd, exit_time - time.time() + 900
         )
+
+        # Reset destroy_db_initially after each run (it may have been set by
+        # command line for first run, or set for various reasons for a step)
+        cmd_params["destroy_db_initially"] = 0
+
         msg = "check_mode={}, kill option={}, exitcode={}\n".format(
             check_mode, additional_opts["kill_random_test"], retncode
         )
@@ -1387,15 +1634,11 @@ def whitebox_crash_main(args, unknown_args):
         # First half of the duration, keep doing kill test. For the next half,
         # try different modes.
         if time.time() > half_time:
-            cleanup_after_success(dbname)
-            try:
-                os.mkdir(dbname)
-            except OSError:
-                pass
+            # Set next iteration to destroy DB (works for remote DB)
+            cmd_params["destroy_db_initially"] = 1
             if expected_values_dir is not None:
                 shutil.rmtree(expected_values_dir, True)
                 os.mkdir(expected_values_dir)
-
             check_mode = (check_mode + 1) % total_check_mode
 
         time.sleep(1)  # time to stabilize after a kill
@@ -1408,7 +1651,6 @@ def whitebox_crash_main(args, unknown_args):
 
 def main():
     global stress_cmd
-    global cleanup_cmd
 
     parser = argparse.ArgumentParser(
         description="This script runs and kills \
@@ -1422,11 +1664,9 @@ def main():
     parser.add_argument("--test_best_efforts_recovery", action="store_true")
     parser.add_argument("--enable_ts", action="store_true")
     parser.add_argument("--test_multiops_txn", action="store_true")
-    parser.add_argument("--write_policy", choices=["write_committed", "write_prepared"])
     parser.add_argument("--stress_cmd")
     parser.add_argument("--test_tiered_storage", action="store_true")
-    parser.add_argument("--cleanup_cmd")
-    parser.add_argument("--skip_tmpdir_check", action="store_true")
+    parser.add_argument("--cleanup_cmd")  # ignore old option for now
     parser.add_argument("--print_stderr_separately", action="store_true", default=False)
 
     all_params = dict(
@@ -1438,9 +1678,7 @@ def main():
         + list(whitebox_simple_default_params.items())
         + list(blob_params.items())
         + list(ts_params.items())
-        + list(multiops_txn_default_params.items())
-        + list(multiops_wc_txn_params.items())
-        + list(multiops_wp_txn_params.items())
+        + list(multiops_txn_params.items())
         + list(best_efforts_recovery_params.items())
         + list(cf_consistency_params.items())
         + list(tiered_params.items())
@@ -1451,10 +1689,10 @@ def main():
     for k, v in all_params.items():
         parser.add_argument("--" + k, type=type(v() if callable(v) else v))
     # unknown_args are passed directly to db_stress
-    args, unknown_args = parser.parse_known_args()
 
+    args, unknown_args = parser.parse_known_args(remain_args)
     test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
-    if test_tmpdir is not None and not args.skip_tmpdir_check:
+    if test_tmpdir is not None and not is_remote_db:
         isdir = False
         try:
             isdir = os.path.isdir(test_tmpdir)
@@ -1469,8 +1707,6 @@ def main():
 
     if args.stress_cmd:
         stress_cmd = args.stress_cmd
-    if args.cleanup_cmd:
-        cleanup_cmd = args.cleanup_cmd
     if args.test_type == "blackbox":
         blackbox_crash_main(args, unknown_args)
     if args.test_type == "whitebox":
diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc
index 78cccaa038d4..8db16a1d76ec 100644
--- a/tools/db_repl_stress.cc
+++ b/tools/db_repl_stress.cc
@@ -83,7 +83,7 @@ int main(int argc, const char** argv) {
   options.create_if_missing = true;
   options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
   options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
-  DB* db;
+  std::unique_ptr<DB> db;
   DestroyDB(default_db_path, options);
 
   Status s = DB::Open(options, default_db_path, &db);
@@ -94,7 +94,7 @@ int main(int argc, const char** argv) {
   }
 
   DataPumpThread dataPump;
-  dataPump.db = db;
+  dataPump.db = db.get();
   env->StartThread(DataPumpThreadBody, &dataPump);
 
   std::unique_ptr<TransactionLogIterator> iter;
diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc
index dd4fd59bc4ce..76a93e5bbfdd 100644
--- a/tools/db_sanity_test.cc
+++ b/tools/db_sanity_test.cc
@@ -41,9 +41,8 @@ class SanityTest {
     if (!s.ok()) {
       return s;
     }
-    DB* db = nullptr;
+    std::unique_ptr<DB> db;
     s = DB::Open(options, dbname, &db);
-    std::unique_ptr<DB> db_guard(db);
     if (!s.ok()) {
       return s;
     }
@@ -58,10 +57,9 @@ class SanityTest {
     return db->Flush(FlushOptions());
   }
   Status Verify() {
-    DB* db = nullptr;
+    std::unique_ptr<DB> db;
     std::string dbname = path_ + Name();
     Status s = DB::Open(GetOptions(), dbname, &db);
-    std::unique_ptr<DB> db_guard(db);
     if (!s.ok()) {
       return s;
     }
diff --git a/tools/dump/db_dump_tool.cc b/tools/dump/db_dump_tool.cc
index d03230308f31..520c276915db 100644
--- a/tools/dump/db_dump_tool.cc
+++ b/tools/dump/db_dump_tool.cc
@@ -16,7 +16,6 @@ namespace ROCKSDB_NAMESPACE {
 
 bool DbDumpTool::Run(const DumpOptions& dump_options,
                      ROCKSDB_NAMESPACE::Options options) {
-  ROCKSDB_NAMESPACE::DB* dbptr;
   ROCKSDB_NAMESPACE::Status status;
   std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> dumpfile;
   char hostname[1024];
@@ -31,16 +30,15 @@ bool DbDumpTool::Run(const DumpOptions& dump_options,
 
   // Open the database
   options.create_if_missing = false;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   status = ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, dump_options.db_path,
-                                                  &dbptr);
+                                                  &db);
   if (!status.ok()) {
     std::cerr << "Unable to open database '" << dump_options.db_path
               << "' for reading: " << status.ToString() << std::endl;
     return false;
   }
 
-  const std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(dbptr);
-
   status = env->NewWritableFile(dump_options.dump_location, &dumpfile,
                                 ROCKSDB_NAMESPACE::EnvOptions());
   if (!status.ok()) {
@@ -131,7 +129,6 @@ bool DbDumpTool::Run(const DumpOptions& dump_options,
 
 bool DbUndumpTool::Run(const UndumpOptions& undump_options,
                        ROCKSDB_NAMESPACE::Options options) {
-  ROCKSDB_NAMESPACE::DB* dbptr;
   ROCKSDB_NAMESPACE::Status status;
   ROCKSDB_NAMESPACE::Env* env;
   std::unique_ptr<ROCKSDB_NAMESPACE::SequentialFile> dumpfile;
@@ -180,15 +177,14 @@ bool DbUndumpTool::Run(const UndumpOptions& undump_options,
   }
 
   options.create_if_missing = true;
-  status = ROCKSDB_NAMESPACE::DB::Open(options, undump_options.db_path, &dbptr);
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
+  status = ROCKSDB_NAMESPACE::DB::Open(options, undump_options.db_path, &db);
   if (!status.ok()) {
     std::cerr << "Unable to open database '" << undump_options.db_path
               << "' for writing: " << status.ToString() << std::endl;
     return false;
   }
 
-  const std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(dbptr);
-
   uint32_t last_keysize = 64;
   size_t last_valsize = 1 << 20;
   std::unique_ptr<char[]> keyscratch(new char[last_keysize]);
diff --git a/tools/dump/rocksdb_undump.cc b/tools/dump/rocksdb_undump.cc
index e437b3fe8a43..9b922a8233dd 100644
--- a/tools/dump/rocksdb_undump.cc
+++ b/tools/dump/rocksdb_undump.cc
@@ -25,7 +25,7 @@ DEFINE_bool(compact, false, "Compact the db after loading the dumped file");
 DEFINE_string(db_options, "",
               "Options string used to open the database that will be loaded");
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
 
   if (FLAGS_db_path == "" || FLAGS_dump_location == "") {
diff --git a/tools/generate_random_db.sh b/tools/generate_random_db.sh
index a05c1f5a2133..e668d56a2393 100755
--- a/tools/generate_random_db.sh
+++ b/tools/generate_random_db.sh
@@ -29,6 +29,12 @@ if ./ldb --version 2>/dev/null >/dev/null; then
   rm -rf $db_dir
 fi
 
+# Check if deleterange command is supported by grepping ldb --help
+deleterange_support=
+if ./ldb --help 2>&1 | grep -q deleterange; then
+  deleterange_support=1
+fi
+
 echo == Loading data from $input_data_dir to $db_dir
 
 declare -a compression_opts=("no" "snappy" "zlib" "bzip2")
@@ -65,5 +71,33 @@ do
   fi
   ./ldb load --db=$db_dir --compression_type=$c $d_arg --bloom_bits=10 \
     --auto_compaction=false --create_if_missing < $input_data_dir/$f
+
+  # Use md5sum of file to deterministically decide whether to add a range
+  # tombstone (approximately 1/4 of files) and which key to delete
+  file_path=$input_data_dir/$f
+  hash=$(md5sum "$file_path" | cut -c1-8)
+  hash_int=$((16#$hash))
+
+  if [ $((hash_int % 4)) -eq 0 ]; then
+    # Pick a key from this file based on the hash
+    line_count=$(wc -l < "$file_path")
+    if [ "$line_count" -gt 0 ]; then
+      line_num=$((hash_int % line_count + 1))
+      key=$(sed -n "${line_num}p" "$file_path" | cut -d' ' -f1)
+      if [ -n "$key" ]; then
+        # Create end key by appending a character to make a small range
+        end_key="${key}0"
+        if [ "$deleterange_support" == "1" ]; then
+          echo "== Deleting range [$key, $end_key) from $f"
+          ./ldb deleterange --db=$db_dir "$key" "$end_key"
+        else
+          # Fall back to point delete for equivalent logical contents
+          echo "== Deleting key $key from $f"
+          ./ldb delete --db=$db_dir "$key"
+        fi
+      fi
+    fi
+  fi
+
   let "n = n + 1"
 done
diff --git a/tools/io_tracer_parser_test.cc b/tools/io_tracer_parser_test.cc
index 8e1fb72df394..8f7cb3a5d0cb 100644
--- a/tools/io_tracer_parser_test.cc
+++ b/tools/io_tracer_parser_test.cc
@@ -50,8 +50,7 @@ class IOTracerParserTest : public testing::Test {
     if (db_ != nullptr) {
       Options options;
       options.env = env_;
-      delete db_;
-      db_ = nullptr;
+      db_.reset();
       EXPECT_OK(DestroyDB(dbname_, options));
     }
     EXPECT_OK(env_->DeleteDir(test_path_));
@@ -97,7 +96,7 @@ class IOTracerParserTest : public testing::Test {
     ASSERT_EQ(0, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv));
   }
 
-  DB* db_;
+  std::unique_ptr<DB> db_;
   Env* env_;
   EnvOptions env_options_;
   std::string trace_file_path_;
diff --git a/tools/ldb.cc b/tools/ldb.cc
index 52533e6b0f6e..5ef91df1b209 100644
--- a/tools/ldb.cc
+++ b/tools/ldb.cc
@@ -8,6 +8,5 @@
 
 int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::LDBTool tool;
-  tool.Run(argc, argv);
-  return 0;
+  return tool.RunAndReturn(argc, argv);
 }
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 16a47ab5b0ac..eb3fb66d36bc 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -25,6 +25,7 @@
 #include "db/wide/wide_column_serialization.h"
 #include "db/wide/wide_columns_helper.h"
 #include "db/write_batch_internal.h"
+#include "db_stress_tool/db_stress_compression_manager.h"
 #include "file/filename.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/comparator.h"
@@ -39,12 +40,12 @@
 #include "rocksdb/utilities/options_util.h"
 #include "rocksdb/write_batch.h"
 #include "rocksdb/write_buffer_manager.h"
-#include "table/block_based/block_based_table_builder.h"
 #include "table/sst_file_dumper.h"
 #include "tools/ldb_cmd_impl.h"
 #include "util/cast_util.h"
 #include "util/coding.h"
 #include "util/file_checksum_helper.h"
+#include "util/simple_mixed_compressor.h"
 #include "util/stderr_logger.h"
 #include "util/string_util.h"
 #include "util/write_batch_util.h"
@@ -70,6 +71,8 @@ const std::string LDBCommand::ARG_CF_NAME = "column_family";
 const std::string LDBCommand::ARG_TTL = "ttl";
 const std::string LDBCommand::ARG_TTL_START = "start_time";
 const std::string LDBCommand::ARG_TTL_END = "end_time";
+const std::string LDBCommand::ARG_USE_TXN = "use_txn";
+const std::string LDBCommand::ARG_TXN_WRITE_POLICY = "txn_write_policy";
 const std::string LDBCommand::ARG_TIMESTAMP = "timestamp";
 const std::string LDBCommand::ARG_TRY_LOAD_OPTIONS = "try_load_options";
 const std::string LDBCommand::ARG_DISABLE_CONSISTENCY_CHECKS =
@@ -109,8 +112,6 @@ const std::string LDBCommand::ARG_BLOB_FILE_STARTING_LEVEL =
 const std::string LDBCommand::ARG_PREPOPULATE_BLOB_CACHE =
     "prepopulate_blob_cache";
 const std::string LDBCommand::ARG_DECODE_BLOB_INDEX = "decode_blob_index";
-const std::string LDBCommand::ARG_DUMP_UNCOMPRESSED_BLOBS =
-    "dump_uncompressed_blobs";
 const std::string LDBCommand::ARG_READ_TIMESTAMP = "read_timestamp";
 const std::string LDBCommand::ARG_GET_WRITE_UNIX_TIME = "get_write_unix_time";
 
@@ -200,7 +201,7 @@ void DumpSstFile(Options options, std::string filename, bool output_hex,
                  std::string from_key = "", std::string to_key = "");
 
 void DumpBlobFile(const std::string& filename, bool is_key_hex,
-                  bool is_value_hex, bool dump_uncompressed_blobs);
+                  bool is_value_hex);
 
 Status EncodeUserProvidedTimestamp(const std::string& user_timestamp,
                                    std::string* ts_buf);
@@ -426,6 +427,10 @@ LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) {
     return new UpdateManifestCommand(parsed_params.cmd_params,
                                      parsed_params.option_map,
                                      parsed_params.flags);
+  } else if (parsed_params.cmd == CompactionProgressDumpCommand::Name()) {
+    return new CompactionProgressDumpCommand(parsed_params.cmd_params,
+                                             parsed_params.option_map,
+                                             parsed_params.flags);
   }
   return nullptr;
 }
@@ -476,10 +481,13 @@ LDBCommand::LDBCommand(const std::map<std::string, std::string>& options,
                        const std::vector<std::string>& valid_cmd_line_options)
     : db_(nullptr),
       db_ttl_(nullptr),
+      db_txn_(nullptr),
       is_read_only_(is_read_only),
       is_key_hex_(false),
       is_value_hex_(false),
       is_db_ttl_(false),
+      is_db_txn_(false),
+      txn_write_policy_(0),
       timestamp_(false),
       try_load_options_(false),
       create_if_missing_(false),
@@ -523,6 +531,21 @@ LDBCommand::LDBCommand(const std::map<std::string, std::string>& options,
   is_key_hex_ = IsKeyHex(options, flags);
   is_value_hex_ = IsValueHex(options, flags);
   is_db_ttl_ = IsFlagPresent(flags, ARG_TTL);
+  is_db_txn_ = IsFlagPresent(flags, ARG_USE_TXN);
+  itr = options.find(ARG_TXN_WRITE_POLICY);
+  if (itr != options.end()) {
+    try {
+      txn_write_policy_ = std::stoi(itr->second);
+      if (txn_write_policy_ < 0 || txn_write_policy_ > 2) {
+        fprintf(stderr, "Invalid txn_write_policy: %d. Must be 0, 1, or 2.\n",
+                txn_write_policy_);
+        txn_write_policy_ = 0;
+      }
+    } catch (const std::exception&) {
+      fprintf(stderr, "Invalid txn_write_policy value: %s\n",
+              itr->second.c_str());
+    }
+  }
   timestamp_ = IsFlagPresent(flags, ARG_TIMESTAMP);
   try_load_options_ = IsTryLoadOptions(options, flags);
   force_consistency_checks_ =
@@ -546,7 +569,34 @@ void LDBCommand::OpenDB() {
   // Open the DB.
   Status st;
   std::vector<ColumnFamilyHandle*> handles_opened;
-  if (is_db_ttl_) {
+  if (is_db_txn_) {
+    // TransactionDB mode
+    if (is_db_ttl_) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Cannot use both --ttl and --use_txn flags together");
+      return;
+    }
+    if (!secondary_path_.empty() || !leader_path_.empty()) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "TransactionDB does not support secondary or follower mode");
+      return;
+    }
+    if (is_read_only_) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "TransactionDB does not support read-only mode");
+      return;
+    }
+    TransactionDBOptions txn_db_options;
+    txn_db_options.write_policy =
+        static_cast<TxnDBWritePolicy>(txn_write_policy_);
+    if (column_families_.empty()) {
+      st = TransactionDB::Open(options_, txn_db_options, db_path_, &db_txn_);
+    } else {
+      st = TransactionDB::Open(options_, txn_db_options, db_path_,
+                               column_families_, &handles_opened, &db_txn_);
+    }
+    db_.reset(db_txn_);
+  } else if (is_db_ttl_) {
     // ldb doesn't yet support TTL DB with multiple column families
     if (!column_family_name_.empty() || !column_families_.empty()) {
       exec_state_ = LDBCommandExecuteResult::Failed(
@@ -561,7 +611,7 @@ void LDBCommand::OpenDB() {
     } else {
       st = DBWithTTL::Open(options_, db_path_, &db_ttl_);
     }
-    db_ = db_ttl_;
+    db_.reset(db_ttl_);
   } else {
     if (!secondary_path_.empty() && !leader_path_.empty()) {
       exec_state_ = LDBCommandExecuteResult::Failed(
@@ -581,9 +631,7 @@ void LDBCommand::OpenDB() {
         } else if (!secondary_path_.empty()) {
           st = DB::OpenAsSecondary(options_, db_path_, secondary_path_, &db_);
         } else {
-          std::unique_ptr<DB> dbptr;
-          st = DB::OpenAsFollower(options_, db_path_, leader_path_, &dbptr);
-          db_ = dbptr.release();
+          st = DB::OpenAsFollower(options_, db_path_, leader_path_, &db_);
         }
       } else {
         if (secondary_path_.empty() && leader_path_.empty()) {
@@ -593,10 +641,8 @@ void LDBCommand::OpenDB() {
           st = DB::OpenAsSecondary(options_, db_path_, secondary_path_,
                                    column_families_, &handles_opened, &db_);
         } else {
-          std::unique_ptr<DB> dbptr;
           st = DB::OpenAsFollower(options_, db_path_, leader_path_,
-                                  column_families_, &handles_opened, &dbptr);
-          db_ = dbptr.release();
+                                  column_families_, &handles_opened, &db_);
         }
       }
     }
@@ -641,8 +687,9 @@ void LDBCommand::CloseDB() {
     }
     Status s = db_->Close();
     s.PermitUncheckedError();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
+    db_ttl_ = nullptr;
+    db_txn_ = nullptr;
   }
 }
 
@@ -687,7 +734,9 @@ std::vector<std::string> LDBCommand::BuildCmdLineOptions(
                                   ARG_BLOB_FILE_STARTING_LEVEL,
                                   ARG_PREPOPULATE_BLOB_CACHE,
                                   ARG_IGNORE_UNKNOWN_OPTIONS,
-                                  ARG_CF_NAME};
+                                  ARG_CF_NAME,
+                                  ARG_USE_TXN,
+                                  ARG_TXN_WRITE_POLICY};
   ret.insert(ret.end(), options.begin(), options.end());
   return ret;
 }
@@ -868,13 +917,21 @@ bool LDBCommand::ParseCompressionTypeOption(
             "No compressions are supported in this build for \"mixed\".");
         return false;
       }
-      // A temporary hack to generate an SST file with a mix of compression
-      // types, as this has been *de facto* supported for a long time on the
-      // read side with no code to generate them on the write side. We can test
-      // that functionality, e.g. in check_format_compatible.sh, with this hack
-      g_hack_mixed_compression_in_block_based_table.StoreRelaxed(1);
-      // Need to list zstd in compression_name table property if it's
-      // potentially in the mix, for proper handling of context and dictionary.
+      options_.compression = kZSTD;
+      options_.bottommost_compression = kZSTD;
+      auto mgr =
+          std::make_shared<RoundRobinManager>(GetBuiltinV2CompressionManager());
+      options_.compression_manager = mgr;
+
+      // Need to list zstd in the compression_name table property if it's
+      // potentially used by being in the mix (i.e., potentially at least one
+      // data block in the table is compressed by zstd). This ensures proper
+      // context and dictionary handling, and prevents crashes in older RocksDB
+      // versions.
+      //
+      // To achieve this, set `value` (the compression_type in Options which
+      // will be used to set compression_name table property) to kZSTD, even if
+      // multiple compression types are used within a single table.
       value = ZSTD_Supported() ? kZSTD : GetSupportedCompressions()[0];
       return true;
 #endif  // !NDEBUG
@@ -1141,6 +1198,7 @@ void LDBCommand::OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) {
 // Second, overrides the options according to the CLI arguments and the
 // specific subcommand being run.
 void LDBCommand::PrepareOptions() {
+  DbStressCustomCompressionManager::Register();
   std::vector<ColumnFamilyDescriptor> column_families_from_options;
 
   if (!create_if_missing_ && try_load_options_) {
@@ -1421,7 +1479,7 @@ CompactorCommand::CompactorCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_TTL})),
       null_from_(true),
@@ -1496,7 +1554,7 @@ DBLoaderCommand::DBLoaderCommand(
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
     : LDBCommand(
-          options, flags, false,
+          options, flags, false /* is_read_only */,
           BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM,
                                ARG_TO, ARG_CREATE_IF_MISSING, ARG_DISABLE_WAL,
                                ARG_BULK_LOAD, ARG_COMPACT})),
@@ -1596,11 +1654,12 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex,
   WriteController wc(options.delayed_write_rate);
   WriteBufferManager wb(options.db_write_buffer_size);
   ImmutableDBOptions immutable_db_options(options);
-  VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc,
+  VersionSet versions(dbname, &immutable_db_options, MutableDBOptions{}, sopt,
+                      tc.get(), &wb, &wc,
                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                       /*db_id=*/"", /*db_session_id=*/"",
                       options.daily_offpeak_time_utc,
-                      /*error_handler=*/nullptr, /*read_only=*/true);
+                      /*error_handler=*/nullptr, /*unchanging=*/true);
   Status s = versions.DumpManifest(options, file, verbose, hex, json, cf_descs);
   if (!s.ok()) {
     fprintf(stderr, "Error in processing file %s %s\n", file.c_str(),
@@ -1608,6 +1667,57 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex,
   }
 }
 
+void DumpCompactionProgressFile(const std::string& file_path) {
+  Status s;
+  std::unique_ptr<SequentialFileReader> file_reader;
+
+  std::unique_ptr<FSSequentialFile> file;
+  const std::shared_ptr<FileSystem>& fs = Env::Default()->GetFileSystem();
+  s = fs->NewSequentialFile(file_path, FileOptions(), &file, nullptr);
+  if (!s.ok()) {
+    fprintf(stderr, "Failed to open compaction progress file %s: %s\n",
+            file_path.c_str(), s.ToString().c_str());
+    return;
+  }
+
+  file_reader = std::make_unique<SequentialFileReader>(std::move(file),
+                                                       file_path, 0, nullptr);
+
+  log::Reader reader(nullptr, std::move(file_reader), nullptr,
+                     true /* checksum */, 0);
+
+  Slice record;
+  std::string scratch;
+  int count = 0;
+
+  fprintf(stdout, "Compaction Progress File: %s\n", file_path.c_str());
+  fprintf(stdout, "============================================\n");
+
+  while (reader.ReadRecord(&record, &scratch)) {
+    VersionEdit edit;
+    s = edit.DecodeFrom(record);
+    if (!s.ok()) {
+      fprintf(stderr, "Failed to decode VersionEdit: %s\n",
+              s.ToString().c_str());
+      continue;
+    }
+
+    if (edit.HasSubcompactionProgress()) {
+      fprintf(stdout, "Progress Record %d:\n", count);
+      const auto& progress = edit.GetSubcompactionProgress();
+      fprintf(stdout, "%s\n", progress.ToString().c_str());
+      ++count;
+    }
+  }
+
+  if (count == 0) {
+    fprintf(stdout,
+            "No valid records found in the compaction progress file.\n");
+  } else {
+    fprintf(stdout, "\nTotal records: %d\n", count);
+  }
+}
+
 }  // namespace
 
 const std::string ManifestDumpCommand::ARG_VERBOSE = "verbose";
@@ -1628,7 +1738,7 @@ ManifestDumpCommand::ManifestDumpCommand(
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
     : LDBCommand(
-          options, flags, false,
+          options, flags, true /* is_read_only */,
           BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX, ARG_JSON})),
       verbose_(false),
       json_(false) {
@@ -1740,11 +1850,12 @@ Status GetLiveFilesChecksumInfoFromVersionSet(Options options,
   WriteController wc(options.delayed_write_rate);
   WriteBufferManager wb(options.db_write_buffer_size);
   ImmutableDBOptions immutable_db_options(options);
-  VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc,
+  VersionSet versions(dbname, &immutable_db_options, MutableDBOptions{options},
+                      sopt, tc.get(), &wb, &wc,
                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                       /*db_id=*/"", /*db_session_id=*/"",
                       options.daily_offpeak_time_utc,
-                      /*error_handler=*/nullptr, /*read_only=*/true);
+                      /*error_handler=*/nullptr, /*unchanging=*/true);
   std::vector<std::string> cf_name_list;
   s = versions.ListColumnFamilies(&cf_name_list, db_path,
                                   immutable_db_options.fs.get());
@@ -1776,7 +1887,7 @@ FileChecksumDumpCommand::FileChecksumDumpCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_PATH, ARG_HEX})) {
   auto itr = options.find(ARG_PATH);
   if (itr != options.end()) {
@@ -1840,7 +1951,8 @@ GetPropertyCommand::GetPropertyCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {
+    : LDBCommand(options, flags, true /* is_read_only */,
+                 BuildCmdLineOptions({})) {
   if (params.size() != 1) {
     exec_state_ =
         LDBCommandExecuteResult::Failed("property name must be specified");
@@ -1891,7 +2003,8 @@ ListColumnFamiliesCommand::ListColumnFamiliesCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {}
+    : LDBCommand(options, flags, true /* is_read_only */,
+                 BuildCmdLineOptions({})) {}
 
 void ListColumnFamiliesCommand::DoCommand() {
   PrepareOptions();
@@ -1925,7 +2038,7 @@ CreateColumnFamilyCommand::CreateColumnFamilyCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true, {ARG_DB}) {
+    : LDBCommand(options, flags, false /* is_read_only */, {ARG_DB}) {
   if (params.size() != 1) {
     exec_state_ = LDBCommandExecuteResult::Failed(
         "new column family name must be specified");
@@ -1962,7 +2075,7 @@ DropColumnFamilyCommand::DropColumnFamilyCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true, {ARG_DB}) {
+    : LDBCommand(options, flags, false /* is_read_only */, {ARG_DB}) {
   if (params.size() != 1) {
     exec_state_ = LDBCommandExecuteResult::Failed(
         "The name of column family to drop must be specified");
@@ -2038,7 +2151,7 @@ InternalDumpCommand::InternalDumpCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions(
                      {ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO,
                       ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS,
@@ -2111,8 +2224,9 @@ void InternalDumpCommand::DoCommand() {
 
   // Cast as DBImpl to get internal iterator
   std::vector<KeyVersion> key_versions;
-  Status st = GetAllKeyVersions(db_, GetCfHandle(), from_, to_, max_keys_,
-                                &key_versions);
+  Status st = GetAllKeyVersions(
+      db_.get(), GetCfHandle(), has_from_ ? from_ : OptSlice{},
+      has_to_ ? to_ : OptSlice{}, max_keys_, &key_versions);
   if (!st.ok()) {
     exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
     return;
@@ -2218,13 +2332,12 @@ DBDumperCommand::DBDumperCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(
-          options, flags, true,
-          BuildCmdLineOptions(
-              {ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO,
-               ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS,
-               ARG_TTL_START, ARG_TTL_END, ARG_TTL_BUCKET, ARG_TIMESTAMP,
-               ARG_PATH, ARG_DECODE_BLOB_INDEX, ARG_DUMP_UNCOMPRESSED_BLOBS})),
+    : LDBCommand(options, flags, true /* is_read_only */,
+                 BuildCmdLineOptions(
+                     {ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM,
+                      ARG_TO, ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM,
+                      ARG_STATS, ARG_TTL_START, ARG_TTL_END, ARG_TTL_BUCKET,
+                      ARG_TIMESTAMP, ARG_PATH, ARG_DECODE_BLOB_INDEX})),
       null_from_(true),
       null_to_(true),
       max_keys_(-1),
@@ -2272,7 +2385,6 @@ DBDumperCommand::DBDumperCommand(
   print_stats_ = IsFlagPresent(flags, ARG_STATS);
   count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY);
   decode_blob_index_ = IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX);
-  dump_uncompressed_blobs_ = IsFlagPresent(flags, ARG_DUMP_UNCOMPRESSED_BLOBS);
 
   if (is_key_hex_) {
     if (!null_from_) {
@@ -2307,7 +2419,6 @@ void DBDumperCommand::Help(std::string& ret) {
   ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]");
   ret.append(" [--" + ARG_PATH + "=<path_to_a_file>]");
   ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "]");
-  ret.append(" [--" + ARG_DUMP_UNCOMPRESSED_BLOBS + "]");
   ret.append("\n");
 }
 
@@ -2354,8 +2465,7 @@ void DBDumperCommand::DoCommand() {
                          /*  json_ */ false, column_families_);
         break;
       case kBlobFile:
-        DumpBlobFile(path_, is_key_hex_, is_value_hex_,
-                     dump_uncompressed_blobs_);
+        DumpBlobFile(path_, is_key_hex_, is_value_hex_);
         break;
       default:
         exec_state_ = LDBCommandExecuteResult::Failed(
@@ -2539,7 +2649,7 @@ ReduceDBLevelsCommand::ReduceDBLevelsCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})),
       old_levels_(1 << 7),
       new_levels_(-1),
@@ -2592,11 +2702,12 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int* levels) {
   const InternalKeyComparator cmp(opt.comparator);
   WriteController wc(opt.delayed_write_rate);
   WriteBufferManager wb(opt.db_write_buffer_size);
-  VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc,
+  VersionSet versions(db_path_, &db_options, MutableDBOptions{opt}, soptions,
+                      tc.get(), &wb, &wc,
                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                       /*db_id=*/"", /*db_session_id=*/"",
                       opt.daily_offpeak_time_utc,
-                      /*error_handler=*/nullptr, /*read_only=*/true);
+                      /*error_handler=*/nullptr, /*unchanging=*/false);
   std::vector<ColumnFamilyDescriptor> dummy;
   ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
                                           ColumnFamilyOptions(opt));
@@ -2678,7 +2789,7 @@ ChangeCompactionStyleCommand::ChangeCompactionStyleCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions(
                      {ARG_OLD_COMPACTION_STYLE, ARG_NEW_COMPACTION_STYLE})),
       old_compaction_style_(-1),
@@ -3224,7 +3335,7 @@ WALDumperCommand::WALDumperCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_WAL_FILE, ARG_DB, ARG_WRITE_COMMITTED,
                                       ARG_PRINT_HEADER, ARG_PRINT_VALUE,
                                       ARG_ONLY_PRINT_SEQNO_GAPS})),
@@ -3280,7 +3391,7 @@ void WALDumperCommand::DoCommand() {
 GetCommand::GetCommand(const std::vector<std::string>& params,
                        const std::map<std::string, std::string>& options,
                        const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_READ_TIMESTAMP})) {
   if (params.size() != 1) {
@@ -3339,7 +3450,7 @@ MultiGetCommand::MultiGetCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
                                       ARG_READ_TIMESTAMP})) {
   if (params.size() < 1) {
@@ -3414,7 +3525,7 @@ GetEntityCommand::GetEntityCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_READ_TIMESTAMP})) {
   if (params.size() != 1) {
@@ -3552,7 +3663,7 @@ ApproxSizeCommand::ApproxSizeCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions(
                      {ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO})) {
   if (options.find(ARG_FROM) != options.end()) {
@@ -3608,7 +3719,7 @@ BatchPutCommand::BatchPutCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) {
   if (params.size() < 2) {
@@ -3680,7 +3791,7 @@ ScanCommand::ScanCommand(const std::vector<std::string>& /*params*/,
                          const std::map<std::string, std::string>& options,
                          const std::vector<std::string>& flags)
     : LDBCommand(
-          options, flags, true,
+          options, flags, true /* is_read_only */,
           BuildCmdLineOptions({ARG_TTL, ARG_NO_VALUE, ARG_HEX, ARG_KEY_HEX,
                                ARG_TO, ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP,
                                ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END,
@@ -3857,7 +3968,7 @@ void ScanCommand::DoCommand() {
 DeleteCommand::DeleteCommand(const std::vector<std::string>& params,
                              const std::map<std::string, std::string>& options,
                              const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
   if (params.size() != 1) {
     exec_state_ = LDBCommandExecuteResult::Failed(
@@ -3893,7 +4004,7 @@ SingleDeleteCommand::SingleDeleteCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
   if (params.size() != 1) {
     exec_state_ = LDBCommandExecuteResult::Failed(
@@ -3929,7 +4040,7 @@ DeleteRangeCommand::DeleteRangeCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
   if (params.size() != 2) {
     exec_state_ = LDBCommandExecuteResult::Failed(
@@ -3967,7 +4078,7 @@ void DeleteRangeCommand::DoCommand() {
 PutCommand::PutCommand(const std::vector<std::string>& params,
                        const std::map<std::string, std::string>& options,
                        const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) {
   if (params.size() != 2) {
@@ -4021,7 +4132,7 @@ PutEntityCommand::PutEntityCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) {
   if (params.size() < 2) {
@@ -4103,7 +4214,7 @@ DBQuerierCommand::DBQuerierCommand(
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
     : LDBCommand(
-          options, flags, false,
+          options, flags, false /* is_read_only */,
           BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
 
 }
@@ -4339,7 +4450,8 @@ CheckConsistencyCommand::CheckConsistencyCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {}
+    : LDBCommand(options, flags, true /* is_read_only */,
+                 BuildCmdLineOptions({})) {}
 
 void CheckConsistencyCommand::Help(std::string& ret) {
   ret.append("  ");
@@ -4386,7 +4498,7 @@ void CheckPointCommand::DoCommand() {
     return;
   }
   Checkpoint* checkpoint;
-  Status status = Checkpoint::Create(db_, &checkpoint);
+  Status status = Checkpoint::Create(db_.get(), &checkpoint);
   status = checkpoint->CreateCheckpoint(checkpoint_dir_);
   if (status.ok()) {
     fprintf(stdout, "OK\n");
@@ -4402,7 +4514,8 @@ const std::string RepairCommand::ARG_VERBOSE = "verbose";
 RepairCommand::RepairCommand(const std::vector<std::string>& /*params*/,
                              const std::map<std::string, std::string>& options,
                              const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_VERBOSE})) {
+    : LDBCommand(options, flags, false /* is_read_only */,
+                 BuildCmdLineOptions({ARG_VERBOSE})) {
   verbose_ = IsFlagPresent(flags, ARG_VERBOSE);
 }
 
@@ -4540,7 +4653,7 @@ void BackupCommand::DoCommand() {
     exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
     return;
   }
-  status = backup_engine->CreateNewBackup(db_);
+  status = backup_engine->CreateNewBackup(db_.get());
   if (status.ok()) {
     fprintf(stdout, "create new backup OK\n");
   } else {
@@ -4645,22 +4758,16 @@ void DumpSstFile(Options options, std::string filename, bool output_hex,
 }
 
 void DumpBlobFile(const std::string& filename, bool is_key_hex,
-                  bool is_value_hex, bool dump_uncompressed_blobs) {
+                  bool is_value_hex) {
   using ROCKSDB_NAMESPACE::blob_db::BlobDumpTool;
   BlobDumpTool tool;
-  BlobDumpTool::DisplayType blob_type = is_value_hex
+  BlobDumpTool::DisplayType show_blob = is_value_hex
                                             ? BlobDumpTool::DisplayType::kHex
                                             : BlobDumpTool::DisplayType::kRaw;
-  BlobDumpTool::DisplayType show_uncompressed_blob =
-      dump_uncompressed_blobs ? blob_type : BlobDumpTool::DisplayType::kNone;
-  BlobDumpTool::DisplayType show_blob =
-      dump_uncompressed_blobs ? BlobDumpTool::DisplayType::kNone : blob_type;
-
   BlobDumpTool::DisplayType show_key = is_key_hex
                                            ? BlobDumpTool::DisplayType::kHex
                                            : BlobDumpTool::DisplayType::kRaw;
-  Status s = tool.Run(filename, show_key, show_blob, show_uncompressed_blob,
-                      /* show_summary */ true);
+  Status s = tool.Run(filename, show_key, show_blob, /* show_summary */ true);
   if (!s.ok()) {
     fprintf(stderr, "Failed: %s\n", s.ToString().c_str());
   }
@@ -4683,18 +4790,14 @@ DBFileDumperCommand::DBFileDumperCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
-                 BuildCmdLineOptions(
-                     {ARG_DECODE_BLOB_INDEX, ARG_DUMP_UNCOMPRESSED_BLOBS})),
-      decode_blob_index_(IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX)),
-      dump_uncompressed_blobs_(
-          IsFlagPresent(flags, ARG_DUMP_UNCOMPRESSED_BLOBS)) {}
+    : LDBCommand(options, flags, true /* is_read_only */,
+                 BuildCmdLineOptions({ARG_DECODE_BLOB_INDEX})),
+      decode_blob_index_(IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX)) {}
 
 void DBFileDumperCommand::Help(std::string& ret) {
   ret.append("  ");
   ret.append(DBFileDumperCommand::Name());
-  ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "] ");
-  ret.append(" [--" + ARG_DUMP_UNCOMPRESSED_BLOBS + "] ");
+  ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "]");
   ret.append("\n");
 }
 
@@ -4762,8 +4865,7 @@ void DBFileDumperCommand::DoCommand() {
       filename = NormalizePath(filename);
       std::cout << filename << std::endl;
       std::cout << "------------------------------" << std::endl;
-      DumpBlobFile(filename, /* is_key_hex */ false, /* is_value_hex */ false,
-                   dump_uncompressed_blobs_);
+      DumpBlobFile(filename, /* is_key_hex */ false, /* is_value_hex */ false);
       std::cout << std::endl;
     }
   }
@@ -4804,7 +4906,7 @@ DBLiveFilesMetadataDumperCommand::DBLiveFilesMetadataDumperCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_SORT_BY_FILENAME})) {
   sort_by_filename_ = IsFlagPresent(flags, ARG_SORT_BY_FILENAME);
 }
@@ -5119,7 +5221,8 @@ void IngestExternalSstFilesCommand::OverrideBaseOptions() {
 ListFileRangeDeletesCommand::ListFileRangeDeletesCommand(
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_MAX_KEYS})) {
+    : LDBCommand(options, flags, true /* is_read_only */,
+                 BuildCmdLineOptions({ARG_MAX_KEYS})) {
   auto itr = options.find(ARG_MAX_KEYS);
   if (itr != options.end()) {
     try {
@@ -5287,4 +5390,35 @@ void UpdateManifestCommand::DoCommand() {
   }
 }
 
+const std::string CompactionProgressDumpCommand::ARG_PATH = "path";
+
+CompactionProgressDumpCommand::CompactionProgressDumpCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_PATH})) {
+  auto itr = options.find(ARG_PATH);
+  if (itr != options.end()) {
+    path_ = itr->second;
+  } else {
+    path_ = "";
+  }
+
+  if (path_.empty()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "The --path option is required for compaction_progress_dump command");
+  }
+}
+
+void CompactionProgressDumpCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(CompactionProgressDumpCommand::Name());
+  ret.append(" [--" + ARG_PATH + "=<path_to_compaction_progress_file>]");
+  ret.append("\n");
+}
+
+void CompactionProgressDumpCommand::DoCommand() {
+  DumpCompactionProgressFile(path_);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h
index 3f7273dd5447..ee3122d805c0 100644
--- a/tools/ldb_cmd_impl.h
+++ b/tools/ldb_cmd_impl.h
@@ -47,7 +47,6 @@ class DBFileDumperCommand : public LDBCommand {
 
  private:
   bool decode_blob_index_;
-  bool dump_uncompressed_blobs_;
 };
 
 class DBLiveFilesMetadataDumperCommand : public LDBCommand {
@@ -109,7 +108,6 @@ class DBDumperCommand : public LDBCommand {
   bool print_stats_;
   std::string path_;
   bool decode_blob_index_;
-  bool dump_uncompressed_blobs_;
 
   static const std::string ARG_COUNT_ONLY;
   static const std::string ARG_COUNT_DELIM;
@@ -814,4 +812,25 @@ class UnsafeRemoveSstFileCommand : public LDBCommand {
   uint64_t sst_file_number_;
 };
 
+class CompactionProgressDumpCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "compaction_progress_dump"; }
+
+  CompactionProgressDumpCommand(
+      const std::vector<std::string>& params,
+      const std::map<std::string, std::string>& options,
+      const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+ private:
+  std::string path_;
+
+  static const std::string ARG_PATH;
+};
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/tools/ldb_cmd_test.cc b/tools/ldb_cmd_test.cc
index 711a313db678..09be65a6914c 100644
--- a/tools/ldb_cmd_test.cc
+++ b/tools/ldb_cmd_test.cc
@@ -6,6 +6,8 @@
 #include "rocksdb/utilities/ldb_cmd.h"
 
 #include <cinttypes>
+#include <iomanip>
+#include <memory>
 
 #include "db/db_test_util.h"
 #include "db/version_edit.h"
@@ -98,7 +100,7 @@ TEST_F(LdbCmdTest, MemEnv) {
   opts.env = env.get();
   opts.create_if_missing = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -112,7 +114,7 @@ TEST_F(LdbCmdTest, MemEnv) {
   fopts.wait = true;
   ASSERT_OK(db->Flush(fopts));
 
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -207,8 +209,9 @@ class FileChecksumTestHelper {
     WriteController wc(options_.delayed_write_rate);
     WriteBufferManager wb(options_.db_write_buffer_size);
     ImmutableDBOptions immutable_db_options(options_);
-    VersionSet versions(dbname_, &immutable_db_options, sopt, tc.get(), &wb,
-                        &wc, nullptr, nullptr, "", "",
+    VersionSet versions(dbname_, &immutable_db_options,
+                        MutableDBOptions{options_}, sopt, tc.get(), &wb, &wc,
+                        nullptr, nullptr, "", "",
                         options_.daily_offpeak_time_utc, nullptr,
                         /*read_only=*/false);
     std::vector<std::string> cf_name_list;
@@ -283,7 +286,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
   opts.env = env.get();
   opts.create_if_missing = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -320,7 +323,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
   }
   ASSERT_OK(db->Flush(fopts));
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -335,7 +338,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
   // Verify each sst file checksum value and checksum name
-  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  FileChecksumTestHelper fct_helper(opts, db.get(), dbname);
   ASSERT_OK(fct_helper.VerifyEachFileChecksum());
 
   // Manually trigger compaction
@@ -348,11 +351,11 @@ TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
   CompactRangeOptions options;
   ASSERT_OK(db->CompactRange(options, &begin, &end));
   // Verify each sst file checksum after compaction
-  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  FileChecksumTestHelper fct_helper_ac(opts, db.get(), dbname);
   ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   ASSERT_EQ(0,
             LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
@@ -362,7 +365,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
   // Verify the checksum information in memory is the same as that in Manifest;
   std::vector<LiveFileMetaData> live_files;
   db->GetLiveFilesMetaData(&live_files);
-  delete db;
+  db.reset();
   ASSERT_OK(fct_helper_ac.VerifyChecksumInManifest(live_files));
 }
 
@@ -374,7 +377,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumNoChecksum) {
   opts.create_if_missing = true;
   opts.enable_blob_files = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -411,7 +414,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumNoChecksum) {
   }
   ASSERT_OK(db->Flush(fopts));
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   std::string arg2_str = "--db=" + dbname;
@@ -425,7 +428,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumNoChecksum) {
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
   // Verify each sst and blob file checksum value and checksum name
-  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  FileChecksumTestHelper fct_helper(opts, db.get(), dbname);
   ASSERT_OK(fct_helper.VerifyEachFileChecksum());
 
   // Manually trigger compaction
@@ -441,11 +444,11 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumNoChecksum) {
   CompactRangeOptions options;
   ASSERT_OK(db->CompactRange(options, &begin, &end));
   // Verify each sst file checksum after compaction
-  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  FileChecksumTestHelper fct_helper_ac(opts, db.get(), dbname);
   ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   ASSERT_EQ(0,
             LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
@@ -459,7 +462,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
   opts.create_if_missing = true;
   opts.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -496,7 +499,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
   }
   ASSERT_OK(db->Flush(fopts));
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -511,7 +514,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
   // Verify each sst file checksum value and checksum name
-  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  FileChecksumTestHelper fct_helper(opts, db.get(), dbname);
   ASSERT_OK(fct_helper.VerifyEachFileChecksum());
 
   // Manually trigger compaction
@@ -524,11 +527,11 @@ TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
   CompactRangeOptions options;
   ASSERT_OK(db->CompactRange(options, &begin, &end));
   // Verify each sst file checksum after compaction
-  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  FileChecksumTestHelper fct_helper_ac(opts, db.get(), dbname);
   ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   ASSERT_EQ(0,
             LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
@@ -541,7 +544,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
   ASSERT_OK(fct_helper_ac.VerifyChecksumInManifest(live_files));
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 }
 
 TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) {
@@ -553,7 +556,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) {
   opts.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
   opts.enable_blob_files = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -590,7 +593,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) {
   }
   ASSERT_OK(db->Flush(fopts));
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   std::string arg2_str = "--db=" + dbname;
@@ -604,7 +607,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) {
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
   // Verify each sst and blob file checksum value and checksum name
-  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  FileChecksumTestHelper fct_helper(opts, db.get(), dbname);
   ASSERT_OK(fct_helper.VerifyEachFileChecksum());
 
   // Manually trigger compaction
@@ -620,11 +623,11 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) {
   CompactRangeOptions options;
   ASSERT_OK(db->CompactRange(options, &begin, &end));
   // Verify each sst file checksum after compaction
-  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  FileChecksumTestHelper fct_helper_ac(opts, db.get(), dbname);
   ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   ASSERT_EQ(0,
             LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
@@ -676,7 +679,7 @@ TEST_F(LdbCmdTest, ListFileTombstone) {
   opts.env = env.get();
   opts.create_if_missing = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -692,7 +695,7 @@ TEST_F(LdbCmdTest, ListFileTombstone) {
   ASSERT_OK(db->DeleteRange(wopts, db->DefaultColumnFamily(), "bar", "foo2"));
   ASSERT_OK(db->Flush(fopts));
 
-  delete db;
+  db.reset();
 
   {
     char arg1[] = "./ldb";
@@ -769,7 +772,7 @@ TEST_F(LdbCmdTest, DisableConsistencyChecks) {
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
 
   {
-    DB* db = nullptr;
+    std::unique_ptr<DB> db;
     ASSERT_OK(DB::Open(opts, dbname, &db));
 
     WriteOptions wopts;
@@ -783,8 +786,6 @@ TEST_F(LdbCmdTest, DisableConsistencyChecks) {
     ASSERT_OK(db->Put(wopts, "foo2", "3"));
     ASSERT_OK(db->Put(wopts, "bar2", "4"));
     ASSERT_OK(db->Flush(fopts));
-
-    delete db;
   }
 
   {
@@ -888,7 +889,7 @@ TEST_F(LdbCmdTest, LoadCFOptionsAndOverride) {
   opts.env = env.get();
   opts.create_if_missing = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DestroyDB(dbname, opts));
   ASSERT_OK(DB::Open(opts, dbname, &db));
@@ -899,7 +900,7 @@ TEST_F(LdbCmdTest, LoadCFOptionsAndOverride) {
   ASSERT_OK(db->CreateColumnFamily(cf_opts, "cf1", &cf_handle));
 
   delete cf_handle;
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -931,7 +932,7 @@ TEST_F(LdbCmdTest, UnsafeRemoveSstFile) {
   opts.level0_file_num_compaction_trigger = 10;
   opts.create_if_missing = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(Env::Default(), "ldb_cmd_test");
   ASSERT_OK(DestroyDB(dbname, opts));
   ASSERT_OK(DB::Open(opts, dbname, &db));
@@ -955,8 +956,7 @@ TEST_F(LdbCmdTest, UnsafeRemoveSstFile) {
   uint64_t to_remove = numbers[1];
 
   // Close for unsafe_remove_sst_file
-  delete db;
-  db = nullptr;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -1005,8 +1005,7 @@ TEST_F(LdbCmdTest, UnsafeRemoveSstFile) {
 
   // Close for unsafe_remove_sst_file
   delete cf_handle;
-  delete db;
-  db = nullptr;
+  db.reset();
 
   snprintf(arg4, sizeof(arg4), "%" PRIu64, to_remove);
   ASSERT_EQ(0,
@@ -1047,8 +1046,7 @@ TEST_F(LdbCmdTest, UnsafeRemoveSstFile) {
   for (auto& h : handles) {
     delete h;
   }
-  delete db;
-  db = nullptr;
+  db.reset();
 
   snprintf(arg4, sizeof(arg4), "%" PRIu64, to_remove);
   ASSERT_EQ(0,
@@ -1064,7 +1062,7 @@ TEST_F(LdbCmdTest, UnsafeRemoveSstFile) {
   for (auto& h : handles) {
     delete h;
   }
-  delete db;
+  db.reset();
 }
 
 TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) {
@@ -1076,7 +1074,7 @@ TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) {
   opts.create_if_missing = true;
   opts.env = env.get();
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DestroyDB(dbname, opts));
   ASSERT_OK(DB::Open(opts, dbname, &db));
@@ -1100,8 +1098,7 @@ TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) {
   }
 
   // Close & reopen
-  delete db;
-  db = nullptr;
+  db.reset();
   test_fs->PopRequestedSstFileTemperatures();
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -1120,8 +1117,7 @@ TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) {
   }
 
   // Close for update_manifest
-  delete db;
-  db = nullptr;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -1149,7 +1145,7 @@ TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) {
   for (auto& r : requests) {
     ASSERT_EQ(r.second, number_to_temp[r.first]);
   }
-  delete db;
+  db.reset();
 }
 
 TEST_F(LdbCmdTest, RenameDbAndLoadOptions) {
@@ -1229,7 +1225,7 @@ TEST_F(LdbCmdTest, CustomComparator) {
   opts.comparator = &my_comparator;
 
   std::string dbname = test::PerThreadDBPath(env, "ldb_cmd_test");
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
 
   std::vector<ColumnFamilyDescriptor> cfds = {
       {kDefaultColumnFamilyName, opts}, {"cf1", opts}, {"cf2", opts}};
@@ -1241,7 +1237,7 @@ TEST_F(LdbCmdTest, CustomComparator) {
   for (auto& h : handles) {
     ASSERT_OK(db->DestroyColumnFamilyHandle(h));
   }
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   std::string arg2 = "--db=" + dbname;
diff --git a/tools/ldb_test.py b/tools/ldb_test.py
index a8956f160f1d..e91d521d5b5c 100644
--- a/tools/ldb_test.py
+++ b/tools/ldb_test.py
@@ -347,6 +347,44 @@ def testTtlPutGet(self):
         self.assertRunFAIL("get --ttl a3")
         self.assertRunOK("checkconsistency", "OK")
 
+    def testTxnPutGet(self):
+        print("Running testTxnPutGet...")
+        # Test basic put/get with TransactionDB (WriteCommitted - default)
+        self.assertRunOK("put t1 v1 --use_txn --create_if_missing", "OK")
+        self.assertRunOK("put t2 v2 --use_txn", "OK")
+        self.assertRunOK("put t3 v3 --use_txn", "OK")
+        # Verify data can be read back with TransactionDB
+        self.assertRunOK("batchput t4 v4 t5 v5 --use_txn", "OK")
+
+        # Test with WritePrepared policy (txn_write_policy=1)
+        self.assertRunOK("put t6 v6 --use_txn --txn_write_policy=1", "OK")
+
+        # Test with WriteUnprepared policy (txn_write_policy=2)
+        self.assertRunOK("put t7 v7 --use_txn --txn_write_policy=2", "OK")
+
+        # Verify all data persists and can be read without --use_txn
+        # (regular DB::Open should work for WriteCommitted data)
+        self.assertRunOK(
+            "scan",
+            "t1 ==> v1\nt2 ==> v2\nt3 ==> v3\nt4 ==> v4\nt5 ==> v5\nt6 ==> v6\nt7 ==> v7",
+        )
+
+        # Test delete with TransactionDB
+        self.assertRunOK("delete t3 --use_txn", "OK")
+        self.assertRunOK(
+            "scan",
+            "t1 ==> v1\nt2 ==> v2\nt4 ==> v4\nt5 ==> v5\nt6 ==> v6\nt7 ==> v7",
+        )
+
+        # Verify that --use_txn and --ttl cannot be used together
+        self.assertRunFAIL("put x1 y1 --use_txn --ttl --create_if_missing")
+
+        # Verify invalid txn_write_policy values are handled
+        # (values outside 0-2 should fall back to 0)
+        self.assertRunOK("put t8 v8 --use_txn --txn_write_policy=0", "OK")
+
+        self.assertRunOK("checkconsistency", "OK")
+
     def testInvalidCmdLines(self):  # noqa: F811 T25377293 Grandfathered in
         print("Running testInvalidCmdLines...")
         # db not specified
@@ -613,7 +651,7 @@ def testDumpLiveFiles(self):
         # Call the dump_live_files function with the edited dbPath name.
         self.assertTrue(
             self.dumpLiveFiles(
-                "--db=%s --decode_blob_index --dump_uncompressed_blobs" % dbPath,
+                "--db=%s --decode_blob_index" % dbPath,
                 dumpFilePath,
             )
         )
@@ -881,7 +919,7 @@ def testBlobDump(self):
         expected_pattern = re.compile(regex)
         blob_files = self.getBlobFiles(dbPath)
         self.assertTrue(len(blob_files) >= 1)
-        cmd = "dump --path=%s --dump_uncompressed_blobs"
+        cmd = "dump --path=%s"
         self.assertRunOKFull(
             (cmd) % (blob_files[0]), expected_pattern, unexpected=False, isPattern=True
         )
diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc
index 3dd1905e83ba..bee8d6f4f9ef 100644
--- a/tools/ldb_tool.cc
+++ b/tools/ldb_tool.cc
@@ -52,6 +52,13 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options,
   ret.append("  --" + LDBCommand::ARG_TTL +
              " with 'put','get','scan','dump','query','batchput'"
              " : DB supports ttl and value is internally timestamp-suffixed\n");
+  ret.append("  --" + LDBCommand::ARG_USE_TXN +
+             " : Open database as TransactionDB. Required for databases "
+             "created with WritePrepared or WriteUnprepared transactions.\n");
+  ret.append("  --" + LDBCommand::ARG_TXN_WRITE_POLICY +
+             "=<0|1|2> : Transaction write policy. "
+             "0=WRITE_COMMITTED (default), 1=WRITE_PREPARED, "
+             "2=WRITE_UNPREPARED\n");
   ret.append("  --" + LDBCommand::ARG_TRY_LOAD_OPTIONS +
              " : Try to load option file from DB. Default to true if " +
              LDBCommand::ARG_DB +
@@ -119,6 +126,7 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options,
   DBDumperCommand::Help(ret);
   DBLoaderCommand::Help(ret);
   ManifestDumpCommand::Help(ret);
+  CompactionProgressDumpCommand::Help(ret);
   UpdateManifestCommand::Help(ret);
   FileChecksumDumpCommand::Help(ret);
   GetPropertyCommand::Help(ret);
@@ -185,8 +193,14 @@ int LDBCommandRunner::RunCommand(
 void LDBTool::Run(int argc, char** argv, Options options,
                   const LDBOptions& ldb_options,
                   const std::vector<ColumnFamilyDescriptor>* column_families) {
-  int error_code = LDBCommandRunner::RunCommand(argc, argv, options,
-                                                ldb_options, column_families);
-  exit(error_code);
+  exit(RunAndReturn(argc, argv, options, ldb_options, column_families));
+}
+
+int LDBTool::RunAndReturn(
+    int argc, char** argv, const Options& options,
+    const LDBOptions& ldb_options,
+    const std::vector<ColumnFamilyDescriptor>* column_families) {
+  return LDBCommandRunner::RunCommand(argc, argv, options, ldb_options,
+                                      column_families);
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc
index 061270d6d32a..2ab32a8c97b1 100644
--- a/tools/reduce_levels_test.cc
+++ b/tools/reduce_levels_test.cc
@@ -21,7 +21,7 @@ class ReduceLevelTest : public testing::Test {
   ReduceLevelTest() {
     dbname_ = test::PerThreadDBPath("db_reduce_levels_test");
     EXPECT_OK(DestroyDB(dbname_, Options()));
-    db_ = nullptr;
+    db_.reset();
   }
 
   Status OpenDB(bool create_if_missing, int levels);
@@ -46,12 +46,12 @@ class ReduceLevelTest : public testing::Test {
     if (db_ == nullptr) {
       return Status::InvalidArgument("DB not opened.");
     }
-    DBImpl* db_impl = static_cast_with_check<DBImpl>(db_);
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db_.get());
     return db_impl->TEST_FlushMemTable();
   }
 
   void MoveL0FileToLevel(int level) {
-    DBImpl* db_impl = static_cast_with_check<DBImpl>(db_);
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db_.get());
     for (int i = 0; i < level; ++i) {
       ASSERT_OK(db_impl->TEST_CompactRange(i, nullptr, nullptr));
     }
@@ -59,8 +59,7 @@ class ReduceLevelTest : public testing::Test {
 
   void CloseDB() {
     if (db_ != nullptr) {
-      delete db_;
-      db_ = nullptr;
+      db_.reset();
     }
   }
 
@@ -75,7 +74,7 @@ class ReduceLevelTest : public testing::Test {
 
  private:
   std::string dbname_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 };
 
 Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels) {
diff --git a/tools/regression_test.sh b/tools/regression_test.sh
index 26380f61439c..a823f39b735a 100755
--- a/tools/regression_test.sh
+++ b/tools/regression_test.sh
@@ -127,17 +127,15 @@ function main {
 
   setup_test_directory
   if [ $TEST_MODE -le 1 ]; then
-      test_remote "test -d $ORIGIN_PATH"
-      if [[ $? -ne 0 ]]; then
-          echo "Building DB..."
-          # compactall alone will not print ops or threads, which will fail update_report
-          run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0
-          # only save for future use on success
-          test_remote "mv $DB_PATH $ORIGIN_PATH"
-      fi
+      echo "Building DB..."
+      # compactall alone will not print ops or threads, which will fail update_report
+     run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0
+     # only save for future use on success
+     test_remote "mv $DB_PATH $ORIGIN_PATH"
   fi
   if [ $TEST_MODE -ge 1 ]; then
       build_checkpoint
+
       # run_db_bench benchmark_name NUM_OPS NUM_THREADS USED_EXISTING_DB UPDATE_REPORT ASYNC_IO
       run_db_bench "seekrandom_asyncio" $NUM_OPS $NUM_THREADS  1 1 true
       run_db_bench "multireadrandom_asyncio" $NUM_OPS $NUM_THREADS  1 1 true
@@ -332,25 +330,22 @@ function set_async_io_parameters {
 }
 
 function build_checkpoint {
-    cmd_prefix=""
-    if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
-        cmd_prefix="$SSH $REMOTE_USER_AT_HOST "
-    fi
+    echo "NUM_MULTI_DB=$NUM_MULTI_DB"
     if [ $NUM_MULTI_DB -gt 1 ]; then
-        dirs=$($cmd_prefix find $ORIGIN_PATH -type d -links 2)
+        run_remote "mkdir -p $DB_PATH"
+        run_remote "find $ORIGIN_PATH -type d -links 2"
+        dirs=$?
         for dir in $dirs; do
             db_index=$(basename $dir)
             echo "Building checkpoints: $ORIGIN_PATH/$db_index -> $DB_PATH/$db_index ..."
-            $cmd_prefix $DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH/$db_index \
-                        --db=$ORIGIN_PATH/$db_index --try_load_options 2>&1
+            run_remote "$DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH/$db_index --db=$ORIGIN_PATH/$db_index --try_load_options 2>&1"
             exit_on_error $?
         done
     else
         # checkpoint cannot build in directory already exists
-        $cmd_prefix rm -rf $DB_PATH
+        run_remote "rm -rf $DB_PATH"
         echo "Building checkpoint: $ORIGIN_PATH -> $DB_PATH ..."
-        $cmd_prefix $DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH \
-                    --db=$ORIGIN_PATH --try_load_options 2>&1
+        run_remote "$DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH --db=$ORIGIN_PATH --try_load_options 2>&1"
         exit_on_error $?
     fi
 }
@@ -453,7 +448,7 @@ function setup_options_file {
  if ! [ -z "$OPTIONS_FILE" ]; then
     if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
       options_file="$DB_BENCH_DIR/OPTIONS_FILE"
-      run_local "$SCP $OPTIONS_FILE $REMOTE_USER_AT_HOST:$options_file"
+      $SCP $OPTIONS_FILE $REMOTE_USER_AT_HOST:$options_file
     else
       options_file="$OPTIONS_FILE"
     fi
@@ -486,9 +481,8 @@ function setup_test_directory {
   run_remote "ls -l $DB_BENCH_DIR"
 
   if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
-      shopt -s nullglob # allow missing librocksdb*.so* for static lib build
-      run_local "tar cz db_bench ldb librocksdb*.so* | $SSH $REMOTE_USER_AT_HOST 'cd $DB_BENCH_DIR/ && tar xzv'"
-      shopt -u nullglob
+    run_local "$SCP db_bench $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/."
+    run_local "$SCP ldb $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/."
   fi
 
   run_local "mkdir -p $RESULT_PATH"
diff --git a/tools/run_clang_tidy.py b/tools/run_clang_tidy.py
new file mode 100755
index 000000000000..0ae70bbd5829
--- /dev/null
+++ b/tools/run_clang_tidy.py
@@ -0,0 +1,683 @@
+#!/usr/bin/env python3
+"""
+Run clang-tidy on locally changed code and filter results to changed lines.
+
+This script detects local changes by combining:
+  1. Uncommitted changes (staged + unstaged + untracked files)
+  2. Committed-but-not-pushed changes (local commits not in the remote)
+
+It then runs clang-tidy only on the changed .cc/.cpp files (in parallel) and
+filters the output to show only warnings on lines that were actually modified.
+
+Usage:
+  python3 tools/run_clang_tidy.py [options]
+
+Examples:
+  # Basic usage (auto-detects base from remote tracking branch):
+  python3 tools/run_clang_tidy.py
+
+  # Specify clang-tidy binary and parallelism:
+  python3 tools/run_clang_tidy.py --clang-tidy-binary clang-tidy-18 -j 14
+
+  # Explicit diff base (useful in CI where the checkout is a merge commit):
+  python3 tools/run_clang_tidy.py --diff-base HEAD~1
+
+  # Save full (unfiltered) output to a file:
+  python3 tools/run_clang_tidy.py -o full_output.txt
+
+  # Show all warnings, not just on changed lines:
+  python3 tools/run_clang_tidy.py --verbose
+
+  # CI mode with GitHub annotations and step summary:
+  python3 tools/run_clang_tidy.py --diff-base HEAD~1 --github-annotations --github-step-summary
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+
+def log(msg=""):
+    """Print and flush immediately so output is visible in real time."""
+    print(msg, flush=True)
+
+
+def run_cmd(cmd, cwd=None):
+    """Run a command and return (stdout, return_code)."""
+    result = subprocess.run(cmd, capture_output=True, text=True, cwd=cwd)
+    return result.stdout.strip(), result.returncode
+
+
+def get_repo_root():
+    """Get the git repository root directory."""
+    out, rc = run_cmd(["git", "rev-parse", "--show-toplevel"])
+    if rc != 0:
+        log("Error: not inside a git repository.")
+        sys.exit(1)
+    return out
+
+
+def find_remote_base(repo_root):
+    """
+    Auto-detect the base commit to diff against.
+
+    Strategy:
+      1. Use the upstream tracking branch of the current branch if available.
+      2. Fall back to origin/main, origin/master, upstream/main, upstream/master.
+      3. Return the merge-base of HEAD and that ref.
+    """
+    out, rc = run_cmd(
+        ["git", "rev-parse", "--abbrev-ref", "--symbolic-full-name", "@{upstream}"],
+        cwd=repo_root,
+    )
+    if rc == 0 and out:
+        base_ref = out
+    else:
+        base_ref = None
+        for candidate in [
+            "origin/main", "origin/master",
+            "upstream/main", "upstream/master",
+        ]:
+            _, rc = run_cmd(["git", "rev-parse", "--verify", candidate], cwd=repo_root)
+            if rc == 0:
+                base_ref = candidate
+                break
+        if base_ref is None:
+            log(
+                "Error: cannot determine remote base branch.\n"
+                "Set an upstream: git branch --set-upstream-to=<remote>/<branch>\n"
+                "Or use --diff-base <ref> to specify the base explicitly."
+            )
+            sys.exit(1)
+
+    merge_base, rc = run_cmd(["git", "merge-base", "HEAD", base_ref], cwd=repo_root)
+    if rc != 0:
+        log(f"Error: cannot compute merge-base with {base_ref}.")
+        sys.exit(1)
+
+    return merge_base, base_ref
+
+
+def resolve_diff_base(diff_base_arg, repo_root):
+    """
+    Resolve --diff-base to a concrete commit SHA.
+
+    When --diff-base is given, resolve the ref and return (sha, display_name).
+    Otherwise, fall back to auto-detection via find_remote_base().
+    """
+    if diff_base_arg:
+        sha, rc = run_cmd(
+            ["git", "rev-parse", "--verify", diff_base_arg], cwd=repo_root
+        )
+        if rc != 0:
+            log(f"Error: --diff-base '{diff_base_arg}' is not a valid git ref.")
+            sys.exit(1)
+        return sha, diff_base_arg
+
+    return find_remote_base(repo_root)
+
+
+def parse_diff_for_changed_lines(diff_text):
+    """
+    Parse a unified diff and return {relative_path: set_of_new_line_numbers}.
+
+    Only tracks added/modified lines (the '+' side of the diff).
+    """
+    changed = {}
+    current_file = None
+
+    for line in diff_text.split("\n"):
+        m = re.match(r"^\+\+\+ b/(.*)", line)
+        if m:
+            current_file = m.group(1)
+            changed.setdefault(current_file, set())
+            continue
+
+        m = re.match(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@", line)
+        if m and current_file is not None:
+            start = int(m.group(1))
+            count = int(m.group(2)) if m.group(2) else 1
+            if count == 0:
+                continue
+            for i in range(start, start + count):
+                changed[current_file].add(i)
+
+    return changed
+
+
+def collect_changed_lines(repo_root, diff_base_arg=None):
+    """
+    Collect every locally-changed file and its changed line numbers.
+
+    When diff_base_arg is provided, diffs HEAD against that ref directly.
+    Otherwise, auto-detects the remote base and also picks up uncommitted
+    and untracked changes.
+    """
+    base_sha, base_label = resolve_diff_base(diff_base_arg, repo_root)
+    head_short, _ = run_cmd(["git", "rev-parse", "--short", "HEAD"], cwd=repo_root)
+
+    log_out, _ = run_cmd(
+        ["git", "log", "--oneline", f"{base_sha}..HEAD"], cwd=repo_root
+    )
+    local_commits = [l for l in log_out.split("\n") if l.strip()] if log_out else []
+
+    log(f"  Diff base   : {base_label}  ({base_sha[:10]})")
+    log(f"  HEAD        : {head_short}")
+    log(f"  Commits in range: {len(local_commits)}")
+    for c in local_commits[:20]:
+        log(f"    {c}")
+    if len(local_commits) > 20:
+        log(f"    ... and {len(local_commits) - 20} more")
+
+    all_changed = {}
+    src_pattern = r"\.(cc|cpp|h)$"
+
+    def merge_into(target, source):
+        for f, lines in source.items():
+            target.setdefault(f, set()).update(lines)
+
+    # Committed changes: base..HEAD
+    diff_committed, _ = run_cmd(
+        ["git", "diff", "--unified=0", f"{base_sha}..HEAD",
+         "--", "*.cc", "*.cpp", "*.h"],
+        cwd=repo_root,
+    )
+    merge_into(all_changed, parse_diff_for_changed_lines(diff_committed))
+
+    # When using explicit --diff-base (e.g. CI), skip working-tree checks
+    if diff_base_arg is None:
+        # Unstaged changes
+        diff_unstaged, _ = run_cmd(
+            ["git", "diff", "--unified=0", "--", "*.cc", "*.cpp", "*.h"],
+            cwd=repo_root,
+        )
+        merge_into(all_changed, parse_diff_for_changed_lines(diff_unstaged))
+
+        # Staged changes
+        diff_staged, _ = run_cmd(
+            ["git", "diff", "--unified=0", "--cached", "--", "*.cc", "*.cpp", "*.h"],
+            cwd=repo_root,
+        )
+        merge_into(all_changed, parse_diff_for_changed_lines(diff_staged))
+
+        # Untracked files — treat every line as changed
+        untracked_out, _ = run_cmd(
+            ["git", "ls-files", "--others", "--exclude-standard"], cwd=repo_root
+        )
+        for f in untracked_out.split("\n"):
+            f = f.strip()
+            if not f or not re.search(src_pattern, f):
+                continue
+            filepath = os.path.join(repo_root, f)
+            if os.path.isfile(filepath):
+                with open(filepath) as fh:
+                    line_count = sum(1 for _ in fh)
+                all_changed.setdefault(f, set()).update(range(1, line_count + 1))
+
+    return all_changed
+
+
+def load_compile_db(compile_db_path, repo_root):
+    """Load compile_commands.json and return a set of known file paths (both abs and rel)."""
+    if not os.path.exists(compile_db_path):
+        log(
+            f"Error: {compile_db_path} not found.\n"
+            "Generate it with:\n"
+            "  mkdir build && cd build && cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ..\n"
+            "  ln -sf build/compile_commands.json compile_commands.json"
+        )
+        sys.exit(1)
+
+    with open(compile_db_path) as f:
+        db = json.load(f)
+
+    files = set()
+    prefix = repo_root.rstrip("/") + "/"
+    for entry in db:
+        abs_path = entry["file"]
+        files.add(abs_path)
+        if abs_path.startswith(prefix):
+            files.add(abs_path[len(prefix):])
+    return files
+
+
+def invoke_clang_tidy(clang_tidy_bin, compile_db_dir, filepath, repo_root):
+    """Run clang-tidy on a single file. Returns (filepath, combined_output, return_code)."""
+    abs_path = os.path.join(repo_root, filepath)
+    cmd = [clang_tidy_bin, "-p", compile_db_dir, abs_path]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
+        return filepath, result.stdout + result.stderr, result.returncode
+    except subprocess.TimeoutExpired:
+        return filepath, f"TIMEOUT after 600s: {abs_path}\n", -1
+
+
+def emit_github_annotations(filtered_lines, repo_root):
+    """
+    Emit GitHub Actions workflow commands for each warning/error so they
+    appear as inline annotations on the PR diff.
+
+    Format: ::warning file={path},line={line}::{message}
+
+    Errors are emitted first so they occupy annotation slots before warnings,
+    since GitHub Actions caps display at 10 warnings and 10 errors per step.
+    Use --github-step-summary for the complete report.
+    """
+    prefix = repo_root.rstrip("/") + "/"
+
+    annotations = []
+    for line in filtered_lines:
+        m = re.match(r"^(.*?):(\d+):(\d+): (warning|error): (.+)", line)
+        if not m:
+            continue
+        filepath = m.group(1)
+        lineno = m.group(2)
+        col = m.group(3)
+        severity = m.group(4)
+        message = m.group(5)
+
+        rel_path = filepath
+        if filepath.startswith(prefix):
+            rel_path = filepath[len(prefix):]
+
+        gh_level = "error" if severity == "error" else "warning"
+        annotations.append((gh_level, rel_path, lineno, col, message))
+
+    annotations.sort(key=lambda a: (0 if a[0] == "error" else 1, a[1], int(a[2])))
+
+    for gh_level, rel_path, lineno, col, message in annotations:
+        log(f"::{gh_level} file={rel_path},line={lineno},col={col}::{message}")
+
+
+COMMENT_MARKER = "<!-- clang-tidy-bot -->"
+
+
+def _format_diagnostic_counts(diagnostic_lines):
+    """Return a human-readable string like '3 error(s) and 5 warning(s)'."""
+    n_errors = sum(1 for l in diagnostic_lines if re.search(r": error:", l))
+    n_warnings = sum(1 for l in diagnostic_lines if re.search(r": warning:", l))
+    parts = []
+    if n_errors:
+        parts.append(f"{n_errors} error(s)")
+    if n_warnings:
+        parts.append(f"{n_warnings} warning(s)")
+    return " and ".join(parts) if parts else "0 findings"
+
+
+def build_markdown_summary(diagnostic_lines, by_check, wall_time, repo_root):
+    """Build a Markdown summary string from clang-tidy results."""
+    prefix = repo_root.rstrip("/") + "/"
+    lines = []
+
+    if not diagnostic_lines:
+        lines.append("## :white_check_mark: clang-tidy: No findings on changed lines")
+        lines.append(f"\nCompleted in {wall_time:.1f}s.")
+    else:
+        counts = _format_diagnostic_counts(diagnostic_lines)
+        has_errors = any(re.search(r": error:", l) for l in diagnostic_lines)
+        icon = ":x:" if has_errors else ":warning:"
+        lines.append(f"## {icon} clang-tidy: {counts} on changed lines")
+        lines.append(f"\nCompleted in {wall_time:.1f}s.\n")
+
+        lines.append("### Summary by check\n")
+        lines.append("| Check | Count |")
+        lines.append("|-------|------:|")
+        for check in sorted(by_check):
+            lines.append(f"| `{check}` | {len(by_check[check])} |")
+        lines.append(f"| **Total** | **{len(diagnostic_lines)}** |")
+
+        lines.append("\n### Details\n")
+        by_file = {}
+        for line in diagnostic_lines:
+            m = re.match(r"^(.*?):(\d+):(\d+): (warning|error): (.+)", line)
+            if m:
+                filepath = m.group(1)
+                if filepath.startswith(prefix):
+                    filepath = filepath[len(prefix):]
+                by_file.setdefault(filepath, []).append(line)
+
+        for filepath in sorted(by_file):
+            n_e = sum(1 for l in by_file[filepath] if ": error:" in l)
+            n_w = sum(1 for l in by_file[filepath] if ": warning:" in l)
+            file_parts = []
+            if n_e:
+                file_parts.append(f"{n_e} error(s)")
+            if n_w:
+                file_parts.append(f"{n_w} warning(s)")
+            file_summary = ", ".join(file_parts)
+            lines.append(f"<details><summary><code>{filepath}</code> ({file_summary})</summary>\n")
+            lines.append("```")
+            for w in by_file[filepath]:
+                clean = w
+                if clean.startswith(prefix):
+                    clean = clean[len(prefix):]
+                lines.append(clean)
+            lines.append("```\n")
+            lines.append("</details>\n")
+
+    return "\n".join(lines)
+
+
+def write_github_step_summary(warning_lines, by_check, wall_time, repo_root):
+    """
+    Write a Markdown summary to $GITHUB_STEP_SUMMARY.
+
+    This appears on the job's summary page in GitHub Actions and has no
+    practical size limit, unlike annotations (capped at 10+10 per step).
+    """
+    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+    if not summary_path:
+        log("  $GITHUB_STEP_SUMMARY not set; skipping step summary.")
+        return
+
+    md = build_markdown_summary(warning_lines, by_check, wall_time, repo_root)
+    with open(summary_path, "a") as f:
+        f.write(md + "\n")
+    log(f"  Step summary written to $GITHUB_STEP_SUMMARY")
+
+
+def write_comment_file(path, warning_lines, by_check, wall_time, repo_root):
+    """
+    Write the Markdown summary to a file for posting as a PR comment.
+
+    Includes a hidden HTML marker so the workflow can find and update an
+    existing comment instead of creating duplicates on re-runs.
+    """
+    md = build_markdown_summary(warning_lines, by_check, wall_time, repo_root)
+    with open(path, "w") as f:
+        f.write(COMMENT_MARKER + "\n" + md + "\n")
+    log(f"  Comment body written to {path}")
+
+
+def filter_to_changed_lines(raw_output, changed_lines, repo_root):
+    """
+    Parse clang-tidy output and keep only diagnostics whose location falls on
+    a changed line.  Also keeps note/context lines that follow a kept warning.
+    """
+    prefix = repo_root.rstrip("/") + "/"
+    results = []
+    keep_current = False
+
+    for line in raw_output.split("\n"):
+        m = re.match(r"^(.*?):(\d+):\d+: (warning|error): (.+)", line)
+        if m:
+            filepath_abs = m.group(1)
+            lineno = int(m.group(2))
+
+            rel_path = filepath_abs
+            if filepath_abs.startswith(prefix):
+                rel_path = filepath_abs[len(prefix):]
+
+            if rel_path in changed_lines and lineno in changed_lines[rel_path]:
+                keep_current = True
+                results.append(line)
+            else:
+                keep_current = False
+            continue
+
+        if keep_current:
+            if line.strip():
+                results.append(line)
+            else:
+                keep_current = False
+
+    return results
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run clang-tidy on locally changed code, filtered to changed lines.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+    parser.add_argument(
+        "--clang-tidy-binary",
+        default="clang-tidy",
+        help="Path to clang-tidy binary (default: %(default)s)",
+    )
+    parser.add_argument(
+        "-p", "--compile-commands-dir",
+        default=None,
+        help="Directory containing compile_commands.json (default: repo root)",
+    )
+    parser.add_argument(
+        "-j", "--jobs",
+        type=int, default=None,
+        help="Number of parallel clang-tidy jobs (default: CPU count)",
+    )
+    parser.add_argument(
+        "--diff-base",
+        default=None,
+        metavar="REF",
+        help=(
+            "Explicit git ref to diff against (e.g. HEAD~1, a commit SHA, or a "
+            "branch name). When set, only the committed diff from REF to HEAD is "
+            "analyzed (working-tree changes are ignored). This is useful in CI "
+            "where the checkout is a merge commit: --diff-base HEAD~1 gives "
+            "exactly the PR's changes. When omitted, the base is auto-detected "
+            "from the remote tracking branch."
+        ),
+    )
+    parser.add_argument(
+        "-o", "--output",
+        default=None,
+        help="Write full (unfiltered) clang-tidy output to this file",
+    )
+    parser.add_argument(
+        "-v", "--verbose",
+        action="store_true",
+        help="Print all clang-tidy output, not just warnings on changed lines",
+    )
+    parser.add_argument(
+        "--github-annotations",
+        action="store_true",
+        help=(
+            "Emit GitHub Actions workflow commands (::warning) so that "
+            "clang-tidy findings appear as inline annotations on the PR's "
+            "\"Files changed\" tab.  Note: GitHub caps this at 10 warnings "
+            "and 10 errors per step."
+        ),
+    )
+    parser.add_argument(
+        "--github-step-summary",
+        action="store_true",
+        help=(
+            "Write a Markdown summary of all findings to $GITHUB_STEP_SUMMARY. "
+            "This appears on the job's summary page with no size limit, "
+            "complementing the capped inline annotations."
+        ),
+    )
+    parser.add_argument(
+        "--comment-output",
+        default=None,
+        metavar="FILE",
+        help=(
+            "Write a Markdown summary to FILE for posting as a PR comment. "
+            "Includes a hidden marker so the CI workflow can find and update "
+            "an existing comment instead of creating duplicates on re-runs."
+        ),
+    )
+    args = parser.parse_args()
+
+    repo_root = get_repo_root()
+    compile_db_dir = args.compile_commands_dir or repo_root
+    compile_db_path = os.path.join(compile_db_dir, "compile_commands.json")
+    jobs = args.jobs or os.cpu_count() or 4
+
+    # ------------------------------------------------------------------
+    # Step 1 — detect changes
+    # ------------------------------------------------------------------
+    log("=" * 70)
+    log("Step 1: Detecting changes")
+    log("=" * 70)
+    changed_lines = collect_changed_lines(repo_root, args.diff_base)
+
+    if not changed_lines:
+        log("\nNo changes detected. Nothing to check.")
+        if args.comment_output:
+            write_comment_file(args.comment_output, [], {}, 0, repo_root)
+        return 0
+
+    total_lines = sum(len(v) for v in changed_lines.values())
+    log(f"\n  {len(changed_lines)} file(s) changed, {total_lines} line(s) total:")
+    for f in sorted(changed_lines):
+        log(f"    {f}  ({len(changed_lines[f])} lines)")
+
+    # ------------------------------------------------------------------
+    # Step 2 — select compilable files present in compile_commands.json
+    # ------------------------------------------------------------------
+    db_files = load_compile_db(compile_db_path, repo_root)
+    cc_changed = sorted(
+        f for f in changed_lines
+        if re.search(r"\.(cc|cpp)$", f)
+        and (f in db_files or os.path.join(repo_root, f) in db_files)
+    )
+
+    if not cc_changed:
+        log("\nNo compilable changed files found in compile_commands.json.")
+        if args.comment_output:
+            write_comment_file(args.comment_output, [], {}, 0, repo_root)
+        return 0
+
+    log(f"\n{'=' * 70}")
+    log(f"Step 2: Running clang-tidy on {len(cc_changed)} file(s)  [jobs={jobs}]")
+    log("=" * 70)
+
+    # ------------------------------------------------------------------
+    # Step 3 — run clang-tidy in parallel via ThreadPoolExecutor
+    # ------------------------------------------------------------------
+    all_raw_output = []
+    all_filtered = []
+    t0 = time.time()
+
+    with ThreadPoolExecutor(max_workers=jobs) as pool:
+        futures = {
+            pool.submit(
+                invoke_clang_tidy,
+                args.clang_tidy_binary,
+                compile_db_dir,
+                f,
+                repo_root,
+            ): f
+            for f in cc_changed
+        }
+
+        done = 0
+        for future in as_completed(futures):
+            done += 1
+            fpath = futures[future]
+            fpath, output, rc = future.result()
+            all_raw_output.append(output)
+
+            filtered = filter_to_changed_lines(output, changed_lines, repo_root)
+            all_filtered.extend(filtered)
+
+            n_diags = sum(
+                1 for l in filtered if re.search(r": (warning|error):", l)
+            )
+            elapsed = time.time() - t0
+            if rc == 0:
+                status = "clean"
+            elif rc == -1:
+                status = "TIMEOUT"
+            else:
+                status = f"{n_diags} on changed lines"
+            log(
+                f"  [{done:>{len(str(len(cc_changed)))}}/{len(cc_changed)}]"
+                f" {elapsed:6.1f}s  {fpath}  ({status})"
+            )
+
+    wall_time = time.time() - t0
+
+    # ------------------------------------------------------------------
+    # Optional: save full output
+    # ------------------------------------------------------------------
+    if args.output:
+        with open(args.output, "w") as f:
+            f.write("\n".join(all_raw_output))
+        log(f"\nFull clang-tidy output saved to {args.output}")
+
+    # ------------------------------------------------------------------
+    # Step 4 — report filtered results
+    # ------------------------------------------------------------------
+    log(f"\n{'=' * 70}")
+    log(f"Step 3: Results  (wall time {wall_time:.1f}s)")
+    log("=" * 70)
+
+    if args.verbose:
+        log("\n--- Full output ---")
+        for chunk in all_raw_output:
+            log(chunk)
+        log("--- End full output ---\n")
+
+    diagnostic_lines = [
+        l for l in all_filtered if re.search(r": (warning|error):", l)
+    ]
+    if not diagnostic_lines:
+        log("\nNo findings on changed lines. Clean!")
+        if args.github_step_summary:
+            write_github_step_summary([], {}, wall_time, repo_root)
+        if args.comment_output:
+            write_comment_file(args.comment_output, [], {}, wall_time, repo_root)
+        return 0
+
+    error_lines = [l for l in diagnostic_lines if re.search(r": error:", l)]
+    warning_lines = [l for l in diagnostic_lines if re.search(r": warning:", l)]
+
+    by_check = {}
+    for line in diagnostic_lines:
+        m = re.search(r"\[([\w.-]+)\]\s*$", line)
+        check = m.group(1) if m else "unknown"
+        by_check.setdefault(check, []).append(line)
+
+    parts = []
+    if error_lines:
+        parts.append(f"{len(error_lines)} error(s)")
+    if warning_lines:
+        parts.append(f"{len(warning_lines)} warning(s)")
+    log(f"\n{' and '.join(parts)} on changed lines:\n")
+    for line in all_filtered:
+        log(line)
+
+    if args.github_annotations:
+        log(f"\n{'=' * 70}")
+        log("Emitting GitHub Actions annotations")
+        log("=" * 70)
+        emit_github_annotations(all_filtered, repo_root)
+
+    if args.github_step_summary:
+        log(f"\n{'=' * 70}")
+        log("Writing GitHub step summary")
+        log("=" * 70)
+        write_github_step_summary(diagnostic_lines, by_check, wall_time, repo_root)
+
+    if args.comment_output:
+        log(f"\n{'=' * 70}")
+        log("Writing PR comment body")
+        log("=" * 70)
+        write_comment_file(
+            args.comment_output, diagnostic_lines, by_check, wall_time, repo_root
+        )
+
+    log(f"\n{'=' * 70}")
+    log("Summary by check:")
+    log("=" * 70)
+    for check in sorted(by_check):
+        log(f"  [{check}]  x{len(by_check[check])}")
+    summary_parts = []
+    if error_lines:
+        summary_parts.append(f"{len(error_lines)} error(s)")
+    if warning_lines:
+        summary_parts.append(f"{len(warning_lines)} warning(s)")
+    log(f"\n  Total: {' and '.join(summary_parts)}")
+
+    return 1 if error_lines else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc
index c513a23bc93d..9480ffb30f2a 100644
--- a/tools/sst_dump_test.cc
+++ b/tools/sst_dump_test.cc
@@ -15,6 +15,7 @@
 #include "rocksdb/convenience.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/sst_dump_tool.h"
+#include "rocksdb/utilities/object_registry.h"
 #include "table/block_based/block_based_table_factory.h"
 #include "table/sst_file_dumper.h"
 #include "table/table_builder.h"
@@ -97,6 +98,26 @@ class SSTDumpToolTest : public testing::Test {
     return path;
   }
 
+  // RAII class to ensure cleanup of usage array
+  template <std::size_t N>
+  struct CleanupUsage {
+    char* (&usage)[N];
+    explicit CleanupUsage(char* (&_usage)[N]) : usage(_usage) {}
+    // No copies/moves
+    CleanupUsage(const CleanupUsage&) = delete;
+    CleanupUsage& operator=(const CleanupUsage&) = delete;
+    CleanupUsage(CleanupUsage&&) = delete;
+    CleanupUsage& operator=(CleanupUsage&&) = delete;
+    ~CleanupUsage() {
+      for (std::size_t i = 0; i < N; ++i) {
+        delete[] usage[i];
+      }
+    }
+  };
+
+#define ASSERT_TOOL_PASS(tool_expr) ASSERT_EQ(0, (tool_expr));
+#define ASSERT_TOOL_FAIL(tool_expr) ASSERT_NE(0, (tool_expr));
+
   template <std::size_t N>
   void PopulateCommandArgs(const std::string& file_path, const char* command,
                            char* (&usage)[N]) const {
@@ -174,9 +195,29 @@ class SSTDumpToolTest : public testing::Test {
 
  protected:
   constexpr static int kNumKey = 1024;
-};
 
-constexpr int SSTDumpToolTest::kNumKey;
+  void SSTDumpToolTestCase(Options& opts, bool filter, int wide_column_one_in,
+                           const char* cmd_arg) {
+    opts.env = env();
+    BlockBasedTableOptions table_opts;
+    if (filter) {
+      table_opts.filter_policy.reset(
+          ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+    }
+    opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+    std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+    createSST(opts, file_path, wide_column_one_in);
+
+    char* usage[3];
+    auto cleanup_usage = CleanupUsage{usage};
+    PopulateCommandArgs(file_path, cmd_arg, usage);
+
+    ROCKSDB_NAMESPACE::SSTDumpTool tool;
+    ASSERT_TOOL_PASS(tool.Run(3, usage, opts));
+
+    cleanup(opts, file_path);
+  }
+};
 
 TEST_F(SSTDumpToolTest, HelpAndVersion) {
   Options opts;
@@ -185,146 +226,111 @@ TEST_F(SSTDumpToolTest, HelpAndVersion) {
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
 
   static const char* help[] = {"./sst_dump", "--help"};
-  ASSERT_TRUE(!tool.Run(2, help, opts));
+  ASSERT_TOOL_PASS(tool.Run(2, help, opts));
+  static const char* bad_help[] = {"./sst_dump", "--", "--help"};
+  ASSERT_TOOL_FAIL(tool.Run(3, bad_help, opts));
   static const char* version[] = {"./sst_dump", "--version"};
-  ASSERT_TRUE(!tool.Run(2, version, opts));
+  ASSERT_TOOL_PASS(tool.Run(2, version, opts));
   static const char* bad[] = {"./sst_dump", "--not_an_option"};
-  ASSERT_TRUE(tool.Run(2, bad, opts));
+  ASSERT_TOOL_FAIL(tool.Run(2, bad, opts));
 }
 
 TEST_F(SSTDumpToolTest, EmptyFilter) {
   Options opts;
-  opts.env = env();
-  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
-  createSST(opts, file_path, 10);
-
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--command=raw", usage);
-
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
-
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
+  SSTDumpToolTestCase(opts, /*filter=*/false, /*wide_column_one_in=*/10,
+                      "--command=raw");
 }
 
 TEST_F(SSTDumpToolTest, SstDumpReverseBytewiseComparator) {
   Options opts;
-  opts.env = env();
   opts.comparator = ReverseBytewiseComparator();
-  BlockBasedTableOptions table_opts;
-  table_opts.filter_policy.reset(
-      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
-  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
-  std::string file_path =
-      MakeFilePath("rocksdb_sst_reverse_bytewise_comparator.sst");
-  createSST(opts, file_path);
-
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--command=raw", usage);
-
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
-
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/10,
+                      "--command=raw");
 }
 
 TEST_F(SSTDumpToolTest, SstDumpComparatorWithU64Ts) {
   Options opts;
-  opts.env = env();
   opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
-  BlockBasedTableOptions table_opts;
-  table_opts.filter_policy.reset(
-      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
-  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
-  std::string file_path =
-      MakeFilePath("rocksdb_sst_comparator_with_u64_ts.sst");
-  createSST(opts, file_path, 10);
-
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--command=raw", usage);
-
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/10,
+                      "--command=raw");
+}
 
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
+TEST_F(SSTDumpToolTest, FilterBlockWideColumn) {
+  Options opts;
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/10,
+                      "--command=raw");
 }
 
 TEST_F(SSTDumpToolTest, FilterBlock) {
   Options opts;
-  opts.env = env();
-  BlockBasedTableOptions table_opts;
-  table_opts.filter_policy.reset(
-      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, true));
-  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
-  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
-  createSST(opts, file_path, 10);
-
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--command=raw", usage);
-
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/0,
+                      "--command=raw");
+}
 
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
+TEST_F(SSTDumpToolTest, GetProperties) {
+  Options opts;
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/0,
+                      "--show_properties");
 }
 
-TEST_F(SSTDumpToolTest, FullFilterBlock) {
+TEST_F(SSTDumpToolTest, CompressedSizes) {
   Options opts;
-  opts.env = env();
-  BlockBasedTableOptions table_opts;
-  table_opts.filter_policy.reset(
-      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
-  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
-  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
-  createSST(opts, file_path);
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/10,
+                      "--command=recompress");
+}
 
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--command=raw", usage);
+TEST_F(SSTDumpToolTest, ListMetaBlocks) {
+  Options opts;
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/0,
+                      "--list_meta_blocks");
+}
 
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
+namespace {
+using Compressor8A = test::CompressorCustomAlg<kCustomCompression8A>;
+class MyManager : public CompressionManager {
+ public:
+  static constexpr const char* kCompatibilityName = "SSTDumpToolTest:MyManager";
+  const char* Name() const override { return kCompatibilityName; }
+  const char* CompatibilityName() const override { return kCompatibilityName; }
 
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
+  bool SupportsCompressionType(CompressionType type) const override {
+    return type == kCustomCompression8A;
   }
-}
-
-TEST_F(SSTDumpToolTest, GetProperties) {
-  Options opts;
-  opts.env = env();
-  BlockBasedTableOptions table_opts;
-  table_opts.filter_policy.reset(
-      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
-  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
-  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
-  createSST(opts, file_path);
 
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--show_properties", usage);
+  std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& /*opts*/,
+                                            CompressionType type) override {
+    switch (static_cast<unsigned char>(type)) {
+      case kCustomCompression8A:
+        return std::make_unique<Compressor8A>();
+      default:
+        return nullptr;
+    }
+  }
 
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
+  std::shared_ptr<Decompressor> GetDecompressor() override {
+    return std::make_shared<test::DecompressorCustomAlg>();
+  }
+};
+}  // namespace
 
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
+TEST_F(SSTDumpToolTest, CompressionManager) {
+  if (!Compressor8A::Supported()) {
+    fprintf(stderr,
+            "Prerequisite compression library not supported. Skipping\n");
+    return;
   }
-}
 
-TEST_F(SSTDumpToolTest, CompressedSizes) {
+  // Registery in ObjectLibrary to check that sst_dump can use named
+  // CompressionManagers with dependency injection
+  auto& library = *ObjectLibrary::Default();
+  library.AddFactory<CompressionManager>(
+      MyManager::kCompatibilityName,
+      [](const std::string& /*uri*/, std::unique_ptr<CompressionManager>* guard,
+         std::string* /*errmsg*/) {
+        *guard = std::make_unique<MyManager>();
+        return guard->get();
+      });
+
   Options opts;
   opts.env = env();
   BlockBasedTableOptions table_opts;
@@ -334,16 +340,17 @@ TEST_F(SSTDumpToolTest, CompressedSizes) {
   std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
   createSST(opts, file_path, 10);
 
-  char* usage[3];
+  char* usage[5];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs(file_path, "--command=recompress", usage);
+  snprintf(usage[3], kOptLength, "--compression_manager=%s",
+           MyManager::kCompatibilityName);
+  snprintf(usage[4], kOptLength, "--compression_types=kCustomCompression8A");
 
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
+  ASSERT_TOOL_PASS(tool.Run(5, usage, opts));
 
   cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
 }
 
 TEST_F(SSTDumpToolTest, MemEnv) {
@@ -354,15 +361,13 @@ TEST_F(SSTDumpToolTest, MemEnv) {
   createSST(opts, file_path);
 
   char* usage[3];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs(file_path, "--command=verify_checksum", usage);
 
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
+  ASSERT_TOOL_PASS(tool.Run(3, usage, opts));
 
   cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
 }
 
 TEST_F(SSTDumpToolTest, ReadaheadSize) {
@@ -372,6 +377,7 @@ TEST_F(SSTDumpToolTest, ReadaheadSize) {
   createSST(opts, file_path);
 
   char* usage[4];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs(file_path, "--command=verify", usage);
   snprintf(usage[3], kOptLength, "--readahead_size=4000000");
 
@@ -381,27 +387,26 @@ TEST_F(SSTDumpToolTest, ReadaheadSize) {
   SyncPoint::GetInstance()->EnableProcessing();
 
   SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(4, usage, opts));
+  ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
 
   // The file is approximately 10MB. Readahead is 4MB.
   // We usually need 3 reads + one metadata read.
-  // One extra read is needed before opening the file for metadata.
-  ASSERT_EQ(5, num_reads);
+  // Three extra read is needed before opening the file for metadata.
+  ASSERT_EQ(7, num_reads);
 
   SyncPoint::GetInstance()->ClearAllCallBacks();
   SyncPoint::GetInstance()->DisableProcessing();
 
   cleanup(opts, file_path);
-  for (int i = 0; i < 4; i++) {
-    delete[] usage[i];
-  }
 }
 
+#ifndef __clang_analyzer__  // False positive memory leaks reported
 TEST_F(SSTDumpToolTest, NoSstFile) {
   Options opts;
   opts.env = env();
   std::string file_path = MakeFilePath("no_such_file.sst");
   char* usage[3];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs(file_path, "", usage);
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
   for (const auto& command :
@@ -409,17 +414,15 @@ TEST_F(SSTDumpToolTest, NoSstFile) {
         "--command=verify", "--command=recompress", "--command=verify_checksum",
         "--show_properties"}) {
     snprintf(usage[1], kOptLength, "%s", command);
-    ASSERT_TRUE(tool.Run(3, usage, opts));
-  }
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
+    ASSERT_TOOL_FAIL(tool.Run(3, usage, opts));
   }
 }
 
 TEST_F(SSTDumpToolTest, ValidSSTPath) {
   Options opts;
   opts.env = env();
-  char* usage[3];
+  char* usage[5];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs("", "", usage);
   SSTDumpTool tool;
   std::string file_not_exists = MakeFilePath("file_not_exists.sst");
@@ -429,30 +432,68 @@ TEST_F(SSTDumpToolTest, ValidSSTPath) {
   ASSERT_OK(WriteStringToFile(opts.env, "Hello World!", text_file, false));
   std::string fake_sst = MakeFilePath("fake_sst.sst");
   ASSERT_OK(WriteStringToFile(opts.env, "Not an SST file!", fake_sst, false));
+  std::string good_dir = MakeFilePath("");
 
   for (const auto& command_arg : {"--command=verify", "--command=identify"}) {
     snprintf(usage[1], kOptLength, "%s", command_arg);
 
-    snprintf(usage[2], kOptLength, "--file=%s", file_not_exists.c_str());
-    ASSERT_TRUE(tool.Run(3, usage, opts));
+    // Test both classic --file and standalone argument
+    for (const auto& file_fmt : {"--file=%s", "%s"}) {
+      snprintf(usage[2], kOptLength, file_fmt, file_not_exists.c_str());
+      ASSERT_TOOL_FAIL(tool.Run(3, usage, opts));
+
+      snprintf(usage[2], kOptLength, file_fmt, sst_file.c_str());
+      ASSERT_TOOL_PASS(tool.Run(3, usage, opts));
+
+      snprintf(usage[2], kOptLength, file_fmt, good_dir.c_str());
+      ASSERT_TOOL_PASS(tool.Run(3, usage, opts));
+
+      snprintf(usage[2], kOptLength, file_fmt, text_file.c_str());
+      ASSERT_TOOL_FAIL(tool.Run(3, usage, opts));
+
+      snprintf(usage[2], kOptLength, file_fmt, fake_sst.c_str());
+      ASSERT_TOOL_FAIL(tool.Run(3, usage, opts));
+    }
+
+    // If one file is valid, that's enough to succeed as long as the others
+    // exist
+    for (const auto& good : {sst_file, good_dir}) {
+      // Additional file-or-dir argument
+      snprintf(usage[3], kOptLength, "%s", good.c_str());
+
+      snprintf(usage[2], kOptLength, "%s", file_not_exists.c_str());
+      ASSERT_TOOL_FAIL(tool.Run(4, usage, opts));
 
-    snprintf(usage[2], kOptLength, "--file=%s", sst_file.c_str());
-    ASSERT_TRUE(!tool.Run(3, usage, opts));
+      snprintf(usage[2], kOptLength, "%s", sst_file.c_str());
+      ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
 
-    snprintf(usage[2], kOptLength, "--file=%s", text_file.c_str());
-    ASSERT_TRUE(tool.Run(3, usage, opts));
+      snprintf(usage[2], kOptLength, "%s", good_dir.c_str());
+      ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
 
-    snprintf(usage[2], kOptLength, "--file=%s", fake_sst.c_str());
-    ASSERT_TRUE(tool.Run(3, usage, opts));
+      snprintf(usage[2], kOptLength, "%s", text_file.c_str());
+      // DIFFERENT
+      ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
+
+      snprintf(usage[2], kOptLength, "%s", fake_sst.c_str());
+      // DIFFERENT
+      ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
+
+      // Some extra cases to test "--" handling
+      snprintf(usage[2], kOptLength, "%s", "--");
+      ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
+
+      snprintf(usage[4], kOptLength, "%s", file_not_exists.c_str());
+      ASSERT_TOOL_FAIL(tool.Run(5, usage, opts));
+
+      snprintf(usage[4], kOptLength, "%s", fake_sst.c_str());
+      ASSERT_TOOL_PASS(tool.Run(5, usage, opts));
+    }
   }
   ASSERT_OK(opts.env->DeleteFile(sst_file));
   ASSERT_OK(opts.env->DeleteFile(text_file));
   ASSERT_OK(opts.env->DeleteFile(fake_sst));
-
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
 }
+#endif  // __clang_analyzer__
 
 TEST_F(SSTDumpToolTest, RawOutput) {
   Options opts;
@@ -461,10 +502,11 @@ TEST_F(SSTDumpToolTest, RawOutput) {
   createSST(opts, file_path, 10);
 
   char* usage[3];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs(file_path, "--command=raw", usage);
 
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
+  ASSERT_TOOL_PASS(tool.Run(3, usage, opts));
 
   const std::string raw_path = MakeFilePath("rocksdb_sst_test_dump.txt");
   std::ifstream raw_file(raw_path);
@@ -487,9 +529,6 @@ TEST_F(SSTDumpToolTest, RawOutput) {
   raw_file.close();
 
   cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
 }
 
 TEST_F(SSTDumpToolTest, SstFileDumperMmapReads) {
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index d7d784c54689..c155fa01b1e0 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -8,26 +8,19 @@
 
 #include <cinttypes>
 #include <iostream>
+#include <regex>
 
+#include "db_stress_tool/db_stress_compression_manager.h"
 #include "options/options_helper.h"
 #include "port/port.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/utilities/ldb_cmd.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
 #include "table/sst_file_dumper.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-static const std::vector<std::pair<CompressionType, const char*>>
-    kCompressions = {
-        {CompressionType::kNoCompression, "kNoCompression"},
-        {CompressionType::kSnappyCompression, "kSnappyCompression"},
-        {CompressionType::kZlibCompression, "kZlibCompression"},
-        {CompressionType::kBZip2Compression, "kBZip2Compression"},
-        {CompressionType::kLZ4Compression, "kLZ4Compression"},
-        {CompressionType::kLZ4HCCompression, "kLZ4HCCompression"},
-        {CompressionType::kXpressCompression, "kXpressCompression"},
-        {CompressionType::kZSTD, "kZSTD"}};
-
 namespace {
 
 void print_help(bool to_stderr) {
@@ -43,9 +36,9 @@ void print_help(bool to_stderr) {
   }
   fprintf(
       to_stderr ? stderr : stdout,
-      R"(sst_dump --file=<data_dir_OR_sst_file> [--command=check|scan|raw|recompress|identify]
-    --file=<data_dir_OR_sst_file>
-      Path to SST file or directory containing SST files
+      R"(sst_dump <db_dirs_OR_sst_files...> [--command=check|scan|raw|recompress|identify]
+    --file=<db_dir_OR_sst_file>
+      Path to SST file or directory containing SST files (old option syntax)
 
     --env_uri=<uri of underlying Env>
       URI of underlying Env, mutually exclusive with fs_uri
@@ -70,6 +63,9 @@ void print_help(bool to_stderr) {
     --decode_blob_index
       Decode blob indexes and print them in a human-readable format during scans.
 
+    --show_sequence_number_type
+      Show sequence number and value type when executing raw command
+
     --from=<user_key>
       Key to start reading from when executing check|scan
 
@@ -93,20 +89,32 @@ void print_help(bool to_stderr) {
       Print table properties after iterating over the file when executing
       check|scan|raw|identify
 
-    --set_block_size=<block_size>
+    --block_size=<block_size>
       Can be combined with --command=recompress to set the block size that will
       be used when trying different compression algorithms
 
     --compression_types=<comma-separated list of CompressionType members, e.g.,
-      kSnappyCompression>
+      kSnappyCompression or kCustomCompressionC4>
       Can be combined with --command=recompress to run recompression for this
       list of compression types
-      Supported compression types: %s
+      Supported built-in compression types: %s
+
+    --compression_manager=<compression manager string>
+      Used with --command=recompress to specify a compression manager to use
+      instead of the built-in compression manager, which may support a
+      different set of compression types.
+
+    --enable_index_compression=<bool>
+      Used with --command=recompress to specify whether to compress index
+      blocks (in addition to data blocks).
 
     --parse_internal_key=<0xKEY>
       Convenience option to parse an internal key on the command line. Dumps the
       internal key in hex format {'key' @ SN: type}
 
+    --compression_level=<compression_level>
+      Sets both --compression_level_from= and --compression_level_to=
+
     --compression_level_from=<compression_level>
       Compression level to start compressing when executing recompress. One compression type
       and compression_level_to must also be specified
@@ -115,17 +123,23 @@ void print_help(bool to_stderr) {
       Compression level to stop compressing when executing recompress. One compression type
       and compression_level_from must also be specified
 
+    --compression_max_dict_buffer_bytes=<int64_t>
+      Limit on buffer size from which we collect samples for dictionary generation.
+
     --compression_max_dict_bytes=<uint32_t>
       Maximum size of dictionary used to prime the compression library
 
-    --compression_zstd_max_train_bytes=<uint32_t>
-      Maximum size of training data passed to zstd's dictionary trainer
-
-    --compression_max_dict_buffer_bytes=<int64_t>
-      Limit on buffer size from which we collect samples for dictionary generation.
+    --compression_parallel_threads=<uint32_t>
+      Number of parallel threads to use with --command=recompress
 
     --compression_use_zstd_finalize_dict
       Use zstd's finalizeDictionary() API instead of zstd's dictionary trainer to generate dictionary.
+
+    --compression_zstd_max_train_bytes=<uint32_t>
+      Maximum size of training data passed to zstd's dictionary trainer
+
+    --list_meta_blocks
+      Print the list of all meta blocks in the file
 )",
       supported_compressions.c_str());
 }
@@ -152,7 +166,12 @@ bool ParseIntArg(const char* arg, const std::string arg_name,
 
 int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   std::string env_uri, fs_uri;
-  const char* dir_or_file = nullptr;
+  enum DirVsFile {
+    kUnknownDirVsFile,
+    kDir,
+    kFile,
+  };
+  std::vector<std::pair<const char*, DirVsFile>> dirs_or_files;
   uint64_t read_num = std::numeric_limits<uint64_t>::max();
   std::string command;
 
@@ -161,24 +180,32 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   bool verify_checksum = false;
   bool output_hex = false;
   bool decode_blob_index = false;
+  bool show_sequence_number_type = false;
   bool input_key_hex = false;
   bool has_from = false;
   bool has_to = false;
   bool use_from_as_prefix = false;
   bool show_properties = false;
   bool show_summary = false;
-  bool set_block_size = false;
+  bool list_meta_blocks = false;
   bool has_compression_level_from = false;
   bool has_compression_level_to = false;
-  bool has_specified_compression_types = false;
   std::string from_key;
   std::string to_key;
   std::string block_size_str;
   std::string compression_level_from_str;
   std::string compression_level_to_str;
-  size_t block_size = 0;
+  size_t block_size = 16384;  // A popular choice for default
   size_t readahead_size = 2 * 1024 * 1024;
-  std::vector<std::pair<CompressionType, const char*>> compression_types;
+  // These two options are intentionally secret options because they are
+  // niche ways to select files to get the "recompress" treatment. And even
+  // if std::regex is flawed, it should be good enough for these niche uses.
+  std::unique_ptr<std::regex> require_property_regex;
+  std::unique_ptr<std::regex> exclude_property_regex;
+  std::vector<CompressionType> compression_types;
+  std::shared_ptr<CompressionManager> compression_manager;
+  bool enable_index_compression =
+      BlockBasedTableOptions{}.enable_index_compression;
   uint64_t total_num_files = 0;
   uint64_t total_num_data_blocks = 0;
   uint64_t total_data_block_size = 0;
@@ -194,20 +221,26 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes;
   bool compression_use_zstd_finalize_dict =
       !ROCKSDB_NAMESPACE::CompressionOptions().use_zstd_dict_trainer;
+  uint32_t compression_parallel_threads = 1;
 
   int64_t tmp_val;
 
+  TEST_AllowUnsupportedFormatVersion() = true;
+  DbStressCustomCompressionManager::Register();
+
   for (int i = 1; i < argc; i++) {
     if (strncmp(argv[i], "--env_uri=", 10) == 0) {
       env_uri = argv[i] + 10;
     } else if (strncmp(argv[i], "--fs_uri=", 9) == 0) {
       fs_uri = argv[i] + 9;
     } else if (strncmp(argv[i], "--file=", 7) == 0) {
-      dir_or_file = argv[i] + 7;
+      dirs_or_files.emplace_back(argv[i] + 7, kUnknownDirVsFile);
     } else if (strcmp(argv[i], "--output_hex") == 0) {
       output_hex = true;
     } else if (strcmp(argv[i], "--decode_blob_index") == 0) {
       decode_blob_index = true;
+    } else if (strcmp(argv[i], "--show_sequence_number_type") == 0) {
+      show_sequence_number_type = true;
     } else if (strcmp(argv[i], "--input_key_hex") == 0) {
       input_key_hex = true;
     } else if (sscanf(argv[i], "--read_num=%lu%c", (unsigned long*)&n, &junk) ==
@@ -231,8 +264,9 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
     } else if (strcmp(argv[i], "--show_summary") == 0) {
       show_summary = true;
     } else if (ParseIntArg(argv[i], "--set_block_size=",
+                           "block size must be numeric", &tmp_val) ||
+               ParseIntArg(argv[i], "--block_size=",
                            "block size must be numeric", &tmp_val)) {
-      set_block_size = true;
       block_size = static_cast<size_t>(tmp_val);
     } else if (ParseIntArg(argv[i], "--readahead_size=",
                            "readahead_size must be numeric", &tmp_val)) {
@@ -241,19 +275,46 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       std::string compression_types_csv = argv[i] + 20;
       std::istringstream iss(compression_types_csv);
       std::string compression_type;
-      has_specified_compression_types = true;
+
       while (std::getline(iss, compression_type, ',')) {
-        auto iter = std::find_if(
-            kCompressions.begin(), kCompressions.end(),
-            [&compression_type](std::pair<CompressionType, const char*> curr) {
-              return curr.second == compression_type;
-            });
-        if (iter == kCompressions.end()) {
+        auto iter =
+            OptionsHelper::compression_type_string_map.find(compression_type);
+        if (iter == OptionsHelper::compression_type_string_map.end()) {
           fprintf(stderr, "%s is not a valid CompressionType\n",
                   compression_type.c_str());
           exit(1);
         }
-        compression_types.emplace_back(*iter);
+        compression_types.emplace_back(iter->second);
+      }
+    } else if (strncmp(argv[i], "--require_property_regex=", 25) == 0) {
+      require_property_regex = std::make_unique<std::regex>(
+          argv[i] + 25, std::regex_constants::egrep);
+    } else if (strncmp(argv[i], "--exclude_property_regex=", 25) == 0) {
+      exclude_property_regex = std::make_unique<std::regex>(
+          argv[i] + 25, std::regex_constants::egrep);
+    } else if (strncmp(argv[i], "--compression_manager=", 22) == 0) {
+      std::string compression_manager_str = argv[i] + 22;
+      ConfigOptions config_options;
+      config_options.ignore_unsupported_options = false;
+      Status s = CompressionManager::CreateFromString(
+          config_options, compression_manager_str, &compression_manager);
+      if (!s.ok()) {
+        fprintf(stderr, "Failed to create compression manager: %s\n",
+                s.ToString().c_str());
+        exit(1);
+      }
+      if (compression_manager == nullptr) {
+        fprintf(stderr, "No compression manager created: %s\n",
+                compression_manager_str.c_str());
+        exit(1);
+      }
+      options.compression_manager = compression_manager;
+      printf("Using compression manager: %s\n",
+             compression_manager->GetId().c_str());
+    } else if (strncmp(argv[i], "--enable_index_compression=", 27) == 0) {
+      if (strlen(argv[i]) > 27) {
+        enable_index_compression =
+            argv[i][27] == '1' || argv[i][27] == 't' || argv[i][27] == 'T';
       }
     } else if (strncmp(argv[i], "--parse_internal_key=", 21) == 0) {
       std::string in_key(argv[i] + 21);
@@ -276,6 +337,12 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       }
       fprintf(stdout, "key=%s\n", ikey.DebugString(true, true).c_str());
       return retc;
+    } else if (ParseIntArg(argv[i], "--compression_level=",
+                           "compression_level must be numeric", &tmp_val)) {
+      has_compression_level_from = true;
+      has_compression_level_to = true;
+      compress_level_from = static_cast<int>(tmp_val);
+      compress_level_to = static_cast<int>(tmp_val);
     } else if (ParseIntArg(argv[i], "--compression_level_from=",
                            "compression_level_from must be numeric",
                            &tmp_val)) {
@@ -295,6 +362,16 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
         return 1;
       }
       compression_max_dict_bytes = static_cast<uint32_t>(tmp_val);
+    } else if (ParseIntArg(argv[i], "--compression_parallel_threads=",
+                           "compression_parallel_threads must be numeric",
+                           &tmp_val)) {
+      if (tmp_val < 0 || tmp_val > 100) {
+        fprintf(stderr, "compression_parallel_threads out of range: '%s'\n",
+                argv[i]);
+        print_help(/*to_stderr*/ true);
+        return 1;
+      }
+      compression_parallel_threads = static_cast<uint32_t>(tmp_val);
     } else if (ParseIntArg(argv[i], "--compression_zstd_max_train_bytes=",
                            "compression_zstd_max_train_bytes must be numeric",
                            &tmp_val)) {
@@ -319,25 +396,30 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       compression_max_dict_buffer_bytes = static_cast<uint64_t>(tmp_val);
     } else if (strcmp(argv[i], "--compression_use_zstd_finalize_dict") == 0) {
       compression_use_zstd_finalize_dict = true;
+    } else if (strcmp(argv[i], "--list_meta_blocks") == 0) {
+      list_meta_blocks = true;
     } else if (strcmp(argv[i], "--help") == 0) {
       print_help(/*to_stderr*/ false);
       return 0;
     } else if (strcmp(argv[i], "--version") == 0) {
       printf("%s\n", GetRocksBuildInfoAsString("sst_dump").c_str());
       return 0;
-    } else {
+    } else if (strcmp(argv[i], "--") == 0) {
+      // Remaining args are dir-or-file
+      for (++i; i < argc; ++i) {
+        dirs_or_files.emplace_back(argv[i], kUnknownDirVsFile);
+      }
+    } else if (argv[i][0] == '-') {
       fprintf(stderr, "Unrecognized argument '%s'\n\n", argv[i]);
       print_help(/*to_stderr*/ true);
       return 1;
+    } else {
+      // Dir-or-file arg
+      dirs_or_files.emplace_back(argv[i], kUnknownDirVsFile);
     }
   }
 
-  if (has_compression_level_from && has_compression_level_to) {
-    if (!has_specified_compression_types || compression_types.size() != 1) {
-      fprintf(stderr, "Specify one compression type.\n\n");
-      exit(1);
-    }
-  } else if (has_compression_level_from || has_compression_level_to) {
+  if (has_compression_level_from ^ has_compression_level_to) {
     fprintf(stderr,
             "Specify both --compression_level_from and "
             "--compression_level_to.\n\n");
@@ -358,7 +440,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
     }
   }
 
-  if (dir_or_file == nullptr) {
+  if (dirs_or_files.empty()) {
     fprintf(stderr, "file or directory must be specified.\n\n");
     print_help(/*to_stderr*/ true);
     exit(1);
@@ -384,26 +466,35 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
 
   std::vector<std::string> filenames;
   ROCKSDB_NAMESPACE::Env* env = options.env;
-  ROCKSDB_NAMESPACE::Status st = env->GetChildren(dir_or_file, &filenames);
-  bool dir = true;
-  if (!st.ok() || filenames.empty()) {
-    // dir_or_file does not exist or does not contain children
-    // Check its existence first
-    Status s = env->FileExists(dir_or_file);
-    // dir_or_file does not exist
-    if (!s.ok()) {
-      fprintf(stderr, "%s%s: No such file or directory\n", s.ToString().c_str(),
-              dir_or_file);
-      return 1;
+  ROCKSDB_NAMESPACE::Status st;
+
+  for (size_t i = 0; i < dirs_or_files.size(); ++i) {
+    auto dir_or_file = dirs_or_files[i].first;
+    std::vector<std::string> children;
+    st = env->GetChildren(dirs_or_files[i].first, &children);
+    if (!st.ok() || children.empty()) {
+      // dir_or_file does not exist or does not contain children
+      // Check its existence first
+      Status s = env->FileExists(dir_or_file);
+      // dir_or_file does not exist
+      if (!s.ok()) {
+        fprintf(stderr, "%s%s: No such file or directory\n",
+                s.ToString().c_str(), dir_or_file);
+        return 1;
+      }
+      // dir_or_file exists and is treated as a "file"
+      // since it has no children
+      // This is ok since later it will be checked
+      // that whether it is a valid sst or not
+      // (A directory "file" is not a valid sst)
+      filenames.emplace_back(dir_or_file);
+      dirs_or_files[i].second = kFile;
+    } else {
+      for (auto& child : children) {
+        filenames.push_back(std::string{dir_or_file} + "/" + child);
+      }
+      dirs_or_files[i].second = kDir;
     }
-    // dir_or_file exists and is treated as a "file"
-    // since it has no children
-    // This is ok since later it will be checked
-    // that whether it is a valid sst or not
-    // (A directory "file" is not a valid sst)
-    filenames.clear();
-    filenames.emplace_back(dir_or_file);
-    dir = false;
   }
 
   uint64_t total_read = 0;
@@ -417,48 +508,97 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       continue;
     }
 
-    if (dir) {
-      filename = std::string(dir_or_file) + "/" + filename;
-    }
-
     if (command == "verify") {
       verify_checksum = true;
     }
 
+    // Update options for when simulating writing a table file
+    {
+      BlockBasedTableOptions bbto;
+      if (options.table_factory->IsInstanceOf(
+              TableFactory::kBlockBasedTableName()) &&
+          options.table_factory->GetOptions<BlockBasedTableOptions>()) {
+        bbto = *options.table_factory->GetOptions<BlockBasedTableOptions>();
+      }
+      bbto.block_size = block_size;
+      bbto.enable_index_compression = enable_index_compression;
+      // Maximize compression features available
+      bbto.format_version = kLatestBbtFormatVersion;
+      options.table_factory = std::make_shared<BlockBasedTableFactory>(bbto);
+    }
+    options.compression_opts.max_dict_bytes = compression_max_dict_bytes;
+    options.compression_opts.zstd_max_train_bytes =
+        compression_zstd_max_train_bytes;
+    options.compression_opts.max_dict_buffer_bytes =
+        compression_max_dict_buffer_bytes;
+    options.compression_opts.use_zstd_dict_trainer =
+        !compression_use_zstd_finalize_dict;
+    options.compression_opts.parallel_threads = compression_parallel_threads;
+
     ROCKSDB_NAMESPACE::SstFileDumper dumper(
         options, filename, Temperature::kUnknown, readahead_size,
-        verify_checksum, output_hex, decode_blob_index);
+        verify_checksum, output_hex, decode_blob_index, EnvOptions(), false,
+        show_sequence_number_type);
+
     // Not a valid SST
     if (!dumper.getStatus().ok()) {
       fprintf(stderr, "%s: %s\n", filename.c_str(),
               dumper.getStatus().ToString().c_str());
       continue;
-    } else {
-      valid_sst_files.push_back(filename);
-      // Print out from and to key information once
-      // where there is at least one valid SST
-      if (valid_sst_files.size() == 1) {
-        // from_key and to_key are only used for "check", "scan", or ""
-        if (command == "check" || command == "scan" || command == "") {
-          fprintf(stdout, "from [%s] to [%s]\n",
-                  ROCKSDB_NAMESPACE::Slice(from_key).ToString(true).c_str(),
-                  ROCKSDB_NAMESPACE::Slice(to_key).ToString(true).c_str());
-        }
+    }
+    auto props_ptr = dumper.GetInitTableProperties();
+    if (props_ptr && (require_property_regex || exclude_property_regex)) {
+      // Call should match with show_properties below
+      auto props_str = props_ptr->ToString("\n  ", ": ");
+      if (require_property_regex &&
+          !std::regex_search(props_str, *require_property_regex)) {
+        fprintf(stderr,
+                "%s: skipping because properties string doesn't match required "
+                "regex\n",
+                filename.c_str());
+        continue;
+      }
+      if (exclude_property_regex &&
+          std::regex_search(props_str, *exclude_property_regex)) {
+        fprintf(
+            stderr,
+            "%s: skipping because properties string matches excluded regex\n",
+            filename.c_str());
+        continue;
+      }
+    }
+    valid_sst_files.push_back(filename);
+    // Print out from and to key information once
+    // where there is at least one valid SST
+    if (valid_sst_files.size() == 1) {
+      // from_key and to_key are only used for "check", "scan", or ""
+      if (command == "check" || command == "scan" || command == "") {
+        fprintf(stdout, "from [%s] to [%s]\n",
+                ROCKSDB_NAMESPACE::Slice(from_key).ToString(true).c_str(),
+                ROCKSDB_NAMESPACE::Slice(to_key).ToString(true).c_str());
       }
     }
 
     if (command == "recompress") {
+      if (compression_types.empty()) {
+        if (options.compression_manager != nullptr) {
+          for (int c = 0; c < kDisableCompressionOption; ++c) {
+            if (options.compression_manager->SupportsCompressionType(
+                    static_cast<CompressionType>(c))) {
+              compression_types.emplace_back(static_cast<CompressionType>(c));
+            }
+          }
+        } else {
+          compression_types = GetSupportedCompressions();
+        }
+      }
       st = dumper.ShowAllCompressionSizes(
-          set_block_size ? block_size : 16384,
-          compression_types.empty() ? kCompressions : compression_types,
-          compress_level_from, compress_level_to, compression_max_dict_bytes,
-          compression_zstd_max_train_bytes, compression_max_dict_buffer_bytes,
-          !compression_use_zstd_finalize_dict);
+          compression_types, compress_level_from, compress_level_to);
       if (!st.ok()) {
         fprintf(stderr, "Failed to recompress: %s\n", st.ToString().c_str());
         exit(1);
       }
-      return 0;
+      continue;
     }
 
     if (command == "raw") {
@@ -542,7 +682,35 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
         fprintf(stderr, "Reader unexpectedly returned null properties\n");
       }
     }
+
+    BlockContents& meta_index_contents = dumper.GetMetaIndexContents();
+    if (list_meta_blocks && meta_index_contents.data.size() > 0) {
+      Block meta_index_block(std::move(meta_index_contents));
+      std::unique_ptr<MetaBlockIter> meta_index_iter;
+      meta_index_iter.reset(meta_index_block.NewMetaIterator());
+      meta_index_iter->SeekToFirst();
+      fprintf(stdout,
+              "Meta Blocks:\n"
+              "------------------------------\n");
+      while (meta_index_iter->status().ok() && meta_index_iter->Valid()) {
+        Slice v = meta_index_iter->value();
+        BlockHandle handle;
+        st = handle.DecodeFrom(&v);
+        if (!st.ok()) {
+          fprintf(stderr, "%s: Could not decode block handle - %s\n",
+                  filename.c_str(), st.ToString().c_str());
+        } else {
+          fprintf(stdout, "  %s: %" PRIu64 " %" PRIu64 "\n",
+                  meta_index_iter->key().ToString().c_str(), handle.offset(),
+                  handle.size());
+        }
+        meta_index_iter->Next();
+      }
+    } else if (list_meta_blocks) {
+      fprintf(stderr, "Could not read the meta index block\n");
+    }
   }
+
   if (show_summary) {
     fprintf(stdout, "total number of files: %" PRIu64 "\n", total_num_files);
     fprintf(stdout, "total number of data blocks: %" PRIu64 "\n",
@@ -558,25 +726,34 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   if (valid_sst_files.empty()) {
     // No valid SST files are found
     // Exit with an error state
-    if (dir) {
-      fprintf(stdout, "------------------------------\n");
-      fprintf(stderr, "No valid SST files found in %s\n", dir_or_file);
-    } else {
-      fprintf(stderr, "%s is not a valid SST file\n", dir_or_file);
+    for (auto& e : dirs_or_files) {
+      if (e.second == kDir) {
+        fprintf(stdout, "------------------------------\n");
+        fprintf(stderr, "No valid SST files found in %s\n", e.first);
+      } else {
+        assert(e.second == kFile);
+        fprintf(stderr, "%s is not a valid SST file\n", e.first);
+      }
     }
     return 1;
   } else {
+    assert(!dirs_or_files.empty());
     if (command == "identify") {
-      if (dir) {
+      if (dirs_or_files.size() > 1 || dirs_or_files[0].second == kDir) {
         fprintf(stdout, "------------------------------\n");
-        fprintf(stdout, "List of valid SST files found in %s:\n", dir_or_file);
+        std::string single_dir_msg;
+        if (dirs_or_files.size() == 1) {
+          single_dir_msg += " found in ";
+          single_dir_msg += dirs_or_files[0].first;
+        }
+        fprintf(stdout, "List of valid SST files%s:\n", single_dir_msg.c_str());
         for (const auto& f : valid_sst_files) {
           fprintf(stdout, "%s\n", f.c_str());
         }
         fprintf(stdout, "Number of valid SST files: %zu\n",
                 valid_sst_files.size());
       } else {
-        fprintf(stdout, "%s is a valid SST file\n", dir_or_file);
+        fprintf(stdout, "%s is a valid SST file\n", dirs_or_files[0].first);
       }
     }
     // At least one valid SST
diff --git a/tools/tool_hooks.cc b/tools/tool_hooks.cc
index bdccd4d49157..32fac03e61fb 100644
--- a/tools/tool_hooks.cc
+++ b/tools/tool_hooks.cc
@@ -16,19 +16,20 @@
 namespace ROCKSDB_NAMESPACE {
 
 Status DefaultHooks::Open(const Options& db_options, const std::string& name,
-                          DB** dbptr) {
+                          std::unique_ptr<DB>* dbptr) {
   return DB::Open(db_options, name, dbptr);
 };
 
 Status DefaultHooks::Open(
     const DBOptions& db_options, const std::string& name,
     const std::vector<ColumnFamilyDescriptor>& column_families,
-    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+    std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr) {
   return DB::Open(db_options, name, column_families, handles, dbptr);
 };
 
 Status DefaultHooks::OpenForReadOnly(const Options& options,
-                                     const std::string& name, DB** dbptr,
+                                     const std::string& name,
+                                     std::unique_ptr<DB>* dbptr,
                                      bool error_if_wal_file_exists = false) {
   return DB::OpenForReadOnly(options, name, dbptr, error_if_wal_file_exists);
 };
@@ -36,7 +37,7 @@ Status DefaultHooks::OpenForReadOnly(const Options& options,
 Status DefaultHooks::OpenForReadOnly(
     const Options& options, const std::string& name,
     const std::vector<ColumnFamilyDescriptor>& column_families,
-    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+    std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr) {
   return DB::OpenForReadOnly(options, name, column_families, handles, dbptr);
 };
 Status DefaultHooks::OpenTransactionDB(
@@ -72,7 +73,7 @@ Status DefaultHooks::OpenOptimisticTransactionDB(
 Status DefaultHooks::OpenAsSecondary(const Options& options,
                                      const std::string& name,
                                      const std::string& secondary_path,
-                                     DB** dbptr) {
+                                     std::unique_ptr<DB>* dbptr) {
   return DB::OpenAsSecondary(options, name, secondary_path, dbptr);
 }
 Status DefaultHooks::OpenAsFollower(const Options& options,
diff --git a/tools/trace_analyzer_test.cc b/tools/trace_analyzer_test.cc
index 1d5c870540ad..1d1ee61e6670 100644
--- a/tools/trace_analyzer_test.cc
+++ b/tools/trace_analyzer_test.cc
@@ -69,7 +69,7 @@ class TraceAnalyzerTest : public testing::Test {
     ro.iterate_lower_bound = &lower_bound;
     WriteOptions wo;
     TraceOptions trace_opt;
-    DB* db_ = nullptr;
+    std::unique_ptr<DB> db_;
     std::string value;
     std::unique_ptr<TraceWriter> trace_writer;
     Iterator* single_iter = nullptr;
@@ -125,7 +125,7 @@ class TraceAnalyzerTest : public testing::Test {
     ASSERT_OK(env_->NewWritableFile(whole_path, &whole_f, env_options_));
     std::string whole_str = "0x61\n0x62\n0x63\n0x64\n0x65\n0x66\n";
     ASSERT_OK(whole_f->Append(whole_str));
-    delete db_;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
   }
 
@@ -786,7 +786,7 @@ TEST_F(TraceAnalyzerTest, Iterator) {
 }
 
 TEST_F(TraceAnalyzerTest, ExistsPreviousTraceWriteError) {
-  DB* db_ = nullptr;
+  std::unique_ptr<DB> db_;
   Options options;
   options.create_if_missing = true;
 
@@ -823,7 +823,7 @@ TEST_F(TraceAnalyzerTest, ExistsPreviousTraceWriteError) {
   ASSERT_TRUE(s.ToString().find("Tracing has seen error") != std::string::npos);
   ASSERT_TRUE(s.ToString().find("Injected") != std::string::npos);
 
-  delete db_;
+  db_.reset();
   ASSERT_OK(DestroyDB(dbname_, options));
 }
 
diff --git a/tools/write_stress.cc b/tools/write_stress.cc
index 5cfec3e8e5bd..30fc0467d52d 100644
--- a/tools/write_stress.cc
+++ b/tools/write_stress.cc
@@ -145,13 +145,11 @@ class WriteStress {
     }
 
     // open DB
-    DB* db;
-    Status s = DB::Open(options, FLAGS_db, &db);
+    Status s = DB::Open(options, FLAGS_db, &db_);
     if (!s.ok()) {
       fprintf(stderr, "Can't open database: %s\n", s.ToString().c_str());
       std::abort();
     }
-    db_.reset(db);
   }
 
   void WriteThread() {
diff --git a/trace_replay/io_tracer.cc b/trace_replay/io_tracer.cc
index a860130f8560..e72b80c4f1bf 100644
--- a/trace_replay/io_tracer.cc
+++ b/trace_replay/io_tracer.cc
@@ -82,7 +82,7 @@ Status IOTraceWriter::WriteIOOp(const IOTraceRecord& record,
     uint32_t set_pos = static_cast<uint32_t>(log2(trace_data & -trace_data));
     switch (set_pos) {
       case IODebugContext::TraceData::kRequestID: {
-        Slice request_id(dbg->request_id);
+        Slice request_id(*dbg->request_id);
         PutLengthPrefixedSlice(&trace.payload, request_id);
       } break;
       default:
diff --git a/trace_replay/io_tracer_test.cc b/trace_replay/io_tracer_test.cc
index be3af4fb3597..6946fa4be11d 100644
--- a/trace_replay/io_tracer_test.cc
+++ b/trace_replay/io_tracer_test.cc
@@ -145,7 +145,8 @@ TEST_F(IOTracerTest, MultipleRecordsWithDifferentIOOpOptions) {
     // Write record with IODebugContext.
     io_op_data = 0;
     IODebugContext dbg;
-    dbg.SetRequestId("request_id_1");
+    const std::string test_request_id = "request_id_1";
+    dbg.SetRequestId(&test_request_id);
     IOTraceRecord record5(0, TraceType::kIOTracer, io_op_data,
                           GetFileOperation(5), 10 /*latency*/,
                           IOStatus::OK().ToString(), file_name);
diff --git a/unreleased_history/behavior_changes/ldb_comp.md b/unreleased_history/behavior_changes/ldb_comp.md
deleted file mode 100644
index 1dff841ef511..000000000000
--- a/unreleased_history/behavior_changes/ldb_comp.md
+++ /dev/null
@@ -1 +0,0 @@
-* `ldb` now returns an error if the specified `--compression_type` is not supported in the build.
diff --git a/unreleased_history/behavior_changes/reduce_file_locking_default_true.md b/unreleased_history/behavior_changes/reduce_file_locking_default_true.md
new file mode 100644
index 000000000000..31968f307888
--- /dev/null
+++ b/unreleased_history/behavior_changes/reduce_file_locking_default_true.md
@@ -0,0 +1 @@
+Change the default value of `CompactionOptionsUniversal::reduce_file_locking` from `false` to `true` to improve write stall and reduce read regression
diff --git a/unreleased_history/bug_fixes/deleterange_format_compatible.md b/unreleased_history/bug_fixes/deleterange_format_compatible.md
new file mode 100644
index 000000000000..150faffc3695
--- /dev/null
+++ b/unreleased_history/bug_fixes/deleterange_format_compatible.md
@@ -0,0 +1 @@
+* Fix longstanding failures that can arise from reading and/or compacting old DB dirs with range deletions (likely from version < 5.19.0) in many newer versions.
diff --git a/unreleased_history/bug_fixes/txn_two_write_queues_seqno_recovery.md b/unreleased_history/bug_fixes/txn_two_write_queues_seqno_recovery.md
new file mode 100644
index 000000000000..95413bf15fd0
--- /dev/null
+++ b/unreleased_history/bug_fixes/txn_two_write_queues_seqno_recovery.md
@@ -0,0 +1 @@
+Fix a bug where WritePrepared/WriteUnprepared TransactionDB with two_write_queues=true could experience "sequence number going backwards" corruption during recovery from a background error, due to allocated-but-not-published sequence numbers not being synced before creating new WAL files.
diff --git a/unreleased_history/new_features/fifo_kv_ratio_compaction.md b/unreleased_history/new_features/fifo_kv_ratio_compaction.md
new file mode 100644
index 000000000000..3cf01dda0c78
--- /dev/null
+++ b/unreleased_history/new_features/fifo_kv_ratio_compaction.md
@@ -0,0 +1 @@
+Added `CompactionOptionsFIFO::max_data_files_size` to support FIFO compaction trimming based on combined SST and blob file sizes. Added `CompactionOptionsFIFO::use_kv_ratio_compaction` to enable a capacity-derived intra-L0 compaction strategy optimized for BlobDB workloads, producing uniform-sized compacted files for predictable FIFO trimming.
diff --git a/unreleased_history/new_features/interpolation_search b/unreleased_history/new_features/interpolation_search
new file mode 100644
index 000000000000..ded1f773a07c
--- /dev/null
+++ b/unreleased_history/new_features/interpolation_search
@@ -0,0 +1 @@
+Include interpolation search as an alternative to binary search, which typically performs better when keys are uniformly distributed. This is exposed as a new table option `index_block_search_type`. The default is `binary_search`.
diff --git a/unreleased_history/new_features/wide_column_blob_support.md b/unreleased_history/new_features/wide_column_blob_support.md
new file mode 100644
index 000000000000..abc8c237162b
--- /dev/null
+++ b/unreleased_history/new_features/wide_column_blob_support.md
@@ -0,0 +1 @@
+Added support for storing wide-column entity column values in blob files. When `min_blob_size` is configured, large column values in wide-column entities will be stored in blob files, reducing SST file size and improving read performance.
diff --git a/unreleased_history/public_api_changes/abort_compaction_apis.md b/unreleased_history/public_api_changes/abort_compaction_apis.md
new file mode 100644
index 000000000000..d55882b3935d
--- /dev/null
+++ b/unreleased_history/public_api_changes/abort_compaction_apis.md
@@ -0,0 +1 @@
+Added new virtual methods `AbortAllCompactions()` and `ResumeAllCompactions()` to the `DB` class. Added new `Status::SubCode::kCompactionAborted` to indicate a compaction was aborted. Added `Status::IsCompactionAborted()` helper method to check if a status represents an aborted compaction.
diff --git a/unreleased_history/public_api_changes/remove_deprecated_apis_batch1.md b/unreleased_history/public_api_changes/remove_deprecated_apis_batch1.md
new file mode 100644
index 000000000000..3897c4918d87
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_deprecated_apis_batch1.md
@@ -0,0 +1 @@
+Remove deprecated, unused APIs and options: `ReadOptions::managed` and `ColumnFamilyOptions::snap_refresh_nanos`. Corresponding C and Java APIs are also removed.
diff --git a/unreleased_history/public_api_changes/remove_fv_1.md b/unreleased_history/public_api_changes/remove_fv_1.md
new file mode 100644
index 000000000000..dbeb3d870b69
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_fv_1.md
@@ -0,0 +1 @@
+* Drop support for reading (and writing) SST files using `BlockBasedTableOptions.format_version` < 2, which hasn't been the default format for about 10 years. An upgrade path is still possible with full compaction using a RocksDB version >= 4.6.0 and < 11.0.0 and then using the newer version.
diff --git a/unreleased_history/public_api_changes/remove_raw_ptr_db_open.md b/unreleased_history/public_api_changes/remove_raw_ptr_db_open.md
new file mode 100644
index 000000000000..03c442d2ac53
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_raw_ptr_db_open.md
@@ -0,0 +1,2 @@
+* Remove deprecated raw `DB*` variants of `DB::Open` and related functions. Some other minor public APIs were updated as a result
+* Remove deprecated `DB::MaxMemCompactionLevel()`
diff --git a/unreleased_history/public_api_changes/remove_secondary_compress_format_version.md b/unreleased_history/public_api_changes/remove_secondary_compress_format_version.md
new file mode 100644
index 000000000000..ecd17cfd7144
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_secondary_compress_format_version.md
@@ -0,0 +1 @@
+* Remove useless option `CompressedSecondaryCacheOptions::compress_format_version`
diff --git a/unreleased_history/public_api_changes/remove_skip_checking_sst_file_sizes_on_db_open.md b/unreleased_history/public_api_changes/remove_skip_checking_sst_file_sizes_on_db_open.md
new file mode 100644
index 000000000000..385dda3e9167
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_skip_checking_sst_file_sizes_on_db_open.md
@@ -0,0 +1 @@
+Remove deprecated DB option `skip_checking_sst_file_sizes_on_db_open`. The option was deprecated in 10.5.0 and has been a no-op since then. File size validation is now always performed in parallel during DB open.
diff --git a/unreleased_history/public_api_changes/remove_slice_transform_inrange.md b/unreleased_history/public_api_changes/remove_slice_transform_inrange.md
new file mode 100644
index 000000000000..bc007588b9f7
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_slice_transform_inrange.md
@@ -0,0 +1 @@
+Remove deprecated `SliceTransform::InRange()` virtual method and the `in_range` callback parameter from `rocksdb_slicetransform_create()` in the C API. `InRange()` was never called by RocksDB and existed only for backward compatibility.
diff --git a/unreleased_history/public_api_changes/remove_sst_file_writer_deprecated.md b/unreleased_history/public_api_changes/remove_sst_file_writer_deprecated.md
new file mode 100644
index 000000000000..d4096f82b359
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_sst_file_writer_deprecated.md
@@ -0,0 +1 @@
+Remove deprecated `SstFileWriter::Add()` method (use `Put()` instead) and the deprecated `skip_filters` parameter from `SstFileWriter` constructors (use `BlockBasedTableOptions::filter_policy` set to `nullptr` to skip filter generation instead).
diff --git a/util/aligned_buffer.h b/util/aligned_buffer.h
index 4d1471c7aef7..d1137642bfcf 100644
--- a/util/aligned_buffer.h
+++ b/util/aligned_buffer.h
@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <cassert>
 
+#include "port/malloc.h"
 #include "port/port.h"
 #include "rocksdb/file_system.h"
 namespace ROCKSDB_NAMESPACE {
@@ -251,4 +252,72 @@ class AlignedBuffer {
 
   void Size(size_t cursize) { cursize_ = cursize; }
 };
+
+// Related to std::string but more easily avoids zeroing out a buffer that's
+// going to be overwritten anyway.
+class GrowableBuffer {
+ public:
+  GrowableBuffer() : capacity_(0) {}
+  ~GrowableBuffer() { free(data_); }
+  // No copies
+  GrowableBuffer(const GrowableBuffer&) = delete;
+  GrowableBuffer& operator=(const GrowableBuffer&) = delete;
+  // Movable
+  GrowableBuffer(GrowableBuffer&& other) noexcept
+      : data_(other.data_), size_(other.size_), capacity_(other.capacity_) {
+    other.data_ = nullptr;
+    other.size_ = 0;
+    other.capacity_ = 0;
+  }
+  GrowableBuffer& operator=(GrowableBuffer&& other) noexcept {
+    if (this == &other) {
+      return *this;
+    }
+    free(data_);
+    data_ = other.data_;
+    size_ = other.size_;
+    capacity_ = other.capacity_;
+    other.data_ = nullptr;
+    other.size_ = 0;
+    other.capacity_ = 0;
+    return *this;
+  }
+
+  char* data() { return data_; }
+  const char* data() const { return data_; }
+
+  size_t size() const { return size_; }
+  size_t& MutableSize() { return size_; }
+
+  bool empty() const { return size_ == 0; }
+
+  void Reset() { size_ = 0; }
+  void ResetForSize(size_t new_size) {
+    if (new_size > capacity_) {
+      free(data_);
+      size_t new_capacity = std::max(capacity_ * 2, new_size);
+      new_capacity = std::max(size_t{64}, new_capacity);
+      data_ = static_cast<char*>(malloc(new_capacity));
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      capacity_ = malloc_usable_size(data_);
+#else
+      capacity_ = new_capacity;
+#endif
+      // Warm the memory in CPU cache
+      for (size_t i = 0; i < new_capacity; i += CACHE_LINE_SIZE) {
+        data_[i] = 1;
+      }
+    }
+    size_ = new_size;
+  }
+
+  Slice AsSlice() const { return Slice(data_, size_); }
+  operator Slice() const { return AsSlice(); }
+
+ private:
+  char* data_ = nullptr;
+  size_t size_ = 0;
+  size_t capacity_;
+};
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/async_file_reader.cc b/util/async_file_reader.cc
index 8fa4d19933c4..67acc978b9be 100644
--- a/util/async_file_reader.cc
+++ b/util/async_file_reader.cc
@@ -31,7 +31,7 @@ bool AsyncFileReader::MultiReadAsyncImpl(ReadAwaiter* awaiter) {
           }
         },
         &awaiter->read_reqs_[i], &awaiter->io_handle_[i], &awaiter->del_fn_[i],
-        /*aligned_buf=*/nullptr);
+        /*aligned_buf=*/nullptr, awaiter->dbg_);
     if (!s.ok()) {
       // For any non-ok status, the FileSystem will not call the callback
       // So let's update the status ourselves
diff --git a/util/async_file_reader.h b/util/async_file_reader.h
index 50a59519491f..989f392cace5 100644
--- a/util/async_file_reader.h
+++ b/util/async_file_reader.h
@@ -36,9 +36,10 @@ class AsyncFileReader {
                                             const IOOptions& opts,
                                             FSReadRequest* read_reqs,
                                             size_t num_reqs,
-                                            AlignedBuf* aligned_buf) noexcept {
-    return ReadOperation<ReadAwaiter>{*this,     file,     opts,
-                                      read_reqs, num_reqs, aligned_buf};
+                                            AlignedBuf* aligned_buf,
+                                            IODebugContext* dbg) noexcept {
+    return ReadOperation<ReadAwaiter>{*this,    file,        opts, read_reqs,
+                                      num_reqs, aligned_buf, dbg};
   }
 
  private:
@@ -49,12 +50,14 @@ class AsyncFileReader {
    public:
     explicit ReadAwaiter(AsyncFileReader& reader, RandomAccessFileReader* file,
                          const IOOptions& opts, FSReadRequest* read_reqs,
-                         size_t num_reqs, AlignedBuf* /*aligned_buf*/) noexcept
+                         size_t num_reqs, AlignedBuf* /*aligned_buf*/,
+                         IODebugContext* dbg) noexcept
         : reader_(reader),
           file_(file),
           opts_(opts),
           read_reqs_(read_reqs),
           num_reqs_(num_reqs),
+          dbg_(dbg),
           next_(nullptr) {}
 
     bool await_ready() noexcept { return false; }
@@ -82,6 +85,7 @@ class AsyncFileReader {
     const IOOptions& opts_;
     FSReadRequest* read_reqs_;
     size_t num_reqs_;
+    IODebugContext* dbg_;
     autovector<void*, 32> io_handle_;
     autovector<IOHandleDeleter, 32> del_fn_;
     folly::coro::impl::coroutine_handle<> awaiting_coro_;
@@ -101,18 +105,20 @@ class AsyncFileReader {
     explicit ReadOperation(AsyncFileReader& reader,
                            RandomAccessFileReader* file, const IOOptions& opts,
                            FSReadRequest* read_reqs, size_t num_reqs,
-                           AlignedBuf* aligned_buf) noexcept
+                           AlignedBuf* aligned_buf,
+                           IODebugContext* dbg) noexcept
         : reader_(reader),
           file_(file),
           opts_(opts),
           read_reqs_(read_reqs),
           num_reqs_(num_reqs),
-          aligned_buf_(aligned_buf) {}
+          aligned_buf_(aligned_buf),
+          dbg_(dbg) {}
 
     auto viaIfAsync(folly::Executor::KeepAlive<> executor) const {
       return folly::coro::co_viaIfAsync(
-          std::move(executor),
-          Awaiter{reader_, file_, opts_, read_reqs_, num_reqs_, aligned_buf_});
+          std::move(executor), Awaiter{reader_, file_, opts_, read_reqs_,
+                                       num_reqs_, aligned_buf_, dbg_});
     }
 
    private:
@@ -122,6 +128,7 @@ class AsyncFileReader {
     FSReadRequest* read_reqs_;
     size_t num_reqs_;
     AlignedBuf* aligned_buf_;
+    IODebugContext* dbg_;
   };
 
   // This function does the actual work when this awaitable starts execution
diff --git a/util/atomic.h b/util/atomic.h
index afb3dc540050..209be20f3f50 100644
--- a/util/atomic.h
+++ b/util/atomic.h
@@ -13,22 +13,28 @@ namespace ROCKSDB_NAMESPACE {
 
 // Background:
 // std::atomic is somewhat easy to misuse:
-// * Implicit conversion to T using std::memory_order_seq_cst, along with
-// memory order parameter defaults, make it easy to accidentally mix sequential
-// consistency ordering with acquire/release memory ordering. See
-// "The single total order might not be consistent with happens-before" at
-// https://en.cppreference.com/w/cpp/atomic/memory_order
+// * Implicit conversion to T makes it easy to use an unnecessarily strong
+// memory ordering (std::memory_order_seq_cst) and to hide atomic operations
+// that should be evident on reading the code.
+// * Similarly, defaulting to std::memory_order_seq_cst for atomic operations
+// makes it easy to use unnecessarily strong orderings. (It's always safe if
+// some ordering is safe, but it's better to be intentional and thoughtful when
+// carefully optimizing code with atomics.) Legitimate needs for seq_cst vs.
+// acq_rel are rare, such as drawing inferences across two atomics in
+// implementing hazard pointers.
 // * It's easy to use nonsensical (UB) combinations like store with
-// std::memory_order_acquire.
-// For such reasons, we provide wrappers below to make safe usage easier.
+// std::memory_order_acquire. Getting these right in development is an
+// unnecessary cognitive overhead even if they are caught by UBSAN.
+//
+// For such reasons, we provide wrappers below to make clear and explicit
+// usage of atomics easier.
 
-// Wrapper around std::atomic to avoid certain bugs (see Background above).
+// Wrapper around std::atomic for better code clarity (see Background above).
 //
-// This relaxed-only wrapper is intended for atomics that do not need
-// ordering constraints with other data reads/writes aside from those
-// necessary for computing data values or given by other happens-before
-// relationships. For example, a cross-thread counter that never returns
-// the same result can be a RelaxedAtomic.
+// This relaxed-only wrapper is intended for atomics that are not used to
+// synchronize other data across threads (only the atomic data), so can always
+// used relaxed memory ordering. For example, a cross-thread counter that never
+// returns the same result can be a RelaxedAtomic.
 template <typename T>
 class RelaxedAtomic {
  public:
@@ -66,14 +72,21 @@ class RelaxedAtomic {
   std::atomic<T> v_;
 };
 
-// Wrapper around std::atomic to avoid certain bugs (see Background above).
+// A reasonably general-purpose wrapper around std::atomic for better code
+// clarity (see Background above).
+//
+// Operations use std::memory_order_acq_rel by default (or just acquire or just
+// release for read-only and write-only operations), but relaxed operations are
+// also available and can be mixed in when appropriate.
 //
-// Except for some unusual cases requiring sequential consistency, this is
-// a general-purpose atomic. Relaxed operations can be mixed in as appropriate.
+// Future: add std::memory_order_seqcst variants like StoreSeqCst if/when
+// there's a need for them (rare). No distinct type is needed because the
+// distinction between acq_rel and seq_cst is more about where it is used in
+// combination with other atomics than the atomic itself.
 template <typename T>
-class AcqRelAtomic : public RelaxedAtomic<T> {
+class Atomic : public RelaxedAtomic<T> {
  public:
-  explicit AcqRelAtomic(T initial = {}) : RelaxedAtomic<T>(initial) {}
+  explicit Atomic(T initial = {}) : RelaxedAtomic<T>(initial) {}
   void Store(T desired) {
     RelaxedAtomic<T>::v_.store(desired, std::memory_order_release);
   }
diff --git a/util/auto_tune_compressor.cc b/util/auto_tune_compressor.cc
new file mode 100644
index 000000000000..c61ba97bbe9b
--- /dev/null
+++ b/util/auto_tune_compressor.cc
@@ -0,0 +1,331 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "util/auto_tune_compressor.h"
+
+#include "options/options_helper.h"
+#include "rocksdb/advanced_compression.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+namespace ROCKSDB_NAMESPACE {
+const std::vector<std::vector<int>> CostAwareCompressor::kCompressionLevels{
+    {0},         // KSnappyCompression
+    {},          // kZlibCompression
+    {},          // kBZip2Compression
+    {1, 4, 9},   // kLZ4Compression
+    {1, 4, 9},   // klZ4HCCompression
+    {},          // kXpressCompression
+    {1, 15, 22}  // kZSTD
+};
+
+int CompressionRejectionProbabilityPredictor::Predict() const {
+  return pred_rejection_prob_percentage_;
+}
+
+size_t CompressionRejectionProbabilityPredictor::attempted_compression_count()
+    const {
+  return rejected_count_ + compressed_count_;
+}
+
+bool CompressionRejectionProbabilityPredictor::Record(
+    Slice /*uncompressed_block_data*/, char* /*compressed_output*/,
+    size_t /*compressed_output_size*/, CompressionType compression_type) {
+  if (compression_type == kNoCompression) {
+    rejected_count_++;
+  } else {
+    compressed_count_++;
+  }
+  auto attempted = attempted_compression_count();
+  if (attempted >= window_size_) {
+    pred_rejection_prob_percentage_ =
+        static_cast<int>(rejected_count_ * 100 / attempted);
+    compressed_count_ = 0;
+    rejected_count_ = 0;
+    assert(attempted_compression_count() == 0);
+  }
+  return true;
+}
+
+AutoSkipCompressorWrapper::AutoSkipCompressorWrapper(
+    std::unique_ptr<Compressor> compressor, const CompressionOptions& opts)
+    : CompressorWrapper::CompressorWrapper(std::move(compressor)),
+      opts_(opts) {}
+
+const char* AutoSkipCompressorWrapper::Name() const {
+  return "AutoSkipCompressorWrapper";
+}
+
+std::unique_ptr<Compressor> AutoSkipCompressorWrapper::Clone() const {
+  return std::make_unique<AutoSkipCompressorWrapper>(wrapped_->Clone(), opts_);
+}
+
+std::unique_ptr<Compressor> AutoSkipCompressorWrapper::MaybeCloneSpecialized(
+    CacheEntryRole block_type, DictConfigArgs&& dict_config) const {
+  auto clone =
+      wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_config));
+  return std::make_unique<AutoSkipCompressorWrapper>(std::move(clone), opts_);
+}
+
+Status AutoSkipCompressorWrapper::CompressBlock(
+    Slice uncompressed_data, char* compressed_output,
+    size_t* compressed_output_size, CompressionType* out_compression_type,
+    ManagedWorkingArea* wa) {
+  // Check if the managed working area is provided or owned by this object.
+  // If not, bypass auto-skip logic since the working area lacks a predictor to
+  // record or make necessary decisions to compress or bypass compression of the
+  // block
+  if (wa == nullptr || wa->owner() != this) {
+    return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                   compressed_output_size, out_compression_type,
+                                   wa);
+  }
+  bool exploration =
+      Random::GetTLSInstance()->PercentTrue(kExplorationPercentage);
+  TEST_SYNC_POINT_CALLBACK(
+      "AutoSkipCompressorWrapper::CompressBlock::exploitOrExplore",
+      &exploration);
+  auto autoskip_wa = static_cast<AutoSkipWorkingArea*>(wa->get());
+  if (exploration) {
+    return CompressBlockAndRecord(uncompressed_data, compressed_output,
+                                  compressed_output_size, out_compression_type,
+                                  autoskip_wa);
+  } else {
+    auto predictor_ptr = autoskip_wa->predictor;
+    auto prediction = predictor_ptr->Predict();
+    if (prediction <= kProbabilityCutOff) {
+      // decide to compress
+      return CompressBlockAndRecord(uncompressed_data, compressed_output,
+                                    compressed_output_size,
+                                    out_compression_type, autoskip_wa);
+    } else {
+      // decide to bypass compression
+      *out_compression_type = kNoCompression;
+      *compressed_output_size = 0;
+      return Status::OK();
+    }
+  }
+  return Status::OK();
+}
+
+Compressor::ManagedWorkingArea AutoSkipCompressorWrapper::ObtainWorkingArea() {
+  auto wrap_wa = wrapped_->ObtainWorkingArea();
+  return ManagedWorkingArea(new AutoSkipWorkingArea(std::move(wrap_wa)), this);
+}
+void AutoSkipCompressorWrapper::ReleaseWorkingArea(WorkingArea* wa) {
+  delete static_cast<AutoSkipWorkingArea*>(wa);
+}
+
+Status AutoSkipCompressorWrapper::CompressBlockAndRecord(
+    Slice uncompressed_data, char* compressed_output,
+    size_t* compressed_output_size, CompressionType* out_compression_type,
+    AutoSkipWorkingArea* wa) {
+  Status status = wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                          compressed_output_size,
+                                          out_compression_type, &(wa->wrapped));
+  // determine if it was rejected or compressed
+  auto predictor_ptr = wa->predictor;
+  predictor_ptr->Record(uncompressed_data, compressed_output,
+                        *compressed_output_size, *out_compression_type);
+  return status;
+}
+
+const char* AutoSkipCompressorManager::Name() const {
+  // should have returned "AutoSkipCompressorManager" but we currently have an
+  // error so for now returning name of the wrapped container
+  return wrapped_->Name();
+}
+
+std::unique_ptr<Compressor> AutoSkipCompressorManager::GetCompressorForSST(
+    const FilterBuildingContext& context, const CompressionOptions& opts,
+    CompressionType preferred) {
+  assert(GetSupportedCompressions().size() > 1);
+  assert(preferred != kNoCompression);
+  return std::make_unique<AutoSkipCompressorWrapper>(
+      wrapped_->GetCompressorForSST(context, opts, preferred), opts);
+}
+
+CostAwareCompressor::CostAwareCompressor(const CompressionOptions& opts)
+    : opts_(opts) {
+  // Creates compressor supporting all the compression types and levels as per
+  // the compression levels set in vector CompressionLevels
+  auto builtInManager = GetBuiltinV2CompressionManager();
+  const auto& compressions = GetSupportedCompressions();
+  for (size_t i = 0; i < kCompressionLevels.size(); i++) {
+    CompressionType type = static_cast<CompressionType>(i + 1);
+    if (type == kNoCompression) {
+      continue;
+    }
+    if (kCompressionLevels[type - 1].size() == 0) {
+      allcompressors_.emplace_back();
+      continue;
+    } else {
+      // if the compression type is not supported, then skip and remove
+      // compression levels from the supported compression level list
+      if (std::find(compressions.begin(), compressions.end(), type) ==
+          compressions.end()) {
+        allcompressors_.emplace_back();
+        continue;
+      }
+      std::vector<std::unique_ptr<Compressor>> compressors_diff_levels;
+      for (size_t j = 0; j < kCompressionLevels[type - 1].size(); j++) {
+        auto level = kCompressionLevels[type - 1][j];
+        CompressionOptions new_opts = opts;
+        new_opts.level = level;
+        compressors_diff_levels.push_back(
+            builtInManager->GetCompressor(new_opts, type));
+        allcompressors_index_.emplace_back(i, j);
+      }
+      allcompressors_.push_back(std::move(compressors_diff_levels));
+    }
+  }
+}
+
+const char* CostAwareCompressor::Name() const { return "CostAwareCompressor"; }
+
+std::unique_ptr<Compressor> CostAwareCompressor::Clone() const {
+  return std::make_unique<CostAwareCompressor>(opts_);
+}
+Compressor::DictConfig CostAwareCompressor::GetDictGuidance(
+    CacheEntryRole block_type) const {
+  auto idx = allcompressors_index_.back();
+  return allcompressors_[idx.first][idx.second]->GetDictGuidance(block_type);
+}
+
+Slice CostAwareCompressor::GetSerializedDict() const {
+  auto idx = allcompressors_index_.back();
+  return allcompressors_[idx.first][idx.second]->GetSerializedDict();
+}
+
+CompressionType CostAwareCompressor::GetPreferredCompressionType() const {
+  return kZSTD;
+}
+std::unique_ptr<Compressor> CostAwareCompressor::MaybeCloneSpecialized(
+    CacheEntryRole block_type, DictConfigArgs&& dict_config) const {
+  // TODO: full dictionary compression support. Currently this just falls
+  // back on a non-multi compressor when asked to use a dictionary.
+  auto idx = allcompressors_index_.back();
+  return allcompressors_[idx.first][idx.second]->MaybeCloneSpecialized(
+      block_type, std::move(dict_config));
+}
+Status CostAwareCompressor::CompressBlock(Slice uncompressed_data,
+                                          char* compressed_output,
+                                          size_t* compressed_output_size,
+                                          CompressionType* out_compression_type,
+                                          ManagedWorkingArea* wa) {
+  // Check if the managed working area is provided or owned by this object.
+  // If not, bypass compressor logic since the working area lacks a predictor
+  if (allcompressors_.size() == 0) {
+    return Status::NotSupported("No compression type supported");
+  }
+  if (wa == nullptr || wa->owner() != this) {
+    // highest compression level of Zstd
+    size_t choosen_compression_type = 6;
+    size_t compression_level_ptr = 2;
+    return allcompressors_[choosen_compression_type][compression_level_ptr]
+        ->CompressBlock(uncompressed_data, compressed_output,
+                        compressed_output_size, out_compression_type, wa);
+  }
+  auto local_wa = static_cast<CostAwareWorkingArea*>(wa->get());
+  std::pair<size_t, size_t> choosen_index(6, 2);
+  size_t choosen_compression_type = choosen_index.first;
+  size_t compresion_level_ptr = choosen_index.second;
+  return CompressBlockAndRecord(choosen_compression_type, compresion_level_ptr,
+                                uncompressed_data, compressed_output,
+                                compressed_output_size, out_compression_type,
+                                local_wa);
+}
+
+Compressor::ManagedWorkingArea CostAwareCompressor::ObtainWorkingArea() {
+  auto wrap_wa = allcompressors_.back().back()->ObtainWorkingArea();
+  auto wa = new CostAwareWorkingArea(std::move(wrap_wa));
+  // Create cost predictors for each compression type and level
+  wa->cost_predictors_.reserve(allcompressors_.size());
+  for (size_t i = 0; i < allcompressors_.size(); i++) {
+    CompressionType type = static_cast<CompressionType>(i + 1);
+    if (allcompressors_[type - 1].size() == 0) {
+      wa->cost_predictors_.emplace_back();
+      continue;
+    } else {
+      std::vector<IOCPUCostPredictor*> predictors_diff_levels;
+      predictors_diff_levels.reserve(kCompressionLevels[type - 1].size());
+      for (size_t j = 0; j < kCompressionLevels[type - 1].size(); j++) {
+        predictors_diff_levels.emplace_back(new IOCPUCostPredictor(10));
+      }
+      wa->cost_predictors_.emplace_back(std::move(predictors_diff_levels));
+    }
+  }
+  return ManagedWorkingArea(wa, this);
+}
+void CostAwareCompressor::ReleaseWorkingArea(WorkingArea* wa) {
+  // remove all created cost predictors
+  for (auto& prdictors_diff_levels :
+       static_cast<CostAwareWorkingArea*>(wa)->cost_predictors_) {
+    for (auto& predictor : prdictors_diff_levels) {
+      delete predictor;
+    }
+  }
+  delete static_cast<CostAwareWorkingArea*>(wa);
+}
+
+Status CostAwareCompressor::CompressBlockAndRecord(
+    size_t choosen_compression_type, size_t compression_level_ptr,
+    Slice uncompressed_data, char* compressed_output,
+    size_t* compressed_output_size, CompressionType* out_compression_type,
+    CostAwareWorkingArea* wa) {
+  assert(choosen_compression_type < allcompressors_.size());
+  assert(compression_level_ptr <
+         allcompressors_[choosen_compression_type].size());
+  assert(choosen_compression_type < wa->cost_predictors_.size());
+  assert(compression_level_ptr <
+         wa->cost_predictors_[choosen_compression_type].size());
+  StopWatchNano<> timer(Env::Default()->GetSystemClock().get(), true);
+  Status status =
+      allcompressors_[choosen_compression_type][compression_level_ptr]
+          ->CompressBlock(uncompressed_data, compressed_output,
+                          compressed_output_size, out_compression_type,
+                          &(wa->wrapped_));
+  std::pair<size_t, size_t> measured_data(timer.ElapsedMicros(),
+                                          *compressed_output_size);
+  auto predictor =
+      wa->cost_predictors_[choosen_compression_type][compression_level_ptr];
+  auto output_length = measured_data.second;
+  auto cpu_time = measured_data.first;
+  predictor->CPUPredictor.Record(cpu_time);
+  predictor->IOPredictor.Record(output_length);
+  TEST_SYNC_POINT_CALLBACK(
+      "CostAwareCompressor::CompressBlockAndRecord::GetPredictor",
+      wa->cost_predictors_[choosen_compression_type][compression_level_ptr]);
+  return status;
+}
+
+std::shared_ptr<CompressionManagerWrapper> CreateAutoSkipCompressionManager(
+    std::shared_ptr<CompressionManager> wrapped) {
+  return std::make_shared<AutoSkipCompressorManager>(
+      wrapped == nullptr ? GetBuiltinV2CompressionManager() : wrapped);
+}
+const char* CostAwareCompressorManager::Name() const {
+  // should have returned "CostAwareCompressorManager" but we currently have an
+  // error so for now returning name of the wrapped container
+  return wrapped_->Name();
+}
+
+std::unique_ptr<Compressor> CostAwareCompressorManager::GetCompressorForSST(
+    const FilterBuildingContext& context, const CompressionOptions& opts,
+    CompressionType preferred) {
+  assert(GetSupportedCompressions().size() > 1);
+  (void)context;
+  (void)preferred;
+  return std::make_unique<CostAwareCompressor>(opts);
+}
+
+std::shared_ptr<CompressionManagerWrapper> CreateCostAwareCompressionManager(
+    std::shared_ptr<CompressionManager> wrapped) {
+  return std::make_shared<CostAwareCompressorManager>(
+      wrapped == nullptr ? GetBuiltinV2CompressionManager() : wrapped);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/auto_tune_compressor.h b/util/auto_tune_compressor.h
new file mode 100644
index 000000000000..791193eb6c6b
--- /dev/null
+++ b/util/auto_tune_compressor.h
@@ -0,0 +1,197 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Defines auto skip compressor wrapper which intelligently decides bypassing
+// compression based on past data
+// Defines CostAwareCompressor which currently tries to predict the cpu and io
+// cost of the compression
+
+#pragma once
+#include <memory>
+
+#include "rocksdb/advanced_compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Auto Skip Compression Components
+// Predict rejection probability using a moving window approach
+class CompressionRejectionProbabilityPredictor {
+ public:
+  explicit CompressionRejectionProbabilityPredictor(int window_size)
+      : pred_rejection_prob_percentage_(0),
+        rejected_count_(0),
+        compressed_count_(0),
+        window_size_(window_size) {}
+  int Predict() const;
+  bool Record(Slice uncompressed_block_data, char* compressed_output,
+              size_t compressed_output_size, CompressionType compression_type);
+  size_t attempted_compression_count() const;
+
+ protected:
+  int pred_rejection_prob_percentage_;
+  size_t rejected_count_;
+  size_t compressed_count_;
+  size_t window_size_;
+};
+
+class AutoSkipWorkingArea : public Compressor::WorkingArea {
+ public:
+  explicit AutoSkipWorkingArea(Compressor::ManagedWorkingArea&& wa)
+      : wrapped(std::move(wa)),
+        predictor(
+            std::make_shared<CompressionRejectionProbabilityPredictor>(10)) {}
+  ~AutoSkipWorkingArea() {}
+  AutoSkipWorkingArea(const AutoSkipWorkingArea&) = delete;
+  AutoSkipWorkingArea& operator=(const AutoSkipWorkingArea&) = delete;
+  AutoSkipWorkingArea(AutoSkipWorkingArea&& other) noexcept
+      : wrapped(std::move(other.wrapped)),
+        predictor(std::move(other.predictor)) {}
+
+  AutoSkipWorkingArea& operator=(AutoSkipWorkingArea&& other) noexcept {
+    if (this != &other) {
+      wrapped = std::move(other.wrapped);
+      predictor = std::move(other.predictor);
+    }
+    return *this;
+  }
+  Compressor::ManagedWorkingArea wrapped;
+  std::shared_ptr<CompressionRejectionProbabilityPredictor> predictor;
+};
+class AutoSkipCompressorWrapper : public CompressorWrapper {
+ public:
+  const char* Name() const override;
+  explicit AutoSkipCompressorWrapper(std::unique_ptr<Compressor> compressor,
+                                     const CompressionOptions& opts);
+
+  std::unique_ptr<Compressor> Clone() const override;
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const override;
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override;
+  ManagedWorkingArea ObtainWorkingArea() override;
+  void ReleaseWorkingArea(WorkingArea* wa) override;
+
+ private:
+  Status CompressBlockAndRecord(Slice uncompressed_data,
+                                char* compressed_output,
+                                size_t* compressed_output_size,
+                                CompressionType* out_compression_type,
+                                AutoSkipWorkingArea* wa);
+  static constexpr int kExplorationPercentage = 10;
+  static constexpr int kProbabilityCutOff = 50;
+  const CompressionOptions opts_;
+};
+
+class AutoSkipCompressorManager : public CompressionManagerWrapper {
+  using CompressionManagerWrapper::CompressionManagerWrapper;
+  const char* Name() const override;
+  std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext& context, const CompressionOptions& opts,
+      CompressionType preferred) override;
+};
+// Cost Aware Components
+template <typename T>
+class WindowAveragePredictor {
+ public:
+  explicit WindowAveragePredictor(int window_size)
+      : sum_(0), prediction_(0), count_(0), kWindowSize(window_size) {}
+  T Predict() { return prediction_; }
+  bool Record(T data) {
+    sum_ += data;
+    count_++;
+    if (count_ >= kWindowSize) {
+      prediction_ = sum_ / count_;
+      sum_ = 0;
+      count_ = 0;
+    }
+    return true;
+  }
+  void SetPrediction(T prediction) { prediction_ = prediction; }
+
+ private:
+  T sum_;
+  T prediction_;
+  int count_;
+  const int kWindowSize;
+};
+
+using IOCostPredictor = WindowAveragePredictor<size_t>;
+using CPUUtilPredictor = WindowAveragePredictor<uint64_t>;
+
+struct IOCPUCostPredictor {
+  explicit IOCPUCostPredictor(int window_size)
+      : IOPredictor(window_size), CPUPredictor(window_size) {}
+  IOCostPredictor IOPredictor;
+  CPUUtilPredictor CPUPredictor;
+};
+class CostAwareWorkingArea : public Compressor::WorkingArea {
+ public:
+  explicit CostAwareWorkingArea(Compressor::ManagedWorkingArea&& wa)
+      : wrapped_(std::move(wa)) {}
+  ~CostAwareWorkingArea() {}
+  CostAwareWorkingArea(const CostAwareWorkingArea&) = delete;
+  CostAwareWorkingArea& operator=(const CostAwareWorkingArea&) = delete;
+  CostAwareWorkingArea(CostAwareWorkingArea&& other) noexcept
+      : wrapped_(std::move(other.wrapped_)) {}
+
+  CostAwareWorkingArea& operator=(CostAwareWorkingArea&& other) noexcept {
+    if (this != &other) {
+      wrapped_ = std::move(other.wrapped_);
+      cost_predictors_ = std::move(other.cost_predictors_);
+    }
+    return *this;
+  }
+  Compressor::ManagedWorkingArea wrapped_;
+  std::vector<std::vector<IOCPUCostPredictor*>> cost_predictors_;
+};
+
+class CostAwareCompressor : public Compressor {
+ public:
+  explicit CostAwareCompressor(const CompressionOptions& opts);
+  const char* Name() const override;
+  std::unique_ptr<Compressor> Clone() const override;
+  DictConfig GetDictGuidance(CacheEntryRole block_type) const override;
+  Slice GetSerializedDict() const override;
+  CompressionType GetPreferredCompressionType() const override;
+  ManagedWorkingArea ObtainWorkingArea() override;
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const override;
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override;
+  void ReleaseWorkingArea(WorkingArea* wa) override;
+
+ private:
+  Status CompressBlockAndRecord(size_t choosen_compression_type,
+                                size_t compresion_level_ptr,
+                                Slice uncompressed_data,
+                                char* compressed_output,
+                                size_t* compressed_output_size,
+                                CompressionType* out_compression_type,
+                                CostAwareWorkingArea* wa);
+  static constexpr int kExplorationPercentage = 10;
+  static constexpr int kProbabilityCutOff = 50;
+  // This is the vector containing the list of compression levels that
+  // CostAwareCompressor will use create compressor and predicts the cost
+  // The vector contains list of compression level for compression algorithm in
+  // the order defined by enum CompressionType
+  static const std::vector<std::vector<int>> kCompressionLevels;
+  const CompressionOptions opts_;
+  std::vector<std::vector<std::unique_ptr<Compressor>>> allcompressors_;
+  std::vector<std::pair<size_t, size_t>> allcompressors_index_;
+};
+
+class CostAwareCompressorManager : public CompressionManagerWrapper {
+  using CompressionManagerWrapper::CompressionManagerWrapper;
+  const char* Name() const override;
+  std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext& context, const CompressionOptions& opts,
+      CompressionType preferred) override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/bit_fields.h b/util/bit_fields.h
new file mode 100644
index 000000000000..00a43ed90118
--- /dev/null
+++ b/util/bit_fields.h
@@ -0,0 +1,470 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "test_util/sync_point.h"
+#include "util/math.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Declares a wrapper type around UnderlyingT that allows it to be divided up
+// into and accessed as bit fields. This is mostly intended to aid in packing
+// fields into atomic variables to reduce the need for locking in concurrent
+// code and/or to simplify reasoning on and accommodation of different
+// interesting, bug-prone interleavings. Convenient atomic wrappers
+// (RelaxedAtomic, Atomic) are provided below to aid usage with atomics,
+// especially for CAS updates, but it is even possible to combine operations on
+// multiple bit fields into a single non-CAS atomic operation using Transforms
+// below.
+//
+// Unlike C/C++ bit fields, this implementation guarantees tight bit packing
+// so that all available lock-free atomic bits can be utilized.
+//
+// The specific bit fields are declared outside the declaration using
+// BoolBitField and UnsignedBitField below. Example usage:
+//
+// struct MyState : public BitFields<uint32_t, MyState> {
+//   // Extra helper declarations and/or field type declarations
+// };
+//
+// // Starts with a 16-bit field returned as uint16_t
+// using Field1 = UnsignedBitField<MyState, 16, NoPrevBitField>;
+// using Field2 = BoolBitField<MyState, Field1>;
+// using Field3 = BoolBitField<MyState, Field2>;
+// using Field4 = UnsignedBitField<MyState, 5, Field3>;  // 5 bits in a uint8_t
+//
+// // MyState{} is zero-initialized
+// auto state = MyState{}.With<Field1>(42U).With<Field2>(true);
+// state.Set<Field4>(3U);
+// state.Ref<Field1>() += state.Get<Field4>();
+//
+// Note that there's nothing preventing you from declaring overlapping fields
+// in the same 'MyState' family. This could be useful for variant types where
+// an earlier field determines which layout later fields are using. For example,
+// an alternate field after Field2:
+//
+// using Field3a = UnsignedBitField<State, 6, Field2>;  // 6 bits in a uint8_t
+//
+template <typename UnderlyingT, typename DerivedT>
+struct BitFields {
+  using U = UnderlyingT;
+  U underlying = 0;
+  static constexpr int kBitCount = sizeof(U) * 8;
+
+  using Derived = DerivedT;
+
+  // Modify a given field in place
+  template <typename BitFieldT>
+  void Set(typename BitFieldT::V value) {
+    static_assert(std::is_same_v<typename BitFieldT::Parent, Derived>);
+    Derived& derived = static_cast<Derived&>(*this);
+    BitFieldT::SetIn(derived, value);
+  }
+
+  // Return a copy with the given field modified
+  template <typename BitFieldT>
+  constexpr Derived With(typename BitFieldT::V value) const {
+    static_assert(std::is_same_v<typename BitFieldT::Parent, Derived>);
+    Derived rv = static_cast<const Derived&>(*this);
+    BitFieldT::SetIn(rv, value);
+    return rv;
+  }
+
+  // Get the value of a field
+  template <typename BitFieldT>
+  typename BitFieldT::V Get() const {
+    static_assert(std::is_same_v<typename BitFieldT::Parent, Derived>);
+    return BitFieldT::GetFrom(static_cast<const Derived&>(*this));
+  }
+
+  // Reference and Ref() are not intended to behave as full references but to
+  // provide a convenient way to do operations like +=, |=, etc. Get and Set
+  // are preferred for simple operations.
+  template <typename BitFieldT>
+  struct Reference {
+    explicit Reference(BitFields& bf) : bf_(bf) {}
+    Reference(const Reference&) = default;
+    Reference& operator=(const Reference&) = default;
+    Reference(Reference&&) = default;
+    Reference& operator=(Reference&&) = default;
+
+    void operator=(typename BitFieldT::V value) { bf_.Set<BitFieldT>(value); }
+    void operator+=(typename BitFieldT::V value) {
+      bf_.Set<BitFieldT>(bf_.Get<BitFieldT>() + value);
+    }
+    void operator-=(typename BitFieldT::V value) {
+      bf_.Set<BitFieldT>(bf_.Get<BitFieldT>() - value);
+    }
+    void operator|=(typename BitFieldT::V value) {
+      bf_.Set<BitFieldT>(bf_.Get<BitFieldT>() | value);
+    }
+    void operator&=(typename BitFieldT::V value) {
+      bf_.Set<BitFieldT>(bf_.Get<BitFieldT>() & value);
+    }
+
+   private:
+    BitFields& bf_;
+  };
+
+  template <typename BitFieldT>
+  Reference<BitFieldT> Ref() {
+    return Reference<BitFieldT>(*this);
+  }
+
+  bool operator==(const BitFields& other) const = default;
+  bool operator!=(const BitFields& other) const = default;
+};
+
+// For building atomic updates affecting one or more fields, assuming all the
+// updates are bitwise-or.
+template <typename BitFieldsT>
+struct OrTransformer {
+  using U = typename BitFieldsT::U;
+  U to_or = 0;
+  // + for general combine
+  OrTransformer<BitFieldsT> operator+(
+      const OrTransformer<BitFieldsT>& other) const {
+    return OrTransformer<BitFieldsT>{to_or | other.to_or};
+  }
+};
+
+// For building atomic updates affecting one or more fields, assuming all the
+// updates are bitwise-and.
+template <typename BitFieldsT>
+struct AndTransformer {
+  using U = typename BitFieldsT::U;
+  U to_and = 0;
+  // + for general combine
+  AndTransformer<BitFieldsT> operator+(
+      const AndTransformer<BitFieldsT>& other) const {
+    return AndTransformer<BitFieldsT>{to_and & other.to_and};
+  }
+};
+
+// Can represent a combination of both subtractions and additions, representing
+// subtractions as the addition of a negated value. To ensure we don't create a
+// net overflow or underflow between fields, in debug builds we track the
+// corresponding preconditions. (NOTE that when representing a subtraction, we
+// rely on overflow of the unsigned representation.)
+template <typename BitFieldsT>
+struct AddTransformer {
+  using U = typename BitFieldsT::U;
+  U to_add = 0;
+#ifndef NDEBUG
+  struct Precondition {
+    U mask;   // for bits of the target field
+    U piece;  // component of to_add for the target field
+  };
+  std::vector<Precondition> preconditions;
+#endif  // NDEBUG
+  void AssertPreconditions([[maybe_unused]] U from) const {
+#ifndef NDEBUG
+    for (auto p : preconditions) {
+      U tmp = (from & p.mask) + p.piece;
+      // Assert no under/overflow (unless the field is at the top bits of the
+      // representation in U, which is allowed because it doesn't lead to
+      // leakage into other fields)
+      testable_assert((tmp & ~p.mask) == 0);
+    }
+#endif  // NDEBUG
+  }
+  // + for general combine
+  AddTransformer<BitFieldsT> operator+(
+      const AddTransformer<BitFieldsT>& other) const {
+    AddTransformer<BitFieldsT> rv{to_add + other.to_add};
+#ifndef NDEBUG
+    rv.preconditions = preconditions;
+    rv.preconditions.insert(rv.preconditions.end(), other.preconditions.begin(),
+                            other.preconditions.end());
+#endif  // NDEBUG
+    return rv;
+  }
+};
+
+// Placeholder for PrevField for the first field
+struct NoPrevBitField {
+  // no instances
+  NoPrevBitField() = delete;
+  static constexpr int kEndBit = 0;
+};
+
+// For declaring a single-bit field accessed as a boolean. See example above on
+// BitFields
+template <typename BitFieldsT, typename PrevField>
+struct BoolBitField {
+  using Parent = BitFieldsT;
+  using ParentBase =
+      BitFields<typename BitFieldsT::U, typename BitFieldsT::Derived>;
+  using U = typename BitFieldsT::U;
+  using V = bool;
+  static constexpr int kBitOffset = PrevField::kEndBit;
+  static constexpr int kEndBit = kBitOffset + 1;
+  static_assert(kBitOffset >= 0 && kEndBit <= BitFieldsT::kBitCount);
+
+  // no instances
+  BoolBitField() = delete;
+
+  // NOTE: allow BitFieldsT to be derived from BitFields<> which can be
+  // passed in here
+  static bool GetFrom(const ParentBase& bf) {
+    return (bf.underlying & (U{1} << kBitOffset)) != 0;
+  }
+  static void SetIn(ParentBase& bf, bool value) {
+    // NOTE: avoiding conditional branches is usually best for speed on modern
+    // processors
+    bf.underlying =
+        (bf.underlying & ~(U{1} << kBitOffset)) | (U{value} << kBitOffset);
+  }
+  static OrTransformer<BitFieldsT> SetTransform() { return Or(true); }
+  static OrTransformer<BitFieldsT> Or(bool b) {
+    return OrTransformer<BitFieldsT>{U{b} << kBitOffset};
+  }
+  static AndTransformer<BitFieldsT> ClearTransform() { return And(false); }
+  static AndTransformer<BitFieldsT> And(bool b) {
+    return AndTransformer<BitFieldsT>{~(U{!b} << kBitOffset)};
+  }
+};
+
+// For declaring a multi-bit field accessed as an unsigned int. See example
+// above on BitFields
+template <typename BitFieldsT, int kBitCount, typename PrevField>
+struct UnsignedBitField {
+  using Parent = BitFieldsT;
+  using U = typename BitFieldsT::U;
+  // Smallest uint type that can fit kBitCount bits
+  using V = std::conditional_t<
+      kBitCount <= 8, uint8_t,
+      std::conditional_t<
+          kBitCount <= 16, uint16_t,
+          std::conditional_t<kBitCount <= 32, uint32_t, uint64_t>>>;
+  static constexpr int kBitOffset = PrevField::kEndBit;
+  static constexpr int kEndBit = kBitOffset + kBitCount;
+  static_assert(kBitCount >= 1);
+  static_assert(kBitCount <= 64);
+  static_assert(kBitOffset >= 0 && kEndBit <= BitFieldsT::kBitCount);
+  static constexpr bool kIncludesTopBit = (kEndBit == BitFieldsT::kBitCount);
+
+  static constexpr V kMask = (V{1} << (kBitCount - 1) << 1) - 1;
+
+  // no instances
+  UnsignedBitField() = delete;
+
+  static V GetFrom(const BitFieldsT& bf) {
+    return BitwiseAnd(bf.underlying >> kBitOffset, kMask);
+  }
+
+  static void SetIn(BitFieldsT& bf, V value) {
+    bf.underlying &= ~(static_cast<U>(kMask) << kBitOffset);
+    bf.underlying |= static_cast<U>(value & kMask) << kBitOffset;
+  }
+
+  // Create a transform for clearing this field to zero.
+  static AndTransformer<BitFieldsT> ClearTransform() {
+    return AndTransformer<BitFieldsT>{~(static_cast<U>(kMask) << kBitOffset)};
+  }
+
+  // Create a transform for bitwise-and
+  static AndTransformer<BitFieldsT> AndTransform(V value) {
+    assert((value & ~kMask) == 0);
+    return AndTransformer<BitFieldsT>{
+        ~(static_cast<U>(value ^ kMask) << kBitOffset)};
+  }
+
+  // Create a transform for bitwise-or
+  static OrTransformer<BitFieldsT> OrTransform(V value) {
+    assert((value & ~kMask) == 0);
+    return OrTransformer<BitFieldsT>{static_cast<U>(value) << kBitOffset};
+  }
+
+  // Create a transform for adding a particular value, but with the precondition
+  // that adding the value will not overflow the field. This applies for fields
+  // that do not include the top bit of the underlying representation. Can be
+  // combined with other additive transforms for other fields.
+  static AddTransformer<BitFieldsT> PlusTransformPromiseNoOverflow(V value) {
+    static_assert(!kIncludesTopBit);
+    AddTransformer<BitFieldsT> rv{static_cast<U>(value) << kBitOffset};
+#ifndef NDEBUG
+    rv.preconditions.push_back(
+        {static_cast<U>(kMask) << kBitOffset, rv.to_add});
+#endif  // NDEBUG
+    return rv;
+  }
+
+  // Create a transform for adding a particular value, but ignoring any overflow
+  // in that field. This applies for fields that include the top bit of the
+  // underlying representation. Can be combined with other additive transforms
+  // for other fields.
+  static AddTransformer<BitFieldsT> PlusTransformIgnoreOverflow(V value) {
+    static_assert(kIncludesTopBit);
+    AddTransformer<BitFieldsT> rv{static_cast<U>(value) << kBitOffset};
+    return rv;
+  }
+
+  // Create a transform for subtracting a particular value, but with the
+  // precondition that subtracting the value will not underflow the field. This
+  // applies for fields that do not include the top bit of the underlying
+  // representation. Can be combined with other additive transforms for other
+  // fields.
+  static AddTransformer<BitFieldsT> MinusTransformPromiseNoUnderflow(V value) {
+    static_assert(!kIncludesTopBit);
+    AddTransformer<BitFieldsT> rv{U{0} - (static_cast<U>(value) << kBitOffset)};
+#ifndef NDEBUG
+    rv.preconditions.push_back(
+        {static_cast<U>(kMask) << kBitOffset, rv.to_add});
+#endif  // NDEBUG
+    return rv;
+  }
+
+  // Create a transform for subtracting a particular value, but ignoring any
+  // underflow in that field. This applies for fields that include the top bit
+  // of the underlying representation. Can be combined with other additive
+  // transforms for other fields.
+  static AddTransformer<BitFieldsT> MinusTransformIgnoreUnderflow(V value) {
+    static_assert(kIncludesTopBit);
+    AddTransformer<BitFieldsT> rv{U{0} - (static_cast<U>(value) << kBitOffset)};
+    return rv;
+  }
+};
+
+// A handy wrapper for a relaxed atomic on some BitFields type, like
+// RelaxedAtomic but without direct arithmetic operations. For encapsulation,
+// usual arithmetic atomic operations are only available by calling
+// ApplyRelaxed() on Transforms returned from field classes. Extending an
+// example from BitFields:
+//
+// auto transform = Field2::ClearTransform() + Field4::ClearTransform();
+// MyState old_state;
+// my_atomic.ApplyRelaxed(transform, &old_state);
+// auto field2_before_clearing = old_state.Get<Field2>();
+//
+template <typename BitFieldsT>
+class RelaxedBitFieldsAtomic {
+ public:
+  using U = typename BitFieldsT::U;
+  explicit RelaxedBitFieldsAtomic(BitFieldsT initial = {})
+      : v_(initial.underlying) {}
+  void StoreRelaxed(BitFieldsT desired) {
+    v_.store(desired.underlying, std::memory_order_relaxed);
+  }
+  BitFieldsT LoadRelaxed() const {
+    return BitFieldsT{v_.load(std::memory_order_relaxed)};
+  }
+  bool CasWeakRelaxed(BitFieldsT& expected, BitFieldsT desired) {
+    return v_.compare_exchange_weak(expected.underlying, desired.underlying,
+                                    std::memory_order_relaxed);
+  }
+  bool CasStrongRelaxed(BitFieldsT& expected, BitFieldsT desired) {
+    return v_.compare_exchange_strong(expected.underlying, desired.underlying,
+                                      std::memory_order_relaxed);
+  }
+  BitFieldsT ExchangeRelaxed(BitFieldsT desired) {
+    return BitFieldsT{
+        v_.exchange(desired.underlying, std::memory_order_relaxed)};
+  }
+  void ApplyRelaxed(const OrTransformer<BitFieldsT>& transform,
+                    BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    ApplyImpl<std::memory_order_relaxed>(transform, before, after);
+  }
+  void ApplyRelaxed(const AndTransformer<BitFieldsT>& transform,
+                    BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    ApplyImpl<std::memory_order_relaxed>(transform, before, after);
+  }
+  void ApplyRelaxed(const AddTransformer<BitFieldsT>& transform,
+                    BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    ApplyImpl<std::memory_order_relaxed>(transform, before, after);
+  }
+
+ protected:  // fns
+  template <std::memory_order kOrder>
+  void ApplyImpl(const OrTransformer<BitFieldsT>& transform,
+                 BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    U before_val = v_.fetch_or(transform.to_or, kOrder);
+    if (before) {
+      before->underlying = before_val;
+    }
+    if (after) {
+      after->underlying = before_val | transform.to_or;
+    }
+  }
+  template <std::memory_order kOrder>
+  void ApplyImpl(const AndTransformer<BitFieldsT>& transform,
+                 BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    U before_val = v_.fetch_and(transform.to_and, kOrder);
+    if (before) {
+      before->underlying = before_val;
+    }
+    if (after) {
+      after->underlying = before_val & transform.to_and;
+    }
+  }
+  template <std::memory_order kOrder>
+  void ApplyImpl(const AddTransformer<BitFieldsT>& transform,
+                 BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    U before_val = v_.fetch_add(transform.to_add, kOrder);
+    transform.AssertPreconditions(before_val);
+    if (before) {
+      before->underlying = before_val;
+    }
+    if (after) {
+      after->underlying = before_val + transform.to_add;
+    }
+  }
+
+ protected:  // data
+  std::atomic<U> v_;
+};
+
+// A handy wrapper for an aquire-release atomic (also relaxed semantics
+// available) on some BitFields type. See RelaxedBitFieldsAtomic and
+// Atomic in atomic.h for more info.
+template <typename BitFieldsT>
+class BitFieldsAtomic : public RelaxedBitFieldsAtomic<BitFieldsT> {
+ public:
+  using Base = RelaxedBitFieldsAtomic<BitFieldsT>;
+  using U = typename BitFieldsT::U;
+
+  explicit BitFieldsAtomic(BitFieldsT initial = {}) : Base(initial) {}
+
+  void Store(BitFieldsT desired) {
+    Base::v_.store(desired.underlying, std::memory_order_release);
+  }
+  BitFieldsT Load() const {
+    return BitFieldsT{Base::v_.load(std::memory_order_acquire)};
+  }
+  bool CasWeak(BitFieldsT& expected, BitFieldsT desired) {
+    return Base::v_.compare_exchange_weak(
+        expected.underlying, desired.underlying, std::memory_order_acq_rel);
+  }
+  bool CasStrong(BitFieldsT& expected, BitFieldsT desired) {
+    return Base::v_.compare_exchange_strong(
+        expected.underlying, desired.underlying, std::memory_order_acq_rel);
+  }
+  BitFieldsT Exchange(BitFieldsT desired) {
+    return BitFieldsT{
+        Base::v_.exchange(desired.underlying, std::memory_order_acq_rel)};
+  }
+  void Apply(const OrTransformer<BitFieldsT>& transform,
+             BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    Base::template ApplyImpl<std::memory_order_acq_rel>(transform, before,
+                                                        after);
+  }
+  void Apply(const AndTransformer<BitFieldsT>& transform,
+             BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    Base::template ApplyImpl<std::memory_order_acq_rel>(transform, before,
+                                                        after);
+  }
+  void Apply(const AddTransformer<BitFieldsT>& transform,
+             BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    Base::template ApplyImpl<std::memory_order_acq_rel>(transform, before,
+                                                        after);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/bloom_impl.h b/util/bloom_impl.h
index c9bbb125b8f1..3b2f1792934b 100644
--- a/util/bloom_impl.h
+++ b/util/bloom_impl.h
@@ -198,13 +198,13 @@ class FastLocalBloomImpl {
   }
 
   static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes,
-                             int num_probes, char *data) {
+                             int num_probes, char* data) {
     uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6;
     AddHashPrepared(h2, num_probes, data + bytes_to_cache_line);
   }
 
   static inline void AddHashPrepared(uint32_t h2, int num_probes,
-                                     char *data_at_cache_line) {
+                                     char* data_at_cache_line) {
     uint32_t h = h2;
     for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
       // 9-bit address within 512 bit cache line
@@ -214,8 +214,8 @@ class FastLocalBloomImpl {
   }
 
   static inline void PrepareHash(uint32_t h1, uint32_t len_bytes,
-                                 const char *data,
-                                 uint32_t /*out*/ *byte_offset) {
+                                 const char* data,
+                                 uint32_t /*out*/* byte_offset) {
     uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6;
     PREFETCH(data + bytes_to_cache_line, 0 /* rw */, 1 /* locality */);
     PREFETCH(data + bytes_to_cache_line + 63, 0 /* rw */, 1 /* locality */);
@@ -223,13 +223,13 @@ class FastLocalBloomImpl {
   }
 
   static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes,
-                                  int num_probes, const char *data) {
+                                  int num_probes, const char* data) {
     uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6;
     return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line);
   }
 
   static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes,
-                                          const char *data_at_cache_line) {
+                                          const char* data_at_cache_line) {
     uint32_t h = h2;
 #ifdef __AVX2__
     int rem_probes = num_probes;
@@ -277,8 +277,8 @@ class FastLocalBloomImpl {
       //                            /*bytes / i32*/ 4);
       // END Option 1
       // Potentially unaligned as we're not *always* cache-aligned -> loadu
-      const __m256i *mm_data =
-          reinterpret_cast<const __m256i *>(data_at_cache_line);
+      const __m256i* mm_data =
+          reinterpret_cast<const __m256i*>(data_at_cache_line);
       __m256i lower = _mm256_loadu_si256(mm_data);
       __m256i upper = _mm256_loadu_si256(mm_data + 1);
       // Option 2: AVX512VL permute hack
@@ -362,7 +362,7 @@ class LegacyNoLocalityBloomImpl {
   }
 
   static inline void AddHash(uint32_t h, uint32_t total_bits, int num_probes,
-                             char *data) {
+                             char* data) {
     const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
     for (int i = 0; i < num_probes; i++) {
       const uint32_t bitpos = h % total_bits;
@@ -372,7 +372,7 @@ class LegacyNoLocalityBloomImpl {
   }
 
   static inline bool HashMayMatch(uint32_t h, uint32_t total_bits,
-                                  int num_probes, const char *data) {
+                                  int num_probes, const char* data) {
     const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
     for (int i = 0; i < num_probes; i++) {
       const uint32_t bitpos = h % total_bits;
@@ -430,10 +430,10 @@ class LegacyLocalityBloomImpl {
   }
 
   static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes,
-                             char *data, int log2_cache_line_bytes) {
+                             char* data, int log2_cache_line_bytes) {
     const int log2_cache_line_bits = log2_cache_line_bytes + 3;
 
-    char *data_at_offset =
+    char* data_at_offset =
         data + (GetLine(h, num_lines) << log2_cache_line_bytes);
     const uint32_t delta = (h >> 17) | (h << 15);
     for (int i = 0; i < num_probes; ++i) {
@@ -448,8 +448,8 @@ class LegacyLocalityBloomImpl {
   }
 
   static inline void PrepareHashMayMatch(uint32_t h, uint32_t num_lines,
-                                         const char *data,
-                                         uint32_t /*out*/ *byte_offset,
+                                         const char* data,
+                                         uint32_t /*out*/* byte_offset,
                                          int log2_cache_line_bytes) {
     uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
     PREFETCH(data + b, 0 /* rw */, 1 /* locality */);
@@ -459,14 +459,14 @@ class LegacyLocalityBloomImpl {
   }
 
   static inline bool HashMayMatch(uint32_t h, uint32_t num_lines,
-                                  int num_probes, const char *data,
+                                  int num_probes, const char* data,
                                   int log2_cache_line_bytes) {
     uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
     return HashMayMatchPrepared(h, num_probes, data + b, log2_cache_line_bytes);
   }
 
   static inline bool HashMayMatchPrepared(uint32_t h, int num_probes,
-                                          const char *data_at_offset,
+                                          const char* data_at_offset,
                                           int log2_cache_line_bytes) {
     const int log2_cache_line_bits = log2_cache_line_bytes + 3;
 
diff --git a/util/bloom_test.cc b/util/bloom_test.cc
index f3dbe6373532..d4d9fb87e5d7 100644
--- a/util/bloom_test.cc
+++ b/util/bloom_test.cc
@@ -86,10 +86,7 @@ class FullBloomTest : public testing::TestWithParam<std::string> {
     ResetPolicy();
   }
 
-  BuiltinFilterBitsBuilder* GetBuiltinFilterBitsBuilder() {
-    // Throws on bad cast
-    return dynamic_cast<BuiltinFilterBitsBuilder*>(bits_builder_.get());
-  }
+  FilterBitsBuilder* GetFilterBitsBuilder() { return bits_builder_.get(); }
 
   const BloomLikeFilterPolicy* GetBloomLikeFilterPolicy() {
     // Throws on bad cast
@@ -239,7 +236,7 @@ TEST_P(FullBloomTest, FilterSize) {
     EXPECT_EQ(bpk.second, bfp->GetMillibitsPerKey());
     EXPECT_EQ((bpk.second + 500) / 1000, bfp->GetWholeBitsPerKey());
 
-    auto bits_builder = GetBuiltinFilterBitsBuilder();
+    auto bits_builder = GetFilterBitsBuilder();
     if (bpk.second == 0) {
       ASSERT_EQ(bits_builder, nullptr);
       continue;
diff --git a/util/cast_util.h b/util/cast_util.h
index 414feda9cbea..24532cda7866 100644
--- a/util/cast_util.h
+++ b/util/cast_util.h
@@ -39,19 +39,30 @@ inline std::shared_ptr<DestClass> static_cast_with_check(
 }
 
 // A wrapper around static_cast for lossless conversion between integral
-// types, including enum types. For example, this can be used for converting
-// between signed/unsigned or enum type and underlying type without fear of
-// stripping away data, now or in the future.
+// types, including enum types, and pointers to such types. For example, this
+// can be used for converting between signed/unsigned or enum type and
+// underlying type without fear of stripping away data, now or in the future.
 template <typename To, typename From>
 inline To lossless_cast(From x) {
-  using FromValue = typename std::remove_reference<From>::type;
-  static_assert(
-      std::is_integral<FromValue>::value || std::is_enum<FromValue>::value,
-      "Only works on integral types");
-  static_assert(std::is_integral<To>::value || std::is_enum<To>::value,
-                "Only works on integral types");
-  static_assert(sizeof(To) >= sizeof(FromValue), "Must be lossless");
-  return static_cast<To>(x);
+  using FromValue = typename std::remove_reference_t<From>;
+  if constexpr (std::is_pointer_v<FromValue>) {
+    static_assert(std::is_pointer_v<To>);
+    using FromDeref = typename std::remove_pointer_t<FromValue>;
+    using ToDeref = typename std::remove_pointer_t<To>;
+    static_assert(std::is_integral_v<FromDeref> || std::is_enum_v<FromDeref>,
+                  "Only works on integral types");
+    static_assert(std::is_integral_v<ToDeref> || std::is_enum_v<To>,
+                  "Only works on integral types");
+    static_assert(sizeof(ToDeref) == sizeof(FromDeref), "Must be lossless");
+    return reinterpret_cast<To>(x);
+  } else {
+    static_assert(std::is_integral_v<FromValue> || std::is_enum_v<FromValue>,
+                  "Only works on integral types");
+    static_assert(std::is_integral_v<To> || std::is_enum_v<To>,
+                  "Only works on integral types");
+    static_assert(sizeof(To) >= sizeof(FromValue), "Must be lossless");
+    return static_cast<To>(x);
+  }
 }
 
 // For disambiguating a potentially heterogeneous aggregate as a homogeneous
diff --git a/util/coding.h b/util/coding.h
index 929c8e42c462..2d7522478461 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -21,6 +21,7 @@
 
 #include "port/port.h"
 #include "rocksdb/slice.h"
+#include "util/cast_util.h"
 #include "util/coding_lean.h"
 
 // Some processors does not allow unaligned access to memory
@@ -91,7 +92,7 @@ inline const char* GetVarsignedint64Ptr(const char* p, const char* limit,
 }
 
 // Returns the length of the varint32 or varint64 encoding of "v"
-int VarintLength(uint64_t v);
+uint16_t VarintLength(uint64_t v);
 
 // Lower-level versions of Put... that write directly into a character buffer
 // and return a pointer just past the last byte written.
@@ -105,7 +106,7 @@ const char* GetVarint32PtrFallback(const char* p, const char* limit,
 inline const char* GetVarint32Ptr(const char* p, const char* limit,
                                   uint32_t* value) {
   if (p < limit) {
-    uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
+    uint32_t result = *(lossless_cast<const unsigned char*>(p));
     if ((result & 128) == 0) {
       *value = result;
       return p + 1;
@@ -172,13 +173,13 @@ inline void PutVarint32Varint32Varint32(std::string* dst, uint32_t v1,
 
 inline char* EncodeVarint64(char* dst, uint64_t v) {
   static const unsigned int B = 128;
-  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  unsigned char* ptr = lossless_cast<unsigned char*>(dst);
   while (v >= B) {
     *(ptr++) = (v & (B - 1)) | B;
     v >>= 7;
   }
   *(ptr++) = static_cast<unsigned char>(v);
-  return reinterpret_cast<char*>(ptr);
+  return lossless_cast<char*>(ptr);
 }
 
 inline void PutVarint64(std::string* dst, uint64_t v) {
@@ -244,8 +245,8 @@ inline void PutLengthPrefixedSlicePartsWithPadding(
   dst->append(pad_sz, '\0');
 }
 
-inline int VarintLength(uint64_t v) {
-  int len = 1;
+inline uint16_t VarintLength(uint64_t v) {
+  uint16_t len = 1;
   while (v >= 128) {
     v >>= 7;
     len++;
@@ -354,8 +355,7 @@ __attribute__((__no_sanitize__("alignment")))
 __attribute__((__no_sanitize_undefined__))
 #endif
 #endif
-inline void
-PutUnaligned(T* memory, const T& value) {
+inline void PutUnaligned(T* memory, const T& value) {
 #if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED)
   char* nonAlignedMemory = reinterpret_cast<char*>(memory);
   memcpy(nonAlignedMemory, reinterpret_cast<const char*>(&value), sizeof(T));
@@ -372,8 +372,7 @@ __attribute__((__no_sanitize__("alignment")))
 __attribute__((__no_sanitize_undefined__))
 #endif
 #endif
-inline void
-GetUnaligned(const T* memory, T* value) {
+inline void GetUnaligned(const T* memory, T* value) {
 #if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED)
   char* nonAlignedMemory = reinterpret_cast<char*>(value);
   memcpy(nonAlignedMemory, reinterpret_cast<const char*>(memory), sizeof(T));
diff --git a/util/compaction_job_stats_impl.cc b/util/compaction_job_stats_impl.cc
index 895db35c1e87..1d8eaa3693d8 100644
--- a/util/compaction_job_stats_impl.cc
+++ b/util/compaction_job_stats_impl.cc
@@ -11,7 +11,7 @@ void CompactionJobStats::Reset() {
   elapsed_micros = 0;
   cpu_micros = 0;
 
-  has_num_input_records = true;
+  has_accurate_num_input_records = true;
   num_input_records = 0;
   num_blobs_read = 0;
   num_input_files = 0;
@@ -59,7 +59,7 @@ void CompactionJobStats::Add(const CompactionJobStats& stats) {
   elapsed_micros += stats.elapsed_micros;
   cpu_micros += stats.cpu_micros;
 
-  has_num_input_records &= stats.has_num_input_records;
+  has_accurate_num_input_records &= stats.has_accurate_num_input_records;
   num_input_records += stats.num_input_records;
   num_blobs_read += stats.num_blobs_read;
   num_input_files += stats.num_input_files;
diff --git a/util/compression.cc b/util/compression.cc
index 197b5a69d121..f5ceb7a149df 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -5,8 +5,156 @@
 
 #include "util/compression.h"
 
+#ifdef BZIP2
+#include <bzlib.h>
+#endif  // BZIP2
+
+#include <limits>
+
+#ifdef LZ4
+#include <lz4.h>
+#include <lz4hc.h>
+#if LZ4_VERSION_NUMBER < 10700  // < r129
+#error "LZ4 support requires version >= 1.7.0 (lz4-devel)"
+#endif  // LZ4_VERSION_NUMBER < 10700
+#endif  // LZ4
+
+#ifdef SNAPPY
+#include <snappy-sinksource.h>
+#include <snappy.h>
+#endif  // SNAPPY
+
+#ifdef ZLIB
+#include <zlib.h>
+#endif  // ZLIB
+
+#include "options/options_helper.h"
+#include "port/likely.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
 namespace ROCKSDB_NAMESPACE {
 
+// WART: does not match OptionsHelper::compression_type_string_map
+std::string CompressionTypeToString(CompressionType compression_type) {
+  switch (compression_type) {
+    case kNoCompression:
+      return "NoCompression";
+    case kSnappyCompression:
+      return "Snappy";
+    case kZlibCompression:
+      return "Zlib";
+    case kBZip2Compression:
+      return "BZip2";
+    case kLZ4Compression:
+      return "LZ4";
+    case kLZ4HCCompression:
+      return "LZ4HC";
+    case kXpressCompression:
+      return "Xpress";
+    case kZSTD:
+      return "ZSTD";
+    case kDisableCompressionOption:
+      return "DisableOption";
+    default: {
+      bool is_custom = compression_type >= kFirstCustomCompression &&
+                       compression_type <= kLastCustomCompression;
+      unsigned char c = lossless_cast<unsigned char>(compression_type);
+      return (is_custom ? "Custom" : "Reserved") +
+             ToBaseCharsString<16>(2, c, /*uppercase=*/true);
+    }
+  }
+}
+
+// WART: does not match OptionsHelper::compression_type_string_map
+CompressionType CompressionTypeFromString(std::string compression_type_str) {
+  if (!compression_type_str.empty()) {
+    switch (compression_type_str[0]) {
+      case 'N':
+        if (compression_type_str == "NoCompression") {
+          return kNoCompression;
+        }
+        break;
+      case 'S':
+        if (compression_type_str == "Snappy") {
+          return kSnappyCompression;
+        }
+        break;
+      case 'Z':
+        if (compression_type_str == "ZSTD") {
+          return kZSTD;
+        }
+        if (compression_type_str == "Zlib") {
+          return kZlibCompression;
+        }
+        break;
+      case 'B':
+        if (compression_type_str == "BZip2") {
+          return kBZip2Compression;
+        }
+        break;
+      case 'L':
+        if (compression_type_str == "LZ4") {
+          return kLZ4Compression;
+        }
+        if (compression_type_str == "LZ4HC") {
+          return kLZ4HCCompression;
+        }
+        break;
+      case 'X':
+        if (compression_type_str == "Xpress") {
+          return kXpressCompression;
+        }
+        break;
+      default:;
+    }
+  }
+  // unrecognized
+  return kDisableCompressionOption;
+}
+
+std::string CompressionOptionsToString(
+    const CompressionOptions& compression_options) {
+  std::string result;
+  result.reserve(512);
+  result.append("window_bits=")
+      .append(std::to_string(compression_options.window_bits))
+      .append("; ");
+  result.append("level=")
+      .append(std::to_string(compression_options.level))
+      .append("; ");
+  result.append("strategy=")
+      .append(std::to_string(compression_options.strategy))
+      .append("; ");
+  result.append("max_dict_bytes=")
+      .append(std::to_string(compression_options.max_dict_bytes))
+      .append("; ");
+  result.append("zstd_max_train_bytes=")
+      .append(std::to_string(compression_options.zstd_max_train_bytes))
+      .append("; ");
+  // NOTE: parallel_threads is skipped because it doesn't really affect the file
+  // contents written, arguably doesn't belong in CompressionOptions
+  result.append("enabled=")
+      .append(std::to_string(compression_options.enabled))
+      .append("; ");
+  result.append("max_dict_buffer_bytes=")
+      .append(std::to_string(compression_options.max_dict_buffer_bytes))
+      .append("; ");
+  result.append("use_zstd_dict_trainer=")
+      .append(std::to_string(compression_options.use_zstd_dict_trainer))
+      .append("; ");
+  result.append("max_compressed_bytes_per_kb=")
+      .append(std::to_string(compression_options.max_compressed_bytes_per_kb))
+      .append("; ");
+  result.append("checksum=")
+      .append(std::to_string(compression_options.checksum))
+      .append("; ");
+  return result;
+}
+
 StreamingCompress* StreamingCompress::Create(CompressionType compression_type,
                                              const CompressionOptions& opts,
                                              uint32_t compress_format_version,
@@ -119,4 +267,1671 @@ void ZSTDStreamingUncompress::Reset() {
 #endif
 }
 
+void DecompressorDict::Populate(Decompressor& from_decompressor, Slice dict) {
+  if (UNLIKELY(dict.empty())) {
+    dict_str_ = {};
+    dict_allocation_ = {};
+    // Appropriately reject bad files with empty dictionary block.
+    // It is longstanding not to write an empty dictionary block:
+    // https://github.com/facebook/rocksdb/blame/10.2.fb/table/block_based/block_based_table_builder.cc#L1841
+    decompressor_ = std::make_unique<FailureDecompressor>(
+        Status::Corruption("Decompression dictionary is empty"));
+  } else {
+    Status s = from_decompressor.MaybeCloneForDict(dict, &decompressor_);
+    if (decompressor_ == nullptr) {
+      dict_str_ = {};
+      dict_allocation_ = {};
+      assert(!s.ok());
+      decompressor_ = std::make_unique<FailureDecompressor>(std::move(s));
+    } else {
+      assert(s.ok());
+      assert(decompressor_->GetSerializedDict() == dict);
+    }
+  }
+
+  memory_usage_ = sizeof(struct DecompressorDict);
+  memory_usage_ += dict_str_.size();
+  if (dict_allocation_) {
+    auto allocator = dict_allocation_.get_deleter().allocator;
+    if (allocator) {
+      memory_usage_ +=
+          allocator->UsableSize(dict_allocation_.get(), GetRawDict().size());
+    } else {
+      memory_usage_ += GetRawDict().size();
+    }
+  }
+  memory_usage_ += decompressor_->ApproximateOwnedMemoryUsage();
+}
+
+// ZSTD dictionary training implementations
+std::string ZSTD_TrainDictionary(const std::string& samples,
+                                 const std::vector<size_t>& sample_lens,
+                                 size_t max_dict_bytes) {
+#ifdef ZSTD
+  assert(samples.empty() == sample_lens.empty());
+  if (samples.empty()) {
+    return "";
+  }
+  std::string dict_data(max_dict_bytes, '\0');
+  size_t dict_len = ZDICT_trainFromBuffer(
+      &dict_data[0], max_dict_bytes, &samples[0], &sample_lens[0],
+      static_cast<unsigned>(sample_lens.size()));
+  if (ZDICT_isError(dict_len)) {
+    return "";
+  }
+  assert(dict_len <= max_dict_bytes);
+  dict_data.resize(dict_len);
+  return dict_data;
+#else
+  assert(false);
+  (void)samples;
+  (void)sample_lens;
+  (void)max_dict_bytes;
+  return "";
+#endif  // ZSTD
+}
+
+std::string ZSTD_TrainDictionary(const std::string& samples,
+                                 size_t sample_len_shift,
+                                 size_t max_dict_bytes) {
+#ifdef ZSTD
+  // skips potential partial sample at the end of "samples"
+  size_t num_samples = samples.size() >> sample_len_shift;
+  std::vector<size_t> sample_lens(num_samples, size_t(1) << sample_len_shift);
+  return ZSTD_TrainDictionary(samples, sample_lens, max_dict_bytes);
+#else
+  assert(false);
+  (void)samples;
+  (void)sample_len_shift;
+  (void)max_dict_bytes;
+  return "";
+#endif  // ZSTD
+}
+
+std::string ZSTD_FinalizeDictionary(const std::string& samples,
+                                    const std::vector<size_t>& sample_lens,
+                                    size_t max_dict_bytes, int level) {
+#ifdef ROCKSDB_ZDICT_FINALIZE
+  assert(samples.empty() == sample_lens.empty());
+  if (samples.empty()) {
+    return "";
+  }
+  if (level == CompressionOptions::kDefaultCompressionLevel) {
+    // NB: ZSTD_CLEVEL_DEFAULT is historically == 3
+    level = ZSTD_CLEVEL_DEFAULT;
+  }
+  std::string dict_data(max_dict_bytes, '\0');
+  size_t dict_len = ZDICT_finalizeDictionary(
+      dict_data.data(), max_dict_bytes, samples.data(),
+      std::min(static_cast<size_t>(samples.size()), max_dict_bytes),
+      samples.data(), sample_lens.data(),
+      static_cast<unsigned>(sample_lens.size()),
+      {level, 0 /* notificationLevel */, 0 /* dictID */});
+  if (ZDICT_isError(dict_len)) {
+    return "";
+  } else {
+    assert(dict_len <= max_dict_bytes);
+    dict_data.resize(dict_len);
+    return dict_data;
+  }
+#else
+  assert(false);
+  (void)samples;
+  (void)sample_lens;
+  (void)max_dict_bytes;
+  (void)level;
+  return "";
+#endif  // ROCKSDB_ZDICT_FINALIZE
+}
+
+// ***********************************************************************
+// BEGIN built-in implementation of customization interface
+// ***********************************************************************
+Status Decompressor::ExtractUncompressedSize(Args& args) {
+  // Default implementation:
+  //
+  // Standard format for prepending uncompressed size to the compressed
+  // payload. (RocksDB compress_format_version=2 except Snappy)
+  //
+  // This is historically a varint32, but it is preliminarily generalized
+  // to varint64, in case that is supported on the write side for some
+  // algorithms.
+  if (LIKELY(GetVarint64(&args.compressed_data, &args.uncompressed_size))) {
+    if (LIKELY(args.uncompressed_size <= SIZE_MAX)) {
+      return Status::OK();
+    } else {
+      return Status::MemoryLimit("Uncompressed size too large for platform");
+    }
+  } else {
+    return Status::Corruption("Unable to extract uncompressed size");
+  }
+}
+
+const Slice& Decompressor::GetSerializedDict() const {
+  // Default: empty slice => no dictionary
+  static Slice kEmptySlice;
+  return kEmptySlice;
+}
+
+namespace {
+
+class CompressorBase : public Compressor {
+ public:
+  explicit CompressorBase(const CompressionOptions& opts) : opts_(opts) {}
+
+ protected:
+  CompressionOptions opts_;
+};
+
+class CompressorWithSimpleDictBase : public CompressorBase {
+ public:
+  explicit CompressorWithSimpleDictBase(const CompressionOptions& opts,
+                                        std::string&& dict_data = {})
+      : CompressorBase(opts), dict_data_(std::move(dict_data)) {}
+
+  DictConfig GetDictGuidance(CacheEntryRole /*block_type*/) const override {
+    if (opts_.max_dict_bytes == 0) {
+      return DictDisabled{};
+    }
+    return DictSampling{opts_.max_dict_bytes};
+  }
+
+  // NOTE: empty dict is equivalent to no dict
+  Slice GetSerializedDict() const override { return dict_data_; }
+
+  std::unique_ptr<Compressor> Clone() const override {
+    return CloneForDict(std::string{dict_data_});
+  }
+
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole /*block_type*/,
+      DictConfigArgs&& dict_config) const final override {
+    if (auto* samples = std::get_if<DictSamples>(&dict_config)) {
+      assert(samples->Verify());
+      if (samples->empty()) {
+        return nullptr;
+      }
+      return CloneForDict(std::move(samples->sample_data));
+    } else if (auto* predef = std::get_if<DictPreDefined>(&dict_config)) {
+      if (predef->dict_data.empty()) {
+        return nullptr;
+      }
+      return CloneForDict(std::move(predef->dict_data));
+    } else {
+      assert(std::holds_alternative<DictDisabled>(dict_config));
+      return nullptr;
+    }
+  }
+
+  virtual std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const = 0;
+
+ protected:
+  const std::string dict_data_;
+};
+
+// NOTE: the legacy behavior is to pretend to use dictionary compression when
+// enabled, including storing a dictionary block, but to ignore it. That is
+// matched here.
+class BuiltinSnappyCompressorV2 final : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinSnappyCompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kSnappyCompression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
+    return std::make_unique<BuiltinSnappyCompressorV2>(opts_,
+                                                       std::move(dict_data));
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea*) override {
+#ifdef SNAPPY
+    struct MySink : public snappy::Sink {
+      MySink(char* output, size_t output_size)
+          : output_(output), output_size_(output_size) {}
+
+      char* output_;
+      size_t output_size_;
+      size_t pos_ = 0;
+
+      void Append(const char* data, size_t n) override {
+        if (pos_ + n <= output_size_) {
+          std::memcpy(output_ + pos_, data, n);
+          pos_ += n;
+        } else {
+          // Virtual abort
+          pos_ = output_size_ + 1;
+        }
+      }
+
+      char* GetAppendBuffer(size_t length, char* scratch) override {
+        if (pos_ + length <= output_size_) {
+          return output_ + pos_;
+        }
+        return scratch;
+      }
+    };
+    MySink sink{compressed_output, *compressed_output_size};
+    snappy::ByteArraySource source{uncompressed_data.data(),
+                                   uncompressed_data.size()};
+
+    size_t outlen = snappy::Compress(&source, &sink);
+    if (outlen > 0 && sink.pos_ <= sink.output_size_) {
+      // Compression kept/successful
+      assert(outlen == sink.pos_);
+      *compressed_output_size = outlen;
+      *out_compression_type = kSnappyCompression;
+      return Status::OK();
+    }
+    // Compression rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+
+  std::shared_ptr<Decompressor> GetOptimizedDecompressor() const override;
+};
+
+[[maybe_unused]]
+std::pair<char*, size_t> StartCompressBlockV2(Slice uncompressed_data,
+                                              char* compressed_output,
+                                              size_t compressed_output_size) {
+  if (  // Can't compress more than 4GB
+      uncompressed_data.size() > std::numeric_limits<uint32_t>::max() ||
+      // Need enough output space for encoding uncompressed size
+      compressed_output_size <= 5) {
+    // Compression bypassed
+    return {nullptr, 0};
+  }
+  // Standard format for prepending uncompressed size to the compressed
+  // data in compress_format_version=2
+  char* alg_output = EncodeVarint32(
+      compressed_output, static_cast<uint32_t>(uncompressed_data.size()));
+  size_t alg_max_output_size =
+      compressed_output_size - (alg_output - compressed_output);
+  return {alg_output, alg_max_output_size};
+}
+
+class BuiltinZlibCompressorV2 final : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinZlibCompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kZlibCompression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
+    return std::make_unique<BuiltinZlibCompressorV2>(opts_,
+                                                     std::move(dict_data));
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea*) override {
+#ifdef ZLIB
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    // The memLevel parameter specifies how much memory should be allocated for
+    // the internal compression state.
+    // memLevel=1 uses minimum memory but is slow and reduces compression ratio.
+    // memLevel=9 uses maximum memory for optimal speed.
+    // The default value is 8. See zconf.h for more details.
+    static const int memLevel = 8;
+    int level = opts_.level;
+    if (level == CompressionOptions::kDefaultCompressionLevel) {
+      level = Z_DEFAULT_COMPRESSION;
+    }
+
+    z_stream stream;
+    memset(&stream, 0, sizeof(z_stream));
+
+    // Initialize the zlib stream
+    int st = deflateInit2(&stream, level, Z_DEFLATED, opts_.window_bits,
+                          memLevel, opts_.strategy);
+    if (st != Z_OK) {
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    // Set dictionary if available
+    if (!dict_data_.empty()) {
+      st = deflateSetDictionary(
+          &stream, reinterpret_cast<const Bytef*>(dict_data_.data()),
+          static_cast<unsigned int>(dict_data_.size()));
+      if (st != Z_OK) {
+        deflateEnd(&stream);
+        *compressed_output_size = 0;
+        *out_compression_type = kNoCompression;
+        return Status::OK();
+      }
+    }
+
+    // Set up input
+    stream.next_in = (Bytef*)uncompressed_data.data();
+    stream.avail_in = static_cast<unsigned int>(uncompressed_data.size());
+
+    // Set up output
+    stream.next_out = reinterpret_cast<Bytef*>(alg_output);
+    stream.avail_out = static_cast<unsigned int>(alg_max_output_size);
+
+    // Compress
+    st = deflate(&stream, Z_FINISH);
+    size_t outlen = alg_max_output_size - stream.avail_out;
+    deflateEnd(&stream);
+
+    if (st == Z_STREAM_END) {
+      // Compression kept/successful
+      *compressed_output_size =
+          outlen + /*header size*/ (alg_output - compressed_output);
+      *out_compression_type = kZlibCompression;
+      return Status::OK();
+    }
+    // Compression failed or rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
+class BuiltinBZip2CompressorV2 final : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinBZip2CompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kBZip2Compression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
+    return std::make_unique<BuiltinBZip2CompressorV2>(opts_,
+                                                      std::move(dict_data));
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea*) override {
+#ifdef BZIP2
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    // BZip2 doesn't actually use the dictionary, but we store it for
+    // compatibility similar to BuiltinSnappyCompressorV2
+
+    // Initialize the bzip2 stream
+    bz_stream stream;
+    memset(&stream, 0, sizeof(bz_stream));
+
+    // Block size 1 is 100K.
+    // 0 is for silent.
+    // 30 is the default workFactor
+    int st = BZ2_bzCompressInit(&stream, 1, 0, 30);
+    if (st != BZ_OK) {
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    // Set up input
+    stream.next_in = const_cast<char*>(uncompressed_data.data());
+    stream.avail_in = static_cast<unsigned int>(uncompressed_data.size());
+
+    // Set up output
+    stream.next_out = alg_output;
+    stream.avail_out = static_cast<unsigned int>(alg_max_output_size);
+
+    // Compress
+    st = BZ2_bzCompress(&stream, BZ_FINISH);
+    size_t outlen = alg_max_output_size - stream.avail_out;
+    BZ2_bzCompressEnd(&stream);
+
+    // Check for success
+    if (st == BZ_STREAM_END) {
+      // Compression kept/successful
+      *compressed_output_size = outlen + (alg_output - compressed_output);
+      *out_compression_type = kBZip2Compression;
+      return Status::OK();
+    }
+    // Compression failed or rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
+class BuiltinLZ4CompressorV2WithDict : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinLZ4CompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kLZ4Compression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
+    return std::make_unique<BuiltinLZ4CompressorV2WithDict>(
+        opts_, std::move(dict_data));
+  }
+
+  ManagedWorkingArea ObtainWorkingArea() override {
+#ifdef LZ4
+    return {reinterpret_cast<WorkingArea*>(LZ4_createStream()), this};
+#else
+    return {};
+#endif
+  }
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    if (wa) {
+#ifdef LZ4
+      LZ4_freeStream(reinterpret_cast<LZ4_stream_t*>(wa));
+#endif
+    }
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+#ifdef LZ4
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    ManagedWorkingArea tmp_wa;
+    LZ4_stream_t* stream;
+    if (wa != nullptr && wa->owner() == this) {
+      stream = reinterpret_cast<LZ4_stream_t*>(wa->get());
+#if LZ4_VERSION_NUMBER >= 10900  // >= version 1.9.0
+      LZ4_resetStream_fast(stream);
+#else
+      LZ4_resetStream(stream);
+#endif
+    } else {
+      tmp_wa = ObtainWorkingArea();
+      stream = reinterpret_cast<LZ4_stream_t*>(tmp_wa.get());
+    }
+    if (!dict_data_.empty()) {
+      // TODO: more optimization possible here?
+      LZ4_loadDict(stream, dict_data_.data(),
+                   static_cast<int>(dict_data_.size()));
+    }
+    int acceleration;
+    if (opts_.level < 0) {
+      acceleration = -opts_.level;
+    } else {
+      acceleration = 1;
+    }
+    auto outlen = LZ4_compress_fast_continue(
+        stream, uncompressed_data.data(), alg_output,
+        static_cast<int>(uncompressed_data.size()),
+        static_cast<int>(alg_max_output_size), acceleration);
+    if (outlen > 0) {
+      // Compression kept/successful
+      size_t output_size = static_cast<size_t>(
+          outlen + /*header size*/ (alg_output - compressed_output));
+      assert(output_size <= *compressed_output_size);
+      *compressed_output_size = output_size;
+      *out_compression_type = kLZ4Compression;
+      return Status::OK();
+    }
+    // Compression rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    (void)wa;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
+class BuiltinLZ4CompressorV2NoDict final
+    : public BuiltinLZ4CompressorV2WithDict {
+ public:
+  BuiltinLZ4CompressorV2NoDict(const CompressionOptions& opts)
+      : BuiltinLZ4CompressorV2WithDict(opts, /*dict_data=*/{}) {}
+
+  std::unique_ptr<Compressor> Clone() const override {
+    return std::make_unique<BuiltinLZ4CompressorV2NoDict>(opts_);
+  }
+
+  ManagedWorkingArea ObtainWorkingArea() override {
+    // Using an LZ4_stream_t between compressions and resetting with
+    // LZ4_resetStream_fast is actually slower than using a fresh LZ4_stream_t
+    // each time, or not involving a stream at all. Similarly, using an extState
+    // does not seem to offer a performance boost, perhaps a small regression.
+    return {};
+  }
+
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    // Should not be called
+    (void)wa;
+    assert(wa == nullptr);
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+#ifdef LZ4
+    (void)wa;
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+    int acceleration;
+    if (opts_.level < 0) {
+      acceleration = -opts_.level;
+    } else {
+      acceleration = 1;
+    }
+    auto outlen =
+        LZ4_compress_fast(uncompressed_data.data(), alg_output,
+                          static_cast<int>(uncompressed_data.size()),
+                          static_cast<int>(alg_max_output_size), acceleration);
+    if (outlen > 0) {
+      // Compression kept/successful
+      size_t output_size = static_cast<size_t>(
+          outlen + /*header size*/ (alg_output - compressed_output));
+      assert(output_size <= *compressed_output_size);
+      *compressed_output_size = output_size;
+      *out_compression_type = kLZ4Compression;
+      return Status::OK();
+    }
+    // Compression rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    (void)wa;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
+class BuiltinLZ4HCCompressorV2 final : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinLZ4HCCompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kLZ4HCCompression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
+    return std::make_unique<BuiltinLZ4HCCompressorV2>(opts_,
+                                                      std::move(dict_data));
+  }
+
+  ManagedWorkingArea ObtainWorkingArea() override {
+#ifdef LZ4
+    return {reinterpret_cast<WorkingArea*>(LZ4_createStreamHC()), this};
+#else
+    return {};
+#endif
+  }
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    if (wa) {
+#ifdef LZ4
+      LZ4_freeStreamHC(reinterpret_cast<LZ4_streamHC_t*>(wa));
+#endif
+    }
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+#ifdef LZ4
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    int level = opts_.level;
+    if (level == CompressionOptions::kDefaultCompressionLevel) {
+      level = 0;  // lz4hc.h says any value < 1 will be sanitized to default
+    }
+
+    ManagedWorkingArea tmp_wa;
+    LZ4_streamHC_t* stream;
+    if (wa != nullptr && wa->owner() == this) {
+      stream = reinterpret_cast<LZ4_streamHC_t*>(wa->get());
+    } else {
+      tmp_wa = ObtainWorkingArea();
+      stream = reinterpret_cast<LZ4_streamHC_t*>(tmp_wa.get());
+    }
+#if LZ4_VERSION_NUMBER >= 10900  // >= version 1.9.0
+    LZ4_resetStreamHC_fast(stream, level);
+#else
+    LZ4_resetStreamHC(stream, level);
+#endif
+    if (dict_data_.size() > 0) {
+      // TODO: more optimization possible here?
+      LZ4_loadDictHC(stream, dict_data_.data(),
+                     static_cast<int>(dict_data_.size()));
+    }
+
+    auto outlen =
+        LZ4_compress_HC_continue(stream, uncompressed_data.data(), alg_output,
+                                 static_cast<int>(uncompressed_data.size()),
+                                 static_cast<int>(alg_max_output_size));
+    if (outlen > 0) {
+      // Compression kept/successful
+      size_t output_size = static_cast<size_t>(
+          outlen + /*header size*/ (alg_output - compressed_output));
+      assert(output_size <= *compressed_output_size);
+      *compressed_output_size = output_size;
+      *out_compression_type = kLZ4HCCompression;
+      return Status::OK();
+    }
+    // Compression rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    (void)wa;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
+class BuiltinXpressCompressorV2 final : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinXpressCompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kXpressCompression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
+    return std::make_unique<BuiltinXpressCompressorV2>(opts_,
+                                                       std::move(dict_data));
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea*) override {
+#ifdef XPRESS
+    // XPRESS doesn't actually use the dictionary, but we store it for
+    // compatibility similar to BuiltinSnappyCompressorV2
+
+    // Use the new CompressWithMaxSize function that writes directly to the
+    // output buffer
+    size_t compressed_size = port::xpress::CompressWithMaxSize(
+        uncompressed_data.data(), uncompressed_data.size(), compressed_output,
+        *compressed_output_size);
+
+    if (compressed_size > 0) {
+      // Compression kept/successful
+      *compressed_output_size = compressed_size;
+      *out_compression_type = kXpressCompression;
+      return Status::OK();
+    }
+
+    // Compression rejected or failed
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
+class BuiltinZSTDCompressorV2 final : public CompressorBase {
+ public:
+  explicit BuiltinZSTDCompressorV2(const CompressionOptions& opts,
+                                   CompressionDict&& dict = {})
+      : CompressorBase(opts), dict_(std::move(dict)) {}
+
+  const char* Name() const override { return "BuiltinZSTDCompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override { return kZSTD; }
+
+  std::unique_ptr<Compressor> Clone() const override {
+    CompressionDict dict_copy{dict_.GetRawDict().ToString(), kZSTD,
+                              opts_.level};
+    return std::make_unique<BuiltinZSTDCompressorV2>(opts_,
+                                                     std::move(dict_copy));
+  }
+
+  DictConfig GetDictGuidance(CacheEntryRole /*block_type*/) const override {
+    if (opts_.max_dict_bytes == 0) {
+      // Dictionary compression disabled
+      return DictDisabled{};
+    } else {
+      size_t max_sample_bytes = opts_.zstd_max_train_bytes > 0
+                                    ? opts_.zstd_max_train_bytes
+                                    : opts_.max_dict_bytes;
+      return DictSampling{max_sample_bytes};
+    }
+  }
+
+  // NOTE: empty dict is equivalent to no dict
+  Slice GetSerializedDict() const override { return dict_.GetRawDict(); }
+
+  ManagedWorkingArea ObtainWorkingArea() override {
+#ifdef ZSTD
+    ZSTD_CCtx* ctx =
+#ifdef ROCKSDB_ZSTD_CUSTOM_MEM
+        ZSTD_createCCtx_advanced(port::GetJeZstdAllocationOverrides());
+#else   // ROCKSDB_ZSTD_CUSTOM_MEM
+        ZSTD_createCCtx();
+#endif  // ROCKSDB_ZSTD_CUSTOM_MEM
+    auto level = opts_.level;
+    if (level == CompressionOptions::kDefaultCompressionLevel) {
+      // NB: ZSTD_CLEVEL_DEFAULT is historically == 3
+      level = ZSTD_CLEVEL_DEFAULT;
+    }
+    size_t err = ZSTD_CCtx_setParameter(ctx, ZSTD_c_compressionLevel, level);
+    if (ZSTD_isError(err)) {
+      assert(false);
+      ZSTD_freeCCtx(ctx);
+      ctx = ZSTD_createCCtx();
+    }
+    if (opts_.checksum) {
+      err = ZSTD_CCtx_setParameter(ctx, ZSTD_c_checksumFlag, 1);
+      if (ZSTD_isError(err)) {
+        assert(false);
+        ZSTD_freeCCtx(ctx);
+        ctx = ZSTD_createCCtx();
+      }
+    }
+    return ManagedWorkingArea(reinterpret_cast<WorkingArea*>(ctx), this);
+#else
+    return {};
+#endif  // ZSTD
+  }
+
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    if (wa) {
+#ifdef ZSTD
+      ZSTD_freeCCtx(reinterpret_cast<ZSTD_CCtx*>(wa));
+#endif  // ZSTD
+    }
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+#ifdef ZSTD
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    ManagedWorkingArea tmp_wa;
+    if (wa == nullptr || wa->owner() != this) {
+      tmp_wa = ObtainWorkingArea();
+      wa = &tmp_wa;
+    }
+    assert(wa->get() != nullptr);
+    ZSTD_CCtx* ctx = reinterpret_cast<ZSTD_CCtx*>(wa->get());
+
+    if (dict_.GetDigestedZstdCDict() != nullptr) {
+      ZSTD_CCtx_refCDict(ctx, dict_.GetDigestedZstdCDict());
+    } else {
+      ZSTD_CCtx_loadDictionary(ctx, dict_.GetRawDict().data(),
+                               dict_.GetRawDict().size());
+    }
+
+    // Compression level is set in `contex` during ObtainWorkingArea()
+    size_t outlen =
+        ZSTD_compress2(ctx, alg_output, alg_max_output_size,
+                       uncompressed_data.data(), uncompressed_data.size());
+    if (!ZSTD_isError(outlen)) {
+      // Compression kept/successful
+      size_t output_size = static_cast<size_t>(
+          outlen + /*header size*/ (alg_output - compressed_output));
+      assert(output_size <= *compressed_output_size);
+      *compressed_output_size = output_size;
+      *out_compression_type = kZSTD;
+      return Status::OK();
+    }
+    if (ZSTD_getErrorCode(outlen) != ZSTD_error_dstSize_tooSmall) {
+      return Status::Corruption(std::string("ZSTD_compress2 failed: ") +
+                                ZSTD_getErrorName(outlen));
+    }
+    // Compression rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    (void)wa;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole /*block_type*/,
+      DictConfigArgs&& dict_config) const override {
+    // Handle DictDisabled
+    // TODO: use holds_alternative
+    if (auto* disabled = std::get_if<DictDisabled>(&dict_config)) {
+      (void)disabled;
+      return nullptr;
+    }
+
+    std::string dict_data;
+
+    // Handle DictPreDefined - use the pre-defined dictionary directly
+    if (auto* predef = std::get_if<DictPreDefined>(&dict_config)) {
+      if (predef->dict_data.empty()) {
+        return nullptr;
+      }
+      dict_data = std::move(predef->dict_data);
+    }
+
+    // Handle DictSamples - train dictionary from samples
+    if (auto* samples = std::get_if<DictSamples>(&dict_config)) {
+      assert(samples->Verify());
+      if (samples->empty()) {
+        return nullptr;
+      }
+      // Migrated from BlockBasedTableBuilder::EnterUnbuffered()
+      if (opts_.zstd_max_train_bytes > 0) {
+        assert(samples->sample_data.size() <= opts_.zstd_max_train_bytes);
+        if (opts_.use_zstd_dict_trainer) {
+          dict_data = ZSTD_TrainDictionary(
+              samples->sample_data, samples->sample_lens, opts_.max_dict_bytes);
+        } else {
+          dict_data = ZSTD_FinalizeDictionary(
+              samples->sample_data, samples->sample_lens, opts_.max_dict_bytes,
+              opts_.level);
+        }
+      } else {
+        assert(samples->sample_data.size() <= opts_.max_dict_bytes);
+        // ZSTD "raw content dictionary" - "Any buffer is a valid raw content
+        // dictionary." Or similar for other compressions.
+        dict_data = std::move(samples->sample_data);
+      }
+    }
+
+    CompressionDict dict{std::move(dict_data), kZSTD, opts_.level};
+    return std::make_unique<BuiltinZSTDCompressorV2>(opts_, std::move(dict));
+  }
+
+  std::shared_ptr<Decompressor> GetOptimizedDecompressor() const override;
+
+ protected:
+  const CompressionDict dict_;
+};
+
+// Subroutines for BuiltinDecompressorV2
+
+Status Snappy_DecompressBlock(const Decompressor::Args& args,
+                              char* uncompressed_output) {
+#ifdef SNAPPY
+  if (!snappy::RawUncompress(args.compressed_data.data(),
+                             args.compressed_data.size(),
+                             uncompressed_output)) {
+    return Status::Corruption("Error decompressing snappy data");
+  }
+  return Status::OK();
+#else
+  (void)args;
+  (void)uncompressed_output;
+  return Status::NotSupported("Snappy not supported in this build");
+#endif
+}
+
+Status Zlib_DecompressBlock(const Decompressor::Args& args, Slice dict,
+                            char* uncompressed_output) {
+#ifdef ZLIB
+  // NOTE: uses "raw" format
+  constexpr int kWindowBits = -14;
+
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+
+  // For raw inflate, the windowBits should be -8..-15.
+  // If windowBits is bigger than zero, it will use either zlib
+  // header or gzip header. Adding 32 to it will do automatic detection.
+  int st = inflateInit2(&_stream, kWindowBits);
+  if (UNLIKELY(st != Z_OK)) {
+    return Status::Corruption("Failed to initialize zlib inflate: " +
+                              std::to_string(st));
+  }
+
+  if (!dict.empty()) {
+    // Initialize the compression library's dictionary
+    st = inflateSetDictionary(&_stream,
+                              reinterpret_cast<const Bytef*>(dict.data()),
+                              static_cast<unsigned int>(dict.size()));
+    if (UNLIKELY(st != Z_OK)) {
+      return Status::Corruption("Failed to initialize zlib dictionary: " +
+                                std::to_string(st));
+    }
+  }
+
+  _stream.next_in = const_cast<Bytef*>(
+      reinterpret_cast<const Bytef*>(args.compressed_data.data()));
+  _stream.avail_in = static_cast<unsigned int>(args.compressed_data.size());
+
+  _stream.next_out = reinterpret_cast<Bytef*>(uncompressed_output);
+  _stream.avail_out = static_cast<unsigned int>(args.uncompressed_size);
+
+  st = inflate(&_stream, Z_SYNC_FLUSH);
+  if (UNLIKELY(st != Z_STREAM_END)) {
+    inflateEnd(&_stream);
+    // NOTE: Z_OK is still corruption because it means we got the size wrong
+    return Status::Corruption("Failed zlib inflate: " + std::to_string(st));
+  }
+
+  // We should have no bytes left
+  if (_stream.avail_out != 0) {
+    inflateEnd(&_stream);
+    return Status::Corruption("Size mismatch decompressing zlib data");
+  }
+
+  inflateEnd(&_stream);
+  return Status::OK();
+#else
+  (void)args;
+  (void)dict;
+  (void)uncompressed_output;
+  return Status::NotSupported("Zlib not supported in this build");
+#endif
+}
+
+Status BZip2_DecompressBlock(const Decompressor::Args& args,
+                             char* uncompressed_output) {
+#ifdef BZIP2
+  auto uncompressed_size = static_cast<unsigned int>(args.uncompressed_size);
+  if (BZ_OK != BZ2_bzBuffToBuffDecompress(
+                   uncompressed_output, &uncompressed_size,
+                   const_cast<char*>(args.compressed_data.data()),
+                   static_cast<unsigned int>(args.compressed_data.size()),
+                   0 /*small mem*/, 0 /*verbosity*/)) {
+    return Status::Corruption("Error decompressing bzip2 data");
+  }
+  if (uncompressed_size != args.uncompressed_size) {
+    return Status::Corruption("Size mismatch decompressing bzip2 data");
+  }
+  return Status::OK();
+#else
+  (void)args;
+  (void)uncompressed_output;
+  return Status::NotSupported("BZip2 not supported in this build");
+#endif
+}
+
+Status LZ4_DecompressBlock(const Decompressor::Args& args, Slice dict,
+                           char* uncompressed_output) {
+#ifdef LZ4
+  int expected_uncompressed_size = static_cast<int>(args.uncompressed_size);
+  LZ4_streamDecode_t* stream = LZ4_createStreamDecode();
+  if (!dict.empty()) {
+    LZ4_setStreamDecode(stream, dict.data(), static_cast<int>(dict.size()));
+  }
+  int uncompressed_size = LZ4_decompress_safe_continue(
+      stream, args.compressed_data.data(), uncompressed_output,
+      static_cast<int>(args.compressed_data.size()),
+      expected_uncompressed_size);
+  LZ4_freeStreamDecode(stream);
+
+  if (uncompressed_size != expected_uncompressed_size) {
+    if (uncompressed_size < 0) {
+      return Status::Corruption("Error decompressing LZ4 data");
+    } else {
+      return Status::Corruption("Size mismatch decompressing LZ4 data");
+    }
+  }
+  return Status::OK();
+#else
+  (void)args;
+  (void)dict;
+  (void)uncompressed_output;
+  return Status::NotSupported("LZ4 not supported in this build");
+#endif
+}
+
+Status XPRESS_DecompressBlock(const Decompressor::Args& args,
+                              char* uncompressed_output) {
+#ifdef XPRESS
+  int64_t actual_uncompressed_size = port::xpress::DecompressToBuffer(
+      args.compressed_data.data(), args.compressed_data.size(),
+      uncompressed_output, args.uncompressed_size);
+  if (actual_uncompressed_size !=
+      static_cast<int64_t>(args.uncompressed_size)) {
+    if (actual_uncompressed_size < 0) {
+      return Status::Corruption("Error decompressing XPRESS data");
+    } else {
+      return Status::Corruption("Size mismatch decompressing XPRESS data");
+    }
+  }
+  return Status::OK();
+#else
+  (void)args;
+  (void)uncompressed_output;
+  return Status::NotSupported("XPRESS not supported in this build");
+#endif
+}
+
+template <bool kIsDigestedDict = false>
+Status ZSTD_DecompressBlockWithContext(
+    const Decompressor::Args& args,
+    std::conditional_t<kIsDigestedDict, void*, Slice> dict,
+    ZSTDUncompressCachedData::ZSTDNativeContext zstd_context,
+    char* uncompressed_output) {
+#ifdef ZSTD
+  size_t uncompressed_size;
+  assert(zstd_context != nullptr);
+  if constexpr (kIsDigestedDict) {
+#ifdef ROCKSDB_ZSTD_DDICT
+    uncompressed_size = ZSTD_decompress_usingDDict(
+        zstd_context, uncompressed_output, args.uncompressed_size,
+        args.compressed_data.data(), args.compressed_data.size(),
+        static_cast<ZSTD_DDict*>(dict));
+#else
+    static_assert(!kIsDigestedDict,
+                  "Inconsistent expectation of ZSTD digested dict support");
+#endif  // ROCKSDB_ZSTD_DDICT
+  } else if (dict.empty()) {
+    uncompressed_size = ZSTD_decompressDCtx(
+        zstd_context, uncompressed_output, args.uncompressed_size,
+        args.compressed_data.data(), args.compressed_data.size());
+  } else {
+    uncompressed_size = ZSTD_decompress_usingDict(
+        zstd_context, uncompressed_output, args.uncompressed_size,
+        args.compressed_data.data(), args.compressed_data.size(), dict.data(),
+        dict.size());
+  }
+  if (ZSTD_isError(uncompressed_size)) {
+    return Status::Corruption(std::string("ZSTD ") +
+                              ZSTD_getErrorName(uncompressed_size));
+  } else if (uncompressed_size != args.uncompressed_size) {
+    return Status::Corruption("ZSTD decompression size mismatch");
+  } else {
+    return Status::OK();
+  }
+#else
+  (void)args;
+  (void)dict;
+  (void)zstd_context;
+  (void)uncompressed_output;
+  return Status::NotSupported("ZSTD not supported in this build");
+#endif
+}
+
+template <bool kIsDigestedDict = false>
+Status ZSTD_DecompressBlock(
+    const Decompressor::Args& args,
+    std::conditional_t<kIsDigestedDict, void*, Slice> dict,
+    const Decompressor* decompressor, char* uncompressed_output) {
+  if (args.working_area && args.working_area->owner() == decompressor) {
+    auto ctx = static_cast<UncompressionContext*>(args.working_area->get());
+    assert(ctx != nullptr);
+    if (ctx->GetZSTDContext() != nullptr) {
+      return ZSTD_DecompressBlockWithContext<kIsDigestedDict>(
+          args, dict, ctx->GetZSTDContext(), uncompressed_output);
+    }
+  }
+  UncompressionContext tmp_ctx{kZSTD};
+  return ZSTD_DecompressBlockWithContext<kIsDigestedDict>(
+      args, dict, tmp_ctx.GetZSTDContext(), uncompressed_output);
+}
+
+class BuiltinDecompressorV2 : public Decompressor {
+ public:
+  const char* Name() const override { return "BuiltinDecompressorV2"; }
+
+  Status ExtractUncompressedSize(Args& args) override {
+    assert(args.compression_type != kNoCompression);
+    if (args.compression_type == kSnappyCompression) {
+      // 1st exception to encoding of uncompressed size
+#ifdef SNAPPY
+      size_t uncompressed_length = 0;
+      if (!snappy::GetUncompressedLength(args.compressed_data.data(),
+                                         args.compressed_data.size(),
+                                         &uncompressed_length)) {
+        return Status::Corruption("Error reading snappy compressed length");
+      }
+      args.uncompressed_size = uncompressed_length;
+      return Status::OK();
+#else
+      return Status::NotSupported("Snappy not supported in this build");
+#endif
+    } else if (args.compression_type == kXpressCompression) {
+      // 2nd exception to encoding of uncompressed size
+#ifdef XPRESS
+      int64_t result = port::xpress::GetDecompressedSize(
+          args.compressed_data.data(), args.compressed_data.size());
+      if (result < 0) {
+        return Status::Corruption("Error reading XPRESS compressed length");
+      }
+      args.uncompressed_size = static_cast<size_t>(result);
+      return Status::OK();
+#else
+      return Status::NotSupported("XPRESS not supported in this build");
+#endif
+
+    } else {
+      // Extract encoded uncompressed size
+      return Decompressor::ExtractUncompressedSize(args);
+    }
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    switch (args.compression_type) {
+      case kSnappyCompression:
+        return Snappy_DecompressBlock(args, uncompressed_output);
+      case kZlibCompression:
+        return Zlib_DecompressBlock(args, /*dict=*/Slice{},
+                                    uncompressed_output);
+      case kBZip2Compression:
+        return BZip2_DecompressBlock(args, uncompressed_output);
+      case kLZ4Compression:
+      case kLZ4HCCompression:
+        return LZ4_DecompressBlock(args, /*dict=*/Slice{}, uncompressed_output);
+      case kXpressCompression:
+        return XPRESS_DecompressBlock(args, uncompressed_output);
+      case kZSTD:
+        return ZSTD_DecompressBlock(args, /*dict=*/Slice{}, this,
+                                    uncompressed_output);
+      default:
+        return Status::NotSupported(
+            "Compression type not supported or not built-in: " +
+            CompressionTypeToString(args.compression_type));
+    }
+  }
+
+  Status MaybeCloneForDict(const Slice&,
+                           std::unique_ptr<Decompressor>*) override;
+
+  size_t ApproximateOwnedMemoryUsage() const override {
+    return sizeof(BuiltinDecompressorV2);
+  }
+};
+
+class BuiltinDecompressorV2SnappyOnly final : public BuiltinDecompressorV2 {
+ public:
+  const char* Name() const override {
+    return "BuiltinDecompressorV2SnappyOnly";
+  }
+
+  Status ExtractUncompressedSize(Args& args) override {
+    assert(args.compression_type == kSnappyCompression);
+#ifdef SNAPPY
+    size_t uncompressed_length = 0;
+    if (!snappy::GetUncompressedLength(args.compressed_data.data(),
+                                       args.compressed_data.size(),
+                                       &uncompressed_length)) {
+      return Status::Corruption("Error reading snappy compressed length");
+    }
+    args.uncompressed_size = uncompressed_length;
+    return Status::OK();
+#else
+    return Status::NotSupported("Snappy not supported in this build");
+#endif
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    assert(args.compression_type == kSnappyCompression);
+    return Snappy_DecompressBlock(args, uncompressed_output);
+  }
+};
+
+class BuiltinDecompressorV2WithDict final : public BuiltinDecompressorV2 {
+ public:
+  explicit BuiltinDecompressorV2WithDict(const Slice& dict) : dict_(dict) {}
+
+  const char* Name() const override { return "BuiltinDecompressorV2WithDict"; }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    switch (args.compression_type) {
+      case kSnappyCompression:
+        // NOTE: quietly ignores the dictionary (for compatibility)
+        return Snappy_DecompressBlock(args, uncompressed_output);
+      case kZlibCompression:
+        return Zlib_DecompressBlock(args, dict_, uncompressed_output);
+      case kBZip2Compression:
+        // NOTE: quietly ignores the dictionary (for compatibility)
+        return BZip2_DecompressBlock(args, uncompressed_output);
+      case kLZ4Compression:
+      case kLZ4HCCompression:
+        return LZ4_DecompressBlock(args, dict_, uncompressed_output);
+      case kXpressCompression:
+        // NOTE: quietly ignores the dictionary (for compatibility)
+        return XPRESS_DecompressBlock(args, uncompressed_output);
+      case kZSTD:
+        return ZSTD_DecompressBlock(args, dict_, this, uncompressed_output);
+      default:
+        return Status::NotSupported(
+            "Compression type not supported or not built-in: " +
+            CompressionTypeToString(args.compression_type));
+    }
+  }
+
+  const Slice& GetSerializedDict() const override { return dict_; }
+
+  size_t ApproximateOwnedMemoryUsage() const override {
+    return sizeof(BuiltinDecompressorV2WithDict);
+  }
+
+ protected:
+  const Slice dict_;
+};
+
+Status BuiltinDecompressorV2::MaybeCloneForDict(
+    const Slice& dict, std::unique_ptr<Decompressor>* out) {
+  // Check RocksDB-promised precondition
+  assert(dict.size() > 0);
+  // Because of unfortunate decisions in handling built-in compression types,
+  // all the compression types before ZSTD that do not actually support
+  // dictionary compression pretend to support it. Specifically, we have to be
+  // able to read files with a compression dictionary block using those
+  // compression types even though the compression dictionary is ignored by
+  // the compression algorithm. And the Decompressor has to return the
+  // configured dictionary from GetSerializedDict() even if it is ignored. This
+  // unfortunately means that a new schema version (BuiltinV3?) would be needed
+  // toactually support dictionary compression in the future for these
+  // algorithms (if the libraries add support).
+  // TODO: can we make this a better/cleaner experience?
+  *out = std::make_unique<BuiltinDecompressorV2WithDict>(dict);
+  return Status::OK();
+}
+
+class BuiltinDecompressorV2OptimizeZstd : public BuiltinDecompressorV2 {
+ public:
+  const char* Name() const override {
+    return "BuiltinDecompressorV2OptimizeZstd";
+  }
+
+  ManagedWorkingArea ObtainWorkingArea(CompressionType preferred) override {
+    if (preferred == kZSTD) {
+      // TODO: evaluate whether it makes sense to use core local cache here.
+      // (Perhaps not, because explicit WorkingArea could be long-running.)
+      return ManagedWorkingArea(new UncompressionContext(kZSTD), this);
+    } else {
+      return {};
+    }
+  }
+
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    delete static_cast<UncompressionContext*>(wa);
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    if (LIKELY(args.compression_type == kZSTD)) {
+      return ZSTD_DecompressBlock(args, /*dict=*/Slice{}, this,
+                                  uncompressed_output);
+    } else {
+      return BuiltinDecompressorV2::DecompressBlock(args, uncompressed_output);
+    }
+  }
+
+  Status MaybeCloneForDict(const Slice& /*serialized_dict*/,
+                           std::unique_ptr<Decompressor>* /*out*/) override;
+};
+
+class BuiltinDecompressorV2OptimizeZstdWithDict final
+    : public BuiltinDecompressorV2OptimizeZstd {
+ public:
+  explicit BuiltinDecompressorV2OptimizeZstdWithDict(const Slice& dict)
+      :
+#ifdef ROCKSDB_ZSTD_DDICT
+        dict_(dict),
+        ddict_(ZSTD_createDDict_byReference(dict.data(), dict.size())) {
+    assert(ddict_ != nullptr);
+  }
+#else
+        dict_(dict) {
+  }
+#endif  // ROCKSDB_ZSTD_DDICT
+
+  const char* Name() const override {
+    return "BuiltinDecompressorV2OptimizeZstdWithDict";
+  }
+
+  ~BuiltinDecompressorV2OptimizeZstdWithDict() override {
+#ifdef ROCKSDB_ZSTD_DDICT
+    size_t res = ZSTD_freeDDict(ddict_);
+    assert(res == 0);  // Last I checked they can't fail
+    (void)res;         // prevent unused var warning
+#endif                 // ROCKSDB_ZSTD_DDICT
+  }
+
+  const Slice& GetSerializedDict() const override { return dict_; }
+
+  size_t ApproximateOwnedMemoryUsage() const override {
+    size_t sz = sizeof(BuiltinDecompressorV2WithDict);
+#ifdef ROCKSDB_ZSTD_DDICT
+    sz += ZSTD_sizeof_DDict(ddict_);
+#endif  // ROCKSDB_ZSTD_DDICT
+    return sz;
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    if (LIKELY(args.compression_type == kZSTD)) {
+#ifdef ROCKSDB_ZSTD_DDICT
+      return ZSTD_DecompressBlock</*kIsDigestedDict=*/true>(
+          args, ddict_, this, uncompressed_output);
+#else
+      return ZSTD_DecompressBlock(args, dict_, this, uncompressed_output);
+#endif  // ROCKSDB_ZSTD_DDICT
+    } else {
+      return BuiltinDecompressorV2WithDict(dict_).DecompressBlock(
+          args, uncompressed_output);
+    }
+  }
+
+ protected:
+  const Slice dict_;
+#ifdef ROCKSDB_ZSTD_DDICT
+  ZSTD_DDict* const ddict_;
+#endif  // ROCKSDB_ZSTD_DDICT
+};
+
+Status BuiltinDecompressorV2OptimizeZstd::MaybeCloneForDict(
+    const Slice& serialized_dict, std::unique_ptr<Decompressor>* out) {
+  *out = std::make_unique<BuiltinDecompressorV2OptimizeZstdWithDict>(
+      serialized_dict);
+  return Status::OK();
+}
+class BuiltinCompressionManagerV2 final : public CompressionManager {
+ public:
+  BuiltinCompressionManagerV2() = default;
+  ~BuiltinCompressionManagerV2() override = default;
+
+  const char* Name() const override { return "BuiltinCompressionManagerV2"; }
+
+  const char* CompatibilityName() const override { return "BuiltinV2"; }
+
+  std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
+                                            CompressionType type) override {
+    if (opts.max_compressed_bytes_per_kb <= 0) {
+      // No acceptable compression ratio => no compression
+      return nullptr;
+    }
+    if (!SupportsCompressionType(type)) {
+      // Unrecognized or support not compiled in. Fall back on default
+      type = ColumnFamilyOptions{}.compression;
+    }
+    switch (type) {
+      case kNoCompression:
+      default:
+        assert(type == kNoCompression);  // Others should be excluded above
+        return nullptr;
+      case kSnappyCompression:
+        return std::make_unique<BuiltinSnappyCompressorV2>(opts);
+      case kZlibCompression:
+        return std::make_unique<BuiltinZlibCompressorV2>(opts);
+      case kBZip2Compression:
+        return std::make_unique<BuiltinBZip2CompressorV2>(opts);
+      case kLZ4Compression:
+        return std::make_unique<BuiltinLZ4CompressorV2NoDict>(opts);
+      case kLZ4HCCompression:
+        return std::make_unique<BuiltinLZ4HCCompressorV2>(opts);
+      case kXpressCompression:
+        return std::make_unique<BuiltinXpressCompressorV2>(opts);
+      case kZSTD:
+        return std::make_unique<BuiltinZSTDCompressorV2>(opts);
+    }
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressor() override {
+    return GetGeneralDecompressor();
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressorOptimizeFor(
+      CompressionType optimize_for_type) override {
+    if (optimize_for_type == kZSTD) {
+      return GetZstdDecompressor();
+    } else {
+      return GetGeneralDecompressor();
+    }
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressorForTypes(
+      const CompressionType* types_begin,
+      const CompressionType* types_end) override {
+    if (types_begin == types_end) {
+      return nullptr;
+    } else if (types_begin + 1 == types_end &&
+               *types_begin == kSnappyCompression) {
+      // Exclusively Snappy
+      return GetSnappyDecompressor();
+    } else if (std::find(types_begin, types_end, kZSTD) != types_end) {
+      // Includes ZSTD
+      return GetZstdDecompressor();
+    } else {
+      // Everything else
+      return GetGeneralDecompressor();
+    }
+  }
+
+  bool SupportsCompressionType(CompressionType type) const override {
+    return CompressionTypeSupported(type);
+  }
+
+ protected:
+  BuiltinDecompressorV2 decompressor_;
+  BuiltinDecompressorV2OptimizeZstd zstd_decompressor_;
+  BuiltinDecompressorV2SnappyOnly snappy_decompressor_;
+
+ public:
+  inline std::shared_ptr<Decompressor> GetGeneralDecompressor() {
+    return std::shared_ptr<Decompressor>(shared_from_this(), &decompressor_);
+  }
+
+  inline std::shared_ptr<Decompressor> GetZstdDecompressor() {
+    return std::shared_ptr<Decompressor>(shared_from_this(),
+                                         &zstd_decompressor_);
+  }
+
+  inline std::shared_ptr<Decompressor> GetSnappyDecompressor() {
+    return std::shared_ptr<Decompressor>(shared_from_this(),
+                                         &snappy_decompressor_);
+  }
+};
+
+const std::shared_ptr<BuiltinCompressionManagerV2>
+    kBuiltinCompressionManagerV2 =
+        std::make_shared<BuiltinCompressionManagerV2>();
+
+std::shared_ptr<Decompressor>
+BuiltinZSTDCompressorV2::GetOptimizedDecompressor() const {
+  return kBuiltinCompressionManagerV2->GetZstdDecompressor();
+}
+
+std::shared_ptr<Decompressor>
+BuiltinSnappyCompressorV2::GetOptimizedDecompressor() const {
+  return kBuiltinCompressionManagerV2->GetSnappyDecompressor();
+}
+
+}  // namespace
+
+Status CompressionManager::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<CompressionManager>* result) {
+  if (value == kNullptrString || value.empty()) {
+    result->reset();
+    return Status::OK();
+  }
+
+  static std::once_flag loaded;
+  std::call_once(loaded, [&]() {
+    auto& library = *ObjectLibrary::Default();
+    // TODO: try to enhance ObjectLibrary to support singletons
+    library.AddFactory<CompressionManager>(
+        kBuiltinCompressionManagerV2->CompatibilityName(),
+        [](const std::string& /*uri*/,
+           std::unique_ptr<CompressionManager>* guard,
+           std::string* /*errmsg*/) {
+          *guard = std::make_unique<BuiltinCompressionManagerV2>();
+          return guard->get();
+        });
+  });
+
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (id.empty()) {  // We have no Id but have options.  Not good
+    return Status::NotSupported("Cannot reset object ", id);
+  } else {
+    status = config_options.registry->NewSharedObject(id, result);
+  }
+  if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+    return Status::OK();
+  } else if (status.ok()) {
+    status = Customizable::ConfigureNewObject(config_options, result->get(),
+                                              opt_map);
+  }
+  return status;
+}
+
+std::shared_ptr<CompressionManager>
+CompressionManager::FindCompatibleCompressionManager(Slice compatibility_name) {
+  if (compatibility_name.compare(CompatibilityName()) == 0) {
+    return shared_from_this();
+  } else {
+    std::shared_ptr<CompressionManager> out;
+    Status s =
+        CreateFromString(ConfigOptions(), compatibility_name.ToString(), &out);
+    if (s.ok()) {
+      return out;
+    } else {
+      return nullptr;
+    }
+  }
+}
+
+const std::shared_ptr<CompressionManager>& GetBuiltinV2CompressionManager() {
+  static const std::shared_ptr<CompressionManager> v2_as_base =
+      kBuiltinCompressionManagerV2;
+  return v2_as_base;
+}
+
+// ***********************************************************************
+// END built-in implementation of customization interface
+// ***********************************************************************
+
+Status LegacyForceBuiltinCompression(
+    Compressor& builtin_compressor,
+    Compressor::ManagedWorkingArea* working_area, Slice from,
+    GrowableBuffer* to) {
+  // For legacy cases that store compressed data even when it's larger than the
+  // uncompressed data (!!!), we need a reliable upper bound on the compressed
+  // size. This is based on consulting various algorithms documentation etc.
+  // and adding ~4 bytes for encoded uncompressed size. (Snappy is the worst
+  // case for multiplicative overhead at n + n/6, bounded by 19*n/16 to avoid
+  // costly division. Bzip2 is the worst case for additive overhead at 600
+  // bytes.)
+  size_t n = from.size();
+  size_t upper_bound = ((19 * n) >> 4) + 604;
+  // The upper bound has only been established considering built-in compression
+  // types through kZSTD. (Might need updating if this fails.)
+  assert(builtin_compressor.GetPreferredCompressionType() <= kZSTD);
+
+  to->ResetForSize(upper_bound);
+  CompressionType actual_type = kNoCompression;
+  Status s = builtin_compressor.CompressBlock(
+      from, to->data(), &to->MutableSize(), &actual_type, working_area);
+  TEST_SYNC_POINT_CALLBACK("LegacyForceBuiltinCompression:TamperWithStatus",
+                           &s);
+
+  if (!s.ok()) {
+    return s;
+  }
+  if (actual_type == kNoCompression) {
+    // abort in debug builds
+    assert(actual_type != kNoCompression);
+    return Status::Corruption("Compression unexpectedly declined or aborted");
+  }
+  assert(actual_type == builtin_compressor.GetPreferredCompressionType());
+  return Status::OK();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/compression.h b/util/compression.h
index e7ddcc2ff8b7..beb07c8de694 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -10,44 +10,18 @@
 #pragma once
 
 #include <algorithm>
-#include <limits>
-#ifdef ROCKSDB_MALLOC_USABLE_SIZE
-#ifdef OS_FREEBSD
-#include <malloc_np.h>
-#else  // OS_FREEBSD
-#include <malloc.h>
-#endif  // OS_FREEBSD
-#endif  // ROCKSDB_MALLOC_USABLE_SIZE
-#include <string>
 
 #include "memory/memory_allocator_impl.h"
+#include "rocksdb/advanced_compression.h"
 #include "rocksdb/options.h"
-#include "rocksdb/table.h"
 #include "table/block_based/block_type.h"
-#include "test_util/sync_point.h"
+#include "util/aligned_buffer.h"
 #include "util/coding.h"
 #include "util/compression_context_cache.h"
-#include "util/string_util.h"
-
-#ifdef SNAPPY
-#include <snappy.h>
-#endif
-
-#ifdef ZLIB
-#include <zlib.h>
-#endif
-
-#ifdef BZIP2
-#include <bzlib.h>
-#endif
-
-#if defined(LZ4)
-#include <lz4.h>
-#include <lz4hc.h>
-#endif
 
 #ifdef ZSTD
 #include <zstd.h>
+#include <zstd_errors.h>
 // ZSTD_Compress2(), ZSTD_compressStream2() and frame parameters all belong to
 // advanced APIs and require v1.4.0+, which is from April 2019.
 // https://github.com/facebook/zstd/blob/eb9f881eb810f2242f1ef36b3f3e7014eecb8fa6/lib/zstd.h#L297C40-L297C45
@@ -144,6 +118,100 @@ class ZSTDUncompressCachedData {
 
 namespace ROCKSDB_NAMESPACE {
 
+class FailureDecompressor : public Decompressor {
+ public:
+  explicit FailureDecompressor(Status&& status) : status_(std::move(status)) {
+    assert(!status_.ok());
+  }
+  ~FailureDecompressor() override { status_.PermitUncheckedError(); }
+
+  const char* Name() const override { return "FailureDecompressor"; }
+
+  Status ExtractUncompressedSize(Args& /*args*/) override { return status_; }
+
+  Status DecompressBlock(const Args& /*args*/,
+                         char* /*uncompressed_output*/) override {
+    return status_;
+  }
+
+ protected:
+  Status status_;
+};
+
+// Owns a decompression dictionary, and associated Decompressor, for storing
+// in the block cache.
+//
+// Justification: for a "processed" dictionary to be saved in block cache, we
+// also need a reference to the decompressor that processed it, to ensure it
+// is recognized properly. At that point, we might as well have the dictionary
+// part of the decompressor identity and track an associated decompressor along
+// with a decompression dictionary in the block cache, and the decompressor
+// hides potential details of processing the dictionary.
+struct DecompressorDict {
+  // Block containing the data for the compression dictionary in case the
+  // constructor that takes a string parameter is used.
+  std::string dict_str_;
+
+  // Block containing the data for the compression dictionary in case the
+  // constructor that takes a Slice parameter is used and the passed in
+  // CacheAllocationPtr is not nullptr.
+  CacheAllocationPtr dict_allocation_;
+
+  // A Decompressor referencing and using the dictionary owned by this.
+  std::unique_ptr<Decompressor> decompressor_;
+
+  // Approximate owned memory usage
+  size_t memory_usage_;
+
+  DecompressorDict(std::string&& dict, Decompressor& from_decompressor)
+      : dict_str_(std::move(dict)) {
+    Populate(from_decompressor, dict_str_);
+  }
+
+  DecompressorDict(Slice slice, CacheAllocationPtr&& allocation,
+                   Decompressor& from_decompressor)
+      : dict_allocation_(std::move(allocation)) {
+    Populate(from_decompressor, slice);
+  }
+
+  DecompressorDict(DecompressorDict&& rhs) noexcept
+      : dict_str_(std::move(rhs.dict_str_)),
+        dict_allocation_(std::move(rhs.dict_allocation_)),
+        decompressor_(std::move(rhs.decompressor_)),
+        memory_usage_(std::move(rhs.memory_usage_)) {}
+
+  DecompressorDict& operator=(DecompressorDict&& rhs) noexcept {
+    if (this == &rhs) {
+      return *this;
+    }
+    dict_str_ = std::move(rhs.dict_str_);
+    dict_allocation_ = std::move(rhs.dict_allocation_);
+    decompressor_ = std::move(rhs.decompressor_);
+    return *this;
+  }
+  // Disable copy
+  DecompressorDict(const DecompressorDict&) = delete;
+  DecompressorDict& operator=(const DecompressorDict&) = delete;
+
+  // The object is self-contained if the string constructor is used, or the
+  // Slice constructor is invoked with a non-null allocation. Otherwise, it
+  // is the caller's responsibility to ensure that the underlying storage
+  // outlives this object.
+  bool own_bytes() const { return !dict_str_.empty() || dict_allocation_; }
+
+  const Slice& GetRawDict() const { return decompressor_->GetSerializedDict(); }
+
+  // For TypedCacheInterface
+  const Slice& ContentSlice() const { return GetRawDict(); }
+  static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kOtherBlock;
+  static constexpr BlockType kBlockType = BlockType::kCompressionDictionary;
+
+  size_t ApproximateMemoryUsage() const { return memory_usage_; }
+
+ private:
+  void Populate(Decompressor& from_decompressor, Slice dict);
+};
+
 // Holds dictionary and related data, like ZSTD's digested compression
 // dictionary.
 struct CompressionDict {
@@ -153,7 +221,8 @@ struct CompressionDict {
   std::string dict_;
 
  public:
-  CompressionDict(std::string dict, CompressionType type, int level) {
+  CompressionDict() = default;
+  CompressionDict(std::string&& dict, CompressionType type, int level) {
     dict_ = std::move(dict);
 #ifdef ZSTD
     zstd_cdict_ = nullptr;
@@ -173,6 +242,25 @@ struct CompressionDict {
 #endif  // ZSTD
   }
 
+  CompressionDict(CompressionDict&& other) {
+#ifdef ZSTD
+    zstd_cdict_ = other.zstd_cdict_;
+    other.zstd_cdict_ = nullptr;
+#endif  // ZSTD
+    dict_ = std::move(other.dict_);
+  }
+  CompressionDict& operator=(CompressionDict&& other) {
+    if (this == &other) {
+      return *this;
+    }
+#ifdef ZSTD
+    zstd_cdict_ = other.zstd_cdict_;
+    other.zstd_cdict_ = nullptr;
+#endif  // ZSTD
+    dict_ = std::move(other.dict_);
+    return *this;
+  }
+
   ~CompressionDict() {
 #ifdef ZSTD
     size_t res = 0;
@@ -189,155 +277,19 @@ struct CompressionDict {
 #endif  // ZSTD
 
   Slice GetRawDict() const { return dict_; }
+  bool empty() const { return dict_.empty(); }
 
   static const CompressionDict& GetEmptyDict() {
     static CompressionDict empty_dict{};
     return empty_dict;
   }
 
-  CompressionDict() = default;
-  // Disable copy/move
+  // Disable copy
   CompressionDict(const CompressionDict&) = delete;
   CompressionDict& operator=(const CompressionDict&) = delete;
-  CompressionDict(CompressionDict&&) = delete;
-  CompressionDict& operator=(CompressionDict&&) = delete;
 };
 
-// Holds dictionary and related data, like ZSTD's digested uncompression
-// dictionary.
-struct UncompressionDict {
-  // Block containing the data for the compression dictionary in case the
-  // constructor that takes a string parameter is used.
-  std::string dict_;
-
-  // Block containing the data for the compression dictionary in case the
-  // constructor that takes a Slice parameter is used and the passed in
-  // CacheAllocationPtr is not nullptr.
-  CacheAllocationPtr allocation_;
-
-  // Slice pointing to the compression dictionary data. Can point to
-  // dict_, allocation_, or some other memory location, depending on how
-  // the object was constructed.
-  Slice slice_;
-
-#ifdef ROCKSDB_ZSTD_DDICT
-  // Processed version of the contents of slice_ for ZSTD compression.
-  ZSTD_DDict* zstd_ddict_ = nullptr;
-#endif  // ROCKSDB_ZSTD_DDICT
-
-  UncompressionDict(std::string dict, bool using_zstd)
-      : dict_(std::move(dict)), slice_(dict_) {
-#ifdef ROCKSDB_ZSTD_DDICT
-    if (!slice_.empty() && using_zstd) {
-      zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size());
-      assert(zstd_ddict_ != nullptr);
-    }
-#else
-    (void)using_zstd;
-#endif  // ROCKSDB_ZSTD_DDICT
-  }
-
-  UncompressionDict(Slice slice, CacheAllocationPtr&& allocation,
-                    bool using_zstd)
-      : allocation_(std::move(allocation)), slice_(std::move(slice)) {
-#ifdef ROCKSDB_ZSTD_DDICT
-    if (!slice_.empty() && using_zstd) {
-      zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size());
-      assert(zstd_ddict_ != nullptr);
-    }
-#else
-    (void)using_zstd;
-#endif  // ROCKSDB_ZSTD_DDICT
-  }
-
-  UncompressionDict(UncompressionDict&& rhs)
-      : dict_(std::move(rhs.dict_)),
-        allocation_(std::move(rhs.allocation_)),
-        slice_(std::move(rhs.slice_))
-#ifdef ROCKSDB_ZSTD_DDICT
-        ,
-        zstd_ddict_(rhs.zstd_ddict_)
-#endif
-  {
-#ifdef ROCKSDB_ZSTD_DDICT
-    rhs.zstd_ddict_ = nullptr;
-#endif
-  }
-
-  ~UncompressionDict() {
-#ifdef ROCKSDB_ZSTD_DDICT
-    size_t res = 0;
-    if (zstd_ddict_ != nullptr) {
-      res = ZSTD_freeDDict(zstd_ddict_);
-    }
-    assert(res == 0);  // Last I checked they can't fail
-    (void)res;         // prevent unused var warning
-#endif                 // ROCKSDB_ZSTD_DDICT
-  }
-
-  UncompressionDict& operator=(UncompressionDict&& rhs) {
-    if (this == &rhs) {
-      return *this;
-    }
-
-    dict_ = std::move(rhs.dict_);
-    allocation_ = std::move(rhs.allocation_);
-    slice_ = std::move(rhs.slice_);
-
-#ifdef ROCKSDB_ZSTD_DDICT
-    zstd_ddict_ = rhs.zstd_ddict_;
-    rhs.zstd_ddict_ = nullptr;
-#endif
-
-    return *this;
-  }
-
-  // The object is self-contained if the string constructor is used, or the
-  // Slice constructor is invoked with a non-null allocation. Otherwise, it
-  // is the caller's responsibility to ensure that the underlying storage
-  // outlives this object.
-  bool own_bytes() const { return !dict_.empty() || allocation_; }
-
-  const Slice& GetRawDict() const { return slice_; }
-
-  // For TypedCacheInterface
-  const Slice& ContentSlice() const { return slice_; }
-  static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kOtherBlock;
-  static constexpr BlockType kBlockType = BlockType::kCompressionDictionary;
-
-#ifdef ROCKSDB_ZSTD_DDICT
-  const ZSTD_DDict* GetDigestedZstdDDict() const { return zstd_ddict_; }
-#endif  // ROCKSDB_ZSTD_DDICT
-
-  static const UncompressionDict& GetEmptyDict() {
-    static UncompressionDict empty_dict{};
-    return empty_dict;
-  }
-
-  size_t ApproximateMemoryUsage() const {
-    size_t usage = sizeof(struct UncompressionDict);
-    usage += dict_.size();
-    if (allocation_) {
-      auto allocator = allocation_.get_deleter().allocator;
-      if (allocator) {
-        usage += allocator->UsableSize(allocation_.get(), slice_.size());
-      } else {
-        usage += slice_.size();
-      }
-    }
-#ifdef ROCKSDB_ZSTD_DDICT
-    usage += ZSTD_sizeof_DDict(zstd_ddict_);
-#endif  // ROCKSDB_ZSTD_DDICT
-    return usage;
-  }
-
-  UncompressionDict() = default;
-  // Disable copy
-  UncompressionDict(const CompressionDict&) = delete;
-  UncompressionDict& operator=(const CompressionDict&) = delete;
-};
-
-class CompressionContext {
+class CompressionContext : public Compressor::WorkingArea {
  private:
 #ifdef ZSTD
   ZSTD_CCtx* zstd_ctx_ = nullptr;
@@ -408,32 +360,9 @@ class CompressionContext {
   CompressionContext& operator=(const CompressionContext&) = delete;
 };
 
-class CompressionInfo {
-  const CompressionOptions& opts_;
-  const CompressionContext& context_;
-  const CompressionDict& dict_;
-  const CompressionType type_;
-  const uint64_t sample_for_compression_;
-
- public:
-  CompressionInfo(const CompressionOptions& _opts,
-                  const CompressionContext& _context,
-                  const CompressionDict& _dict, CompressionType _type,
-                  uint64_t _sample_for_compression)
-      : opts_(_opts),
-        context_(_context),
-        dict_(_dict),
-        type_(_type),
-        sample_for_compression_(_sample_for_compression) {}
-
-  const CompressionOptions& options() const { return opts_; }
-  const CompressionContext& context() const { return context_; }
-  const CompressionDict& dict() const { return dict_; }
-  CompressionType type() const { return type_; }
-  uint64_t SampleForCompression() const { return sample_for_compression_; }
-};
-
-class UncompressionContext {
+// This is like a working area, reusable for different dicts, etc.
+// TODO: refactor / consolidate
+class UncompressionContext : public Decompressor::WorkingArea {
  private:
   CompressionContextCache* ctx_cache_ = nullptr;
   ZSTDUncompressCachedData uncomp_cached_data_;
@@ -460,21 +389,6 @@ class UncompressionContext {
   }
 };
 
-class UncompressionInfo {
-  const UncompressionContext& context_;
-  const UncompressionDict& dict_;
-  const CompressionType type_;
-
- public:
-  UncompressionInfo(const UncompressionContext& _context,
-                    const UncompressionDict& _dict, CompressionType _type)
-      : context_(_context), dict_(_dict), type_(_type) {}
-
-  const UncompressionContext& context() const { return context_; }
-  const UncompressionDict& dict() const { return dict_; }
-  CompressionType type() const { return type_; }
-};
-
 inline bool Snappy_Supported() {
 #ifdef SNAPPY
   return true;
@@ -563,8 +477,7 @@ inline bool CompressionTypeSupported(CompressionType compression_type) {
       return XPRESS_Supported();
     case kZSTD:
       return ZSTD_Supported();
-    default:
-      assert(false);
+    default:  // Including custom compression types
       return false;
   }
 }
@@ -592,842 +505,19 @@ inline bool DictCompressionTypeSupported(CompressionType compression_type) {
       // NB: dictionary supported since 0.5.0. See ZSTD_VERSION_NUMBER check
       // above.
       return ZSTD_Supported();
-    default:
-      assert(false);
-      return false;
-  }
-}
-
-inline std::string CompressionTypeToString(CompressionType compression_type) {
-  switch (compression_type) {
-    case kNoCompression:
-      return "NoCompression";
-    case kSnappyCompression:
-      return "Snappy";
-    case kZlibCompression:
-      return "Zlib";
-    case kBZip2Compression:
-      return "BZip2";
-    case kLZ4Compression:
-      return "LZ4";
-    case kLZ4HCCompression:
-      return "LZ4HC";
-    case kXpressCompression:
-      return "Xpress";
-    case kZSTD:
-      return "ZSTD";
-    case kDisableCompressionOption:
-      return "DisableOption";
-    default:
-      assert(false);
-      return "";
-  }
-}
-
-inline std::string CompressionOptionsToString(
-    CompressionOptions& compression_options) {
-  std::string result;
-  result.reserve(512);
-  result.append("window_bits=")
-      .append(std::to_string(compression_options.window_bits))
-      .append("; ");
-  result.append("level=")
-      .append(std::to_string(compression_options.level))
-      .append("; ");
-  result.append("strategy=")
-      .append(std::to_string(compression_options.strategy))
-      .append("; ");
-  result.append("max_dict_bytes=")
-      .append(std::to_string(compression_options.max_dict_bytes))
-      .append("; ");
-  result.append("zstd_max_train_bytes=")
-      .append(std::to_string(compression_options.zstd_max_train_bytes))
-      .append("; ");
-  result.append("enabled=")
-      .append(std::to_string(compression_options.enabled))
-      .append("; ");
-  result.append("max_dict_buffer_bytes=")
-      .append(std::to_string(compression_options.max_dict_buffer_bytes))
-      .append("; ");
-  result.append("use_zstd_dict_trainer=")
-      .append(std::to_string(compression_options.use_zstd_dict_trainer))
-      .append("; ");
-  return result;
-}
-
-// compress_format_version can have two values:
-// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed
-// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent
-// way.
-// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the
-// start of compressed block. Snappy format is the same as version 1.
-
-inline bool Snappy_Compress(const CompressionInfo& /*info*/, const char* input,
-                            size_t length, ::std::string* output) {
-#ifdef SNAPPY
-  output->resize(snappy::MaxCompressedLength(length));
-  size_t outlen;
-  snappy::RawCompress(input, length, &(*output)[0], &outlen);
-  output->resize(outlen);
-  return true;
-#else
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
-}
-
-inline CacheAllocationPtr Snappy_Uncompress(
-    const char* input, size_t length, size_t* uncompressed_size,
-    MemoryAllocator* allocator = nullptr) {
-#ifdef SNAPPY
-  size_t uncompressed_length = 0;
-  if (!snappy::GetUncompressedLength(input, length, &uncompressed_length)) {
-    return nullptr;
-  }
-
-  CacheAllocationPtr output = AllocateBlock(uncompressed_length, allocator);
-
-  if (!snappy::RawUncompress(input, length, output.get())) {
-    return nullptr;
-  }
-
-  *uncompressed_size = uncompressed_length;
-
-  return output;
-#else
-  (void)input;
-  (void)length;
-  (void)uncompressed_size;
-  (void)allocator;
-  return nullptr;
-#endif
-}
-
-namespace compression {
-// returns size
-inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) {
-  PutVarint32(output, length);
-  return output->size();
-}
-
-inline bool GetDecompressedSizeInfo(const char** input_data,
-                                    size_t* input_length,
-                                    uint32_t* output_len) {
-  auto new_input_data =
-      GetVarint32Ptr(*input_data, *input_data + *input_length, output_len);
-  if (new_input_data == nullptr) {
-    return false;
-  }
-  *input_length -= (new_input_data - *input_data);
-  *input_data = new_input_data;
-  return true;
-}
-}  // namespace compression
-
-// compress_format_version == 1 -- decompressed size is not included in the
-// block header
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-inline bool Zlib_Compress(const CompressionInfo& info,
-                          uint32_t compress_format_version, const char* input,
-                          size_t length, ::std::string* output) {
-#ifdef ZLIB
-  if (length > std::numeric_limits<uint32_t>::max()) {
-    // Can't compress more than 4GB
-    return false;
-  }
-
-  size_t output_header_len = 0;
-  if (compress_format_version == 2) {
-    output_header_len = compression::PutDecompressedSizeInfo(
-        output, static_cast<uint32_t>(length));
-  }
-
-  // The memLevel parameter specifies how much memory should be allocated for
-  // the internal compression state.
-  // memLevel=1 uses minimum memory but is slow and reduces compression ratio.
-  // memLevel=9 uses maximum memory for optimal speed.
-  // The default value is 8. See zconf.h for more details.
-  static const int memLevel = 8;
-  int level;
-  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
-    level = Z_DEFAULT_COMPRESSION;
-  } else {
-    level = info.options().level;
-  }
-  z_stream _stream;
-  memset(&_stream, 0, sizeof(z_stream));
-  int st = deflateInit2(&_stream, level, Z_DEFLATED, info.options().window_bits,
-                        memLevel, info.options().strategy);
-  if (st != Z_OK) {
-    return false;
-  }
-
-  Slice compression_dict = info.dict().GetRawDict();
-  if (compression_dict.size()) {
-    // Initialize the compression library's dictionary
-    st = deflateSetDictionary(
-        &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()),
-        static_cast<unsigned int>(compression_dict.size()));
-    if (st != Z_OK) {
-      deflateEnd(&_stream);
+    default:  // Including custom compression types
       return false;
-    }
-  }
-
-  // Get an upper bound on the compressed size.
-  size_t upper_bound =
-      deflateBound(&_stream, static_cast<unsigned long>(length));
-  output->resize(output_header_len + upper_bound);
-
-  // Compress the input, and put compressed data in output.
-  _stream.next_in = (Bytef*)input;
-  _stream.avail_in = static_cast<unsigned int>(length);
-
-  // Initialize the output size.
-  _stream.avail_out = static_cast<unsigned int>(upper_bound);
-  _stream.next_out = reinterpret_cast<Bytef*>(&(*output)[output_header_len]);
-
-  bool compressed = false;
-  st = deflate(&_stream, Z_FINISH);
-  if (st == Z_STREAM_END) {
-    compressed = true;
-    output->resize(output->size() - _stream.avail_out);
-  }
-  // The only return value we really care about is Z_STREAM_END.
-  // Z_OK means insufficient output space. This means the compression is
-  // bigger than decompressed size. Just fail the compression in that case.
-
-  deflateEnd(&_stream);
-  return compressed;
-#else
-  (void)info;
-  (void)compress_format_version;
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
-}
-
-// compress_format_version == 1 -- decompressed size is not included in the
-// block header
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-inline CacheAllocationPtr Zlib_Uncompress(
-    const UncompressionInfo& info, const char* input_data, size_t input_length,
-    size_t* uncompressed_size, uint32_t compress_format_version,
-    MemoryAllocator* allocator = nullptr, int windowBits = -14) {
-#ifdef ZLIB
-  uint32_t output_len = 0;
-  if (compress_format_version == 2) {
-    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
-                                              &output_len)) {
-      return nullptr;
-    }
-  } else {
-    // Assume the decompressed data size will 5x of compressed size, but round
-    // to the page size
-    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
-    output_len = static_cast<uint32_t>(
-        std::min(proposed_output_len,
-                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
-  }
-
-  z_stream _stream;
-  memset(&_stream, 0, sizeof(z_stream));
-
-  // For raw inflate, the windowBits should be -8..-15.
-  // If windowBits is bigger than zero, it will use either zlib
-  // header or gzip header. Adding 32 to it will do automatic detection.
-  int st =
-      inflateInit2(&_stream, windowBits > 0 ? windowBits + 32 : windowBits);
-  if (st != Z_OK) {
-    return nullptr;
-  }
-
-  const Slice& compression_dict = info.dict().GetRawDict();
-  if (compression_dict.size()) {
-    // Initialize the compression library's dictionary
-    st = inflateSetDictionary(
-        &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()),
-        static_cast<unsigned int>(compression_dict.size()));
-    if (st != Z_OK) {
-      return nullptr;
-    }
-  }
-
-  _stream.next_in = (Bytef*)input_data;
-  _stream.avail_in = static_cast<unsigned int>(input_length);
-
-  auto output = AllocateBlock(output_len, allocator);
-
-  _stream.next_out = (Bytef*)output.get();
-  _stream.avail_out = static_cast<unsigned int>(output_len);
-
-  bool done = false;
-  while (!done) {
-    st = inflate(&_stream, Z_SYNC_FLUSH);
-    switch (st) {
-      case Z_STREAM_END:
-        done = true;
-        break;
-      case Z_OK: {
-        // No output space. Increase the output space by 20%.
-        // We should never run out of output space if
-        // compress_format_version == 2
-        assert(compress_format_version != 2);
-        size_t old_sz = output_len;
-        uint32_t output_len_delta = output_len / 5;
-        output_len += output_len_delta < 10 ? 10 : output_len_delta;
-        auto tmp = AllocateBlock(output_len, allocator);
-        memcpy(tmp.get(), output.get(), old_sz);
-        output = std::move(tmp);
-
-        // Set more output.
-        _stream.next_out = (Bytef*)(output.get() + old_sz);
-        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
-        break;
-      }
-      case Z_BUF_ERROR:
-      default:
-        inflateEnd(&_stream);
-        return nullptr;
-    }
-  }
-
-  // If we encoded decompressed block size, we should have no bytes left
-  assert(compress_format_version != 2 || _stream.avail_out == 0);
-  assert(output_len >= _stream.avail_out);
-  *uncompressed_size = output_len - _stream.avail_out;
-  inflateEnd(&_stream);
-  return output;
-#else
-  (void)info;
-  (void)input_data;
-  (void)input_length;
-  (void)uncompressed_size;
-  (void)compress_format_version;
-  (void)allocator;
-  (void)windowBits;
-  return nullptr;
-#endif
-}
-
-// compress_format_version == 1 -- decompressed size is not included in the
-// block header
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-inline bool BZip2_Compress(const CompressionInfo& /*info*/,
-                           uint32_t compress_format_version, const char* input,
-                           size_t length, ::std::string* output) {
-#ifdef BZIP2
-  if (length > std::numeric_limits<uint32_t>::max()) {
-    // Can't compress more than 4GB
-    return false;
-  }
-  size_t output_header_len = 0;
-  if (compress_format_version == 2) {
-    output_header_len = compression::PutDecompressedSizeInfo(
-        output, static_cast<uint32_t>(length));
-  }
-  // Resize output to be the plain data length.
-  // This may not be big enough if the compression actually expands data.
-  output->resize(output_header_len + length);
-
-  bz_stream _stream;
-  memset(&_stream, 0, sizeof(bz_stream));
-
-  // Block size 1 is 100K.
-  // 0 is for silent.
-  // 30 is the default workFactor
-  int st = BZ2_bzCompressInit(&_stream, 1, 0, 30);
-  if (st != BZ_OK) {
-    return false;
-  }
-
-  // Compress the input, and put compressed data in output.
-  _stream.next_in = (char*)input;
-  _stream.avail_in = static_cast<unsigned int>(length);
-
-  // Initialize the output size.
-  _stream.avail_out = static_cast<unsigned int>(length);
-  _stream.next_out = reinterpret_cast<char*>(&(*output)[output_header_len]);
-
-  bool compressed = false;
-  st = BZ2_bzCompress(&_stream, BZ_FINISH);
-  if (st == BZ_STREAM_END) {
-    compressed = true;
-    output->resize(output->size() - _stream.avail_out);
-  }
-  // The only return value we really care about is BZ_STREAM_END.
-  // BZ_FINISH_OK means insufficient output space. This means the compression
-  // is bigger than decompressed size. Just fail the compression in that case.
-
-  BZ2_bzCompressEnd(&_stream);
-  return compressed;
-#else
-  (void)compress_format_version;
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
-}
-
-// compress_format_version == 1 -- decompressed size is not included in the
-// block header
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-inline CacheAllocationPtr BZip2_Uncompress(
-    const char* input_data, size_t input_length, size_t* uncompressed_size,
-    uint32_t compress_format_version, MemoryAllocator* allocator = nullptr) {
-#ifdef BZIP2
-  uint32_t output_len = 0;
-  if (compress_format_version == 2) {
-    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
-                                              &output_len)) {
-      return nullptr;
-    }
-  } else {
-    // Assume the decompressed data size will 5x of compressed size, but round
-    // to the next page size
-    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
-    output_len = static_cast<uint32_t>(
-        std::min(proposed_output_len,
-                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
-  }
-
-  bz_stream _stream;
-  memset(&_stream, 0, sizeof(bz_stream));
-
-  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
-  if (st != BZ_OK) {
-    return nullptr;
-  }
-
-  _stream.next_in = (char*)input_data;
-  _stream.avail_in = static_cast<unsigned int>(input_length);
-
-  auto output = AllocateBlock(output_len, allocator);
-
-  _stream.next_out = (char*)output.get();
-  _stream.avail_out = static_cast<unsigned int>(output_len);
-
-  bool done = false;
-  while (!done) {
-    st = BZ2_bzDecompress(&_stream);
-    switch (st) {
-      case BZ_STREAM_END:
-        done = true;
-        break;
-      case BZ_OK: {
-        // No output space. Increase the output space by 20%.
-        // We should never run out of output space if
-        // compress_format_version == 2
-        assert(compress_format_version != 2);
-        uint32_t old_sz = output_len;
-        output_len = output_len * 1.2;
-        auto tmp = AllocateBlock(output_len, allocator);
-        memcpy(tmp.get(), output.get(), old_sz);
-        output = std::move(tmp);
-
-        // Set more output.
-        _stream.next_out = (char*)(output.get() + old_sz);
-        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
-        break;
-      }
-      default:
-        BZ2_bzDecompressEnd(&_stream);
-        return nullptr;
-    }
-  }
-
-  // If we encoded decompressed block size, we should have no bytes left
-  assert(compress_format_version != 2 || _stream.avail_out == 0);
-  assert(output_len >= _stream.avail_out);
-  *uncompressed_size = output_len - _stream.avail_out;
-  BZ2_bzDecompressEnd(&_stream);
-  return output;
-#else
-  (void)input_data;
-  (void)input_length;
-  (void)uncompressed_size;
-  (void)compress_format_version;
-  (void)allocator;
-  return nullptr;
-#endif
-}
-
-// compress_format_version == 1 -- decompressed size is included in the
-// block header using memcpy, which makes database non-portable)
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-inline bool LZ4_Compress(const CompressionInfo& info,
-                         uint32_t compress_format_version, const char* input,
-                         size_t length, ::std::string* output) {
-#ifdef LZ4
-  if (length > std::numeric_limits<uint32_t>::max()) {
-    // Can't compress more than 4GB
-    return false;
-  }
-
-  size_t output_header_len = 0;
-  if (compress_format_version == 2) {
-    // new encoding, using varint32 to store size information
-    output_header_len = compression::PutDecompressedSizeInfo(
-        output, static_cast<uint32_t>(length));
-  } else {
-    // legacy encoding, which is not really portable (depends on big/little
-    // endianness)
-    output_header_len = 8;
-    output->resize(output_header_len);
-    char* p = const_cast<char*>(output->c_str());
-    memcpy(p, &length, sizeof(length));
   }
-  int compress_bound = LZ4_compressBound(static_cast<int>(length));
-  output->resize(static_cast<size_t>(output_header_len + compress_bound));
-
-  int outlen;
-#if LZ4_VERSION_NUMBER >= 10400  // r124+
-  LZ4_stream_t* stream = LZ4_createStream();
-  Slice compression_dict = info.dict().GetRawDict();
-  if (compression_dict.size()) {
-    LZ4_loadDict(stream, compression_dict.data(),
-                 static_cast<int>(compression_dict.size()));
-  }
-#if LZ4_VERSION_NUMBER >= 10700  // r129+
-  int acceleration;
-  if (info.options().level < 0) {
-    acceleration = -info.options().level;
-  } else {
-    acceleration = 1;
-  }
-  outlen = LZ4_compress_fast_continue(
-      stream, input, &(*output)[output_header_len], static_cast<int>(length),
-      compress_bound, acceleration);
-#else  // up to r128
-  outlen = LZ4_compress_limitedOutput_continue(
-      stream, input, &(*output)[output_header_len], static_cast<int>(length),
-      compress_bound);
-#endif
-  LZ4_freeStream(stream);
-#else   // up to r123
-  outlen = LZ4_compress_limitedOutput(input, &(*output)[output_header_len],
-                                      static_cast<int>(length), compress_bound);
-#endif  // LZ4_VERSION_NUMBER >= 10400
-
-  if (outlen == 0) {
-    return false;
-  }
-  output->resize(static_cast<size_t>(output_header_len + outlen));
-  return true;
-#else  // LZ4
-  (void)info;
-  (void)compress_format_version;
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
 }
 
-// compress_format_version == 1 -- decompressed size is included in the
-// block header using memcpy, which makes database non-portable)
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-inline CacheAllocationPtr LZ4_Uncompress(const UncompressionInfo& info,
-                                         const char* input_data,
-                                         size_t input_length,
-                                         size_t* uncompressed_size,
-                                         uint32_t compress_format_version,
-                                         MemoryAllocator* allocator = nullptr) {
-#ifdef LZ4
-  uint32_t output_len = 0;
-  if (compress_format_version == 2) {
-    // new encoding, using varint32 to store size information
-    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
-                                              &output_len)) {
-      return nullptr;
-    }
-  } else {
-    // legacy encoding, which is not really portable (depends on big/little
-    // endianness)
-    if (input_length < 8) {
-      return nullptr;
-    }
-    if (port::kLittleEndian) {
-      memcpy(&output_len, input_data, sizeof(output_len));
-    } else {
-      memcpy(&output_len, input_data + 4, sizeof(output_len));
-    }
-    input_length -= 8;
-    input_data += 8;
-  }
-
-  auto output = AllocateBlock(output_len, allocator);
+// WART: does not match OptionsHelper::compression_type_string_map
+std::string CompressionTypeToString(CompressionType compression_type);
 
-  int decompress_bytes = 0;
+// WART: does not match OptionsHelper::compression_type_string_map
+CompressionType CompressionTypeFromString(std::string compression_type_str);
 
-#if LZ4_VERSION_NUMBER >= 10400  // r124+
-  LZ4_streamDecode_t* stream = LZ4_createStreamDecode();
-  const Slice& compression_dict = info.dict().GetRawDict();
-  if (compression_dict.size()) {
-    LZ4_setStreamDecode(stream, compression_dict.data(),
-                        static_cast<int>(compression_dict.size()));
-  }
-  decompress_bytes = LZ4_decompress_safe_continue(
-      stream, input_data, output.get(), static_cast<int>(input_length),
-      static_cast<int>(output_len));
-  LZ4_freeStreamDecode(stream);
-#else   // up to r123
-  decompress_bytes = LZ4_decompress_safe(input_data, output.get(),
-                                         static_cast<int>(input_length),
-                                         static_cast<int>(output_len));
-#endif  // LZ4_VERSION_NUMBER >= 10400
-
-  if (decompress_bytes < 0) {
-    return nullptr;
-  }
-  assert(decompress_bytes == static_cast<int>(output_len));
-  *uncompressed_size = decompress_bytes;
-  return output;
-#else  // LZ4
-  (void)info;
-  (void)input_data;
-  (void)input_length;
-  (void)uncompressed_size;
-  (void)compress_format_version;
-  (void)allocator;
-  return nullptr;
-#endif
-}
-
-// compress_format_version == 1 -- decompressed size is included in the
-// block header using memcpy, which makes database non-portable)
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-inline bool LZ4HC_Compress(const CompressionInfo& info,
-                           uint32_t compress_format_version, const char* input,
-                           size_t length, ::std::string* output) {
-#ifdef LZ4
-  if (length > std::numeric_limits<uint32_t>::max()) {
-    // Can't compress more than 4GB
-    return false;
-  }
-
-  size_t output_header_len = 0;
-  if (compress_format_version == 2) {
-    // new encoding, using varint32 to store size information
-    output_header_len = compression::PutDecompressedSizeInfo(
-        output, static_cast<uint32_t>(length));
-  } else {
-    // legacy encoding, which is not really portable (depends on big/little
-    // endianness)
-    output_header_len = 8;
-    output->resize(output_header_len);
-    char* p = const_cast<char*>(output->c_str());
-    memcpy(p, &length, sizeof(length));
-  }
-  int compress_bound = LZ4_compressBound(static_cast<int>(length));
-  output->resize(static_cast<size_t>(output_header_len + compress_bound));
-
-  int outlen;
-  int level;
-  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
-    level = 0;  // lz4hc.h says any value < 1 will be sanitized to default
-  } else {
-    level = info.options().level;
-  }
-#if LZ4_VERSION_NUMBER >= 10400  // r124+
-  LZ4_streamHC_t* stream = LZ4_createStreamHC();
-  LZ4_resetStreamHC(stream, level);
-  Slice compression_dict = info.dict().GetRawDict();
-  const char* compression_dict_data =
-      compression_dict.size() > 0 ? compression_dict.data() : nullptr;
-  size_t compression_dict_size = compression_dict.size();
-  if (compression_dict_data != nullptr) {
-    LZ4_loadDictHC(stream, compression_dict_data,
-                   static_cast<int>(compression_dict_size));
-  }
-
-#if LZ4_VERSION_NUMBER >= 10700  // r129+
-  outlen =
-      LZ4_compress_HC_continue(stream, input, &(*output)[output_header_len],
-                               static_cast<int>(length), compress_bound);
-#else   // r124-r128
-  outlen = LZ4_compressHC_limitedOutput_continue(
-      stream, input, &(*output)[output_header_len], static_cast<int>(length),
-      compress_bound);
-#endif  // LZ4_VERSION_NUMBER >= 10700
-  LZ4_freeStreamHC(stream);
-
-#elif LZ4_VERSION_MAJOR  // r113-r123
-  outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len],
-                                         static_cast<int>(length),
-                                         compress_bound, level);
-#else                    // up to r112
-  outlen =
-      LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len],
-                                   static_cast<int>(length), compress_bound);
-#endif                   // LZ4_VERSION_NUMBER >= 10400
-
-  if (outlen == 0) {
-    return false;
-  }
-  output->resize(static_cast<size_t>(output_header_len + outlen));
-  return true;
-#else  // LZ4
-  (void)info;
-  (void)compress_format_version;
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
-}
-
-#ifdef XPRESS
-inline bool XPRESS_Compress(const char* input, size_t length,
-                            std::string* output) {
-  return port::xpress::Compress(input, length, output);
-}
-#else
-inline bool XPRESS_Compress(const char* /*input*/, size_t /*length*/,
-                            std::string* /*output*/) {
-  return false;
-}
-#endif
-
-#ifdef XPRESS
-inline char* XPRESS_Uncompress(const char* input_data, size_t input_length,
-                               size_t* uncompressed_size) {
-  return port::xpress::Decompress(input_data, input_length, uncompressed_size);
-}
-#else
-inline char* XPRESS_Uncompress(const char* /*input_data*/,
-                               size_t /*input_length*/,
-                               size_t* /*uncompressed_size*/) {
-  return nullptr;
-}
-#endif
-
-inline bool ZSTD_Compress(const CompressionInfo& info, const char* input,
-                          size_t length, ::std::string* output) {
-#ifdef ZSTD
-  if (length > std::numeric_limits<uint32_t>::max()) {
-    // Can't compress more than 4GB
-    return false;
-  }
-
-  size_t output_header_len = compression::PutDecompressedSizeInfo(
-      output, static_cast<uint32_t>(length));
-
-  size_t compressBound = ZSTD_compressBound(length);
-  output->resize(static_cast<size_t>(output_header_len + compressBound));
-  size_t outlen = 0;
-  ZSTD_CCtx* context = info.context().ZSTDPreallocCtx();
-  assert(context != nullptr);
-  if (info.dict().GetDigestedZstdCDict() != nullptr) {
-    ZSTD_CCtx_refCDict(context, info.dict().GetDigestedZstdCDict());
-  } else {
-    ZSTD_CCtx_loadDictionary(context, info.dict().GetRawDict().data(),
-                             info.dict().GetRawDict().size());
-  }
-
-  // Compression level is set in `contex` during CreateNativeContext()
-  outlen = ZSTD_compress2(context, &(*output)[output_header_len], compressBound,
-                          input, length);
-  if (outlen == 0) {
-    return false;
-  }
-  output->resize(output_header_len + outlen);
-  return true;
-#else  // ZSTD
-  (void)info;
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
-}
-
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-// @param error_message If not null, will be set if decompression fails.
-//
-// Returns nullptr if decompression fails.
-inline CacheAllocationPtr ZSTD_Uncompress(
-    const UncompressionInfo& info, const char* input_data, size_t input_length,
-    size_t* uncompressed_size, MemoryAllocator* allocator = nullptr,
-    const char** error_message = nullptr) {
-#ifdef ZSTD
-  static const char* const kErrorDecodeOutputSize =
-      "Cannot decode output size.";
-  static const char* const kErrorOutputLenMismatch =
-      "Decompressed size does not match header.";
-  uint32_t output_len = 0;
-  if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
-                                            &output_len)) {
-    if (error_message) {
-      *error_message = kErrorDecodeOutputSize;
-    }
-    return nullptr;
-  }
-
-  CacheAllocationPtr output = AllocateBlock(output_len, allocator);
-  size_t actual_output_length = 0;
-  ZSTD_DCtx* context = info.context().GetZSTDContext();
-  assert(context != nullptr);
-#ifdef ROCKSDB_ZSTD_DDICT
-  if (info.dict().GetDigestedZstdDDict() != nullptr) {
-    actual_output_length = ZSTD_decompress_usingDDict(
-        context, output.get(), output_len, input_data, input_length,
-        info.dict().GetDigestedZstdDDict());
-  } else {
-#endif  // ROCKSDB_ZSTD_DDICT
-    actual_output_length = ZSTD_decompress_usingDict(
-        context, output.get(), output_len, input_data, input_length,
-        info.dict().GetRawDict().data(), info.dict().GetRawDict().size());
-#ifdef ROCKSDB_ZSTD_DDICT
-  }
-#endif  // ROCKSDB_ZSTD_DDICT
-  if (ZSTD_isError(actual_output_length)) {
-    if (error_message) {
-      *error_message = ZSTD_getErrorName(actual_output_length);
-    }
-    return nullptr;
-  } else if (actual_output_length != output_len) {
-    if (error_message) {
-      *error_message = kErrorOutputLenMismatch;
-    }
-    return nullptr;
-  }
-
-  *uncompressed_size = actual_output_length;
-  return output;
-#else  // ZSTD
-  (void)info;
-  (void)input_data;
-  (void)input_length;
-  (void)uncompressed_size;
-  (void)allocator;
-  (void)error_message;
-  return nullptr;
-#endif
-}
+std::string CompressionOptionsToString(
+    const CompressionOptions& compression_options);
 
 inline bool ZSTD_TrainDictionarySupported() {
 #ifdef ZSTD
@@ -1440,50 +530,6 @@ inline bool ZSTD_TrainDictionarySupported() {
 #endif
 }
 
-inline std::string ZSTD_TrainDictionary(const std::string& samples,
-                                        const std::vector<size_t>& sample_lens,
-                                        size_t max_dict_bytes) {
-#ifdef ZSTD
-  assert(samples.empty() == sample_lens.empty());
-  if (samples.empty()) {
-    return "";
-  }
-  std::string dict_data(max_dict_bytes, '\0');
-  size_t dict_len = ZDICT_trainFromBuffer(
-      &dict_data[0], max_dict_bytes, &samples[0], &sample_lens[0],
-      static_cast<unsigned>(sample_lens.size()));
-  if (ZDICT_isError(dict_len)) {
-    return "";
-  }
-  assert(dict_len <= max_dict_bytes);
-  dict_data.resize(dict_len);
-  return dict_data;
-#else
-  assert(false);
-  (void)samples;
-  (void)sample_lens;
-  (void)max_dict_bytes;
-  return "";
-#endif  // ZSTD
-}
-
-inline std::string ZSTD_TrainDictionary(const std::string& samples,
-                                        size_t sample_len_shift,
-                                        size_t max_dict_bytes) {
-#ifdef ZSTD
-  // skips potential partial sample at the end of "samples"
-  size_t num_samples = samples.size() >> sample_len_shift;
-  std::vector<size_t> sample_lens(num_samples, size_t(1) << sample_len_shift);
-  return ZSTD_TrainDictionary(samples, sample_lens, max_dict_bytes);
-#else
-  assert(false);
-  (void)samples;
-  (void)sample_len_shift;
-  (void)max_dict_bytes;
-  return "";
-#endif  // ZSTD
-}
-
 inline bool ZSTD_FinalizeDictionarySupported() {
 #ifdef ROCKSDB_ZDICT_FINALIZE
   return true;
@@ -1492,119 +538,16 @@ inline bool ZSTD_FinalizeDictionarySupported() {
 #endif
 }
 
-inline std::string ZSTD_FinalizeDictionary(
-    const std::string& samples, const std::vector<size_t>& sample_lens,
-    size_t max_dict_bytes, int level) {
-#ifdef ROCKSDB_ZDICT_FINALIZE
-  assert(samples.empty() == sample_lens.empty());
-  if (samples.empty()) {
-    return "";
-  }
-  if (level == CompressionOptions::kDefaultCompressionLevel) {
-    // NB: ZSTD_CLEVEL_DEFAULT is historically == 3
-    level = ZSTD_CLEVEL_DEFAULT;
-  }
-  std::string dict_data(max_dict_bytes, '\0');
-  size_t dict_len = ZDICT_finalizeDictionary(
-      dict_data.data(), max_dict_bytes, samples.data(),
-      std::min(static_cast<size_t>(samples.size()), max_dict_bytes),
-      samples.data(), sample_lens.data(),
-      static_cast<unsigned>(sample_lens.size()),
-      {level, 0 /* notificationLevel */, 0 /* dictID */});
-  if (ZDICT_isError(dict_len)) {
-    return "";
-  } else {
-    assert(dict_len <= max_dict_bytes);
-    dict_data.resize(dict_len);
-    return dict_data;
-  }
-#else
-  assert(false);
-  (void)samples;
-  (void)sample_lens;
-  (void)max_dict_bytes;
-  (void)level;
-  return "";
-#endif  // ROCKSDB_ZDICT_FINALIZE
-}
-
-inline bool CompressData(const Slice& raw,
-                         const CompressionInfo& compression_info,
-                         uint32_t compress_format_version,
-                         std::string* compressed_output) {
-  bool ret = false;
-
-  // Will return compressed block contents if (1) the compression method is
-  // supported in this platform and (2) the compression rate is "good enough".
-  switch (compression_info.type()) {
-    case kSnappyCompression:
-      ret = Snappy_Compress(compression_info, raw.data(), raw.size(),
-                            compressed_output);
-      break;
-    case kZlibCompression:
-      ret = Zlib_Compress(compression_info, compress_format_version, raw.data(),
-                          raw.size(), compressed_output);
-      break;
-    case kBZip2Compression:
-      ret = BZip2_Compress(compression_info, compress_format_version,
-                           raw.data(), raw.size(), compressed_output);
-      break;
-    case kLZ4Compression:
-      ret = LZ4_Compress(compression_info, compress_format_version, raw.data(),
-                         raw.size(), compressed_output);
-      break;
-    case kLZ4HCCompression:
-      ret = LZ4HC_Compress(compression_info, compress_format_version,
-                           raw.data(), raw.size(), compressed_output);
-      break;
-    case kXpressCompression:
-      ret = XPRESS_Compress(raw.data(), raw.size(), compressed_output);
-      break;
-    case kZSTD:
-      ret = ZSTD_Compress(compression_info, raw.data(), raw.size(),
-                          compressed_output);
-      break;
-    default:
-      // Do not recognize this compression type
-      break;
-  }
-
-  TEST_SYNC_POINT_CALLBACK("CompressData:TamperWithReturnValue",
-                           static_cast<void*>(&ret));
-
-  return ret;
-}
-
-inline CacheAllocationPtr UncompressData(
-    const UncompressionInfo& uncompression_info, const char* data, size_t n,
-    size_t* uncompressed_size, uint32_t compress_format_version,
-    MemoryAllocator* allocator = nullptr,
-    const char** error_message = nullptr) {
-  switch (uncompression_info.type()) {
-    case kSnappyCompression:
-      return Snappy_Uncompress(data, n, uncompressed_size, allocator);
-    case kZlibCompression:
-      return Zlib_Uncompress(uncompression_info, data, n, uncompressed_size,
-                             compress_format_version, allocator);
-    case kBZip2Compression:
-      return BZip2_Uncompress(data, n, uncompressed_size,
-                              compress_format_version, allocator);
-    case kLZ4Compression:
-    case kLZ4HCCompression:
-      return LZ4_Uncompress(uncompression_info, data, n, uncompressed_size,
-                            compress_format_version, allocator);
-    case kXpressCompression:
-      // XPRESS allocates memory internally, thus no support for custom
-      // allocator.
-      return CacheAllocationPtr(XPRESS_Uncompress(data, n, uncompressed_size));
-    case kZSTD:
-      // TODO(cbi): error message handling for other compression algorithms.
-      return ZSTD_Uncompress(uncompression_info, data, n, uncompressed_size,
-                             allocator, error_message);
-    default:
-      return CacheAllocationPtr();
-  }
-}
+// The new compression APIs intentionally make it difficult to generate
+// compressed data larger than the original. (It is better to store the
+// uncompressed version in that case.) For legacy cases that must store
+// compressed data even when larger than the uncompressed, this is a convenient
+// wrapper to support that, with a compressor from BuiltinCompressionManager and
+// a GrowableBuffer.
+Status LegacyForceBuiltinCompression(
+    Compressor& builtin_compressor,
+    Compressor::ManagedWorkingArea* working_area, Slice from,
+    GrowableBuffer* to);
 
 // Records the compression type for subsequent WAL records.
 class CompressionTypeRecord {
diff --git a/util/compression_test.cc b/util/compression_test.cc
new file mode 100644
index 000000000000..e87e4195feb2
--- /dev/null
+++ b/util/compression_test.cc
@@ -0,0 +1,2373 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Testing various compression features
+
+#include <cstdlib>
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "table/block_based/block_builder.h"
+#include "test_util/testutil.h"
+#include "util/auto_tune_compressor.h"
+#include "util/coding.h"
+#include "util/random.h"
+#include "util/simple_mixed_compressor.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBCompressionTest : public DBTestBase {
+ public:
+  DBCompressionTest() : DBTestBase("compression_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBCompressionTest, PresetCompressionDict) {
+  // Verifies that compression ratio improves when dictionary is enabled, and
+  // improves even further when the dictionary is trained by ZSTD.
+  const size_t kBlockSizeBytes = 4 << 10;
+  const size_t kL0FileBytes = 128 << 10;
+  const size_t kApproxPerBlockOverheadBytes = 50;
+  const int kNumL0Files = 5;
+
+  Options options;
+  // Make sure to use any custom env that the test is configured with.
+  options.env = CurrentOptions().env;
+  options.allow_concurrent_memtable_write = false;
+  options.arena_block_size = kBlockSizeBytes;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
+  options.num_levels = 2;
+  options.target_file_size_base = kL0FileBytes;
+  options.target_file_size_multiplier = 2;
+  options.write_buffer_size = kL0FileBytes;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = kBlockSizeBytes;
+  std::vector<CompressionType> compression_types;
+  if (Zlib_Supported()) {
+    compression_types.push_back(kZlibCompression);
+  }
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  compression_types.push_back(kLZ4Compression);
+  compression_types.push_back(kLZ4HCCompression);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+  if (ZSTD_Supported()) {
+    compression_types.push_back(kZSTD);
+  }
+
+  enum DictionaryTypes : int {
+    kWithoutDict,
+    kWithDict,
+    kWithZSTDfinalizeDict,
+    kWithZSTDTrainedDict,
+    kDictEnd,
+  };
+
+  for (auto compression_type : compression_types) {
+    options.compression = compression_type;
+    size_t bytes_without_dict = 0;
+    size_t bytes_with_dict = 0;
+    size_t bytes_with_zstd_finalize_dict = 0;
+    size_t bytes_with_zstd_trained_dict = 0;
+    for (int i = kWithoutDict; i < kDictEnd; i++) {
+      // First iteration: compress without preset dictionary
+      // Second iteration: compress with preset dictionary
+      // Third iteration (zstd only): compress with zstd-trained dictionary
+      //
+      // To make sure the compression dictionary has the intended effect, we
+      // verify the compressed size is smaller in successive iterations. Also in
+      // the non-first iterations, verify the data we get out is the same data
+      // we put in.
+      switch (i) {
+        case kWithoutDict:
+          options.compression_opts.max_dict_bytes = 0;
+          options.compression_opts.zstd_max_train_bytes = 0;
+          break;
+        case kWithDict:
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = 0;
+          break;
+        case kWithZSTDfinalizeDict:
+          if (compression_type != kZSTD ||
+              !ZSTD_FinalizeDictionarySupported()) {
+            continue;
+          }
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+          options.compression_opts.use_zstd_dict_trainer = false;
+          break;
+        case kWithZSTDTrainedDict:
+          if (compression_type != kZSTD || !ZSTD_TrainDictionarySupported()) {
+            continue;
+          }
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+          options.compression_opts.use_zstd_dict_trainer = true;
+          break;
+        default:
+          assert(false);
+      }
+
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      CreateAndReopenWithCF({"pikachu"}, options);
+      Random rnd(301);
+      std::string seq_datas[10];
+      for (int j = 0; j < 10; ++j) {
+        seq_datas[j] =
+            rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
+      }
+
+      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+      for (int j = 0; j < kNumL0Files; ++j) {
+        for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
+          auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
+          ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
+                        seq_datas[(key_num / 10) % 10]));
+        }
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+        ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
+      }
+      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                                            true /* disallow_trivial_move */));
+      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+      // Get the live sst files size
+      size_t total_sst_bytes = TotalSize(1);
+      if (i == kWithoutDict) {
+        bytes_without_dict = total_sst_bytes;
+      } else if (i == kWithDict) {
+        bytes_with_dict = total_sst_bytes;
+      } else if (i == kWithZSTDfinalizeDict) {
+        bytes_with_zstd_finalize_dict = total_sst_bytes;
+      } else if (i == kWithZSTDTrainedDict) {
+        bytes_with_zstd_trained_dict = total_sst_bytes;
+      }
+
+      for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
+           j++) {
+        ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
+      }
+      if (i == kWithDict) {
+        ASSERT_GT(bytes_without_dict, bytes_with_dict);
+      } else if (i == kWithZSTDTrainedDict) {
+        // In zstd compression, it is sometimes possible that using a finalized
+        // dictionary does not get as good a compression ratio as raw content
+        // dictionary. But using a dictionary should always get better
+        // compression ratio than not using one.
+        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict ||
+                    bytes_without_dict > bytes_with_zstd_finalize_dict);
+      } else if (i == kWithZSTDTrainedDict) {
+        // In zstd compression, it is sometimes possible that using a trained
+        // dictionary does not get as good a compression ratio as without
+        // training.
+        // But using a dictionary (with or without training) should always get
+        // better compression ratio than not using one.
+        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
+                    bytes_without_dict > bytes_with_zstd_trained_dict);
+      }
+
+      DestroyAndReopen(options);
+    }
+  }
+}
+
+TEST_F(DBCompressionTest, PresetCompressionDictLocality) {
+  if (!ZSTD_Supported()) {
+    return;
+  }
+  // Verifies that compression dictionary is generated from local data. The
+  // verification simply checks all output SSTs have different compression
+  // dictionaries. We do not verify effectiveness as that'd likely be flaky in
+  // the future.
+  const int kNumEntriesPerFile = 1 << 10;  // 1KB
+  const int kNumBytesPerEntry = 1 << 10;   // 1KB
+  const int kNumFiles = 4;
+  Options options = CurrentOptions();
+  options.compression = kZSTD;
+  options.compression_opts.max_dict_bytes = 1 << 14;        // 16KB
+  options.compression_opts.zstd_max_train_bytes = 1 << 18;  // 256KB
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumEntriesPerFile; ++j) {
+      ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
+                    rnd.RandomString(kNumBytesPerEntry)));
+    }
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(1);
+    ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
+  }
+
+  // Store all the dictionaries generated during a full compaction.
+  std::vector<std::string> compression_dicts;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+      [&](void* arg) {
+        compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  CompactRangeOptions compact_range_opts;
+  compact_range_opts.bottommost_level_compaction =
+      BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+  // Dictionary compression should not be so good as to compress four totally
+  // random files into one. If it does then there's probably something wrong
+  // with the test.
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  // Furthermore, there should be one compression dictionary generated per file.
+  // And they should all be different from each other.
+  ASSERT_EQ(NumTableFilesAtLevel(1),
+            static_cast<int>(compression_dicts.size()));
+  for (size_t i = 1; i < compression_dicts.size(); ++i) {
+    std::string& a = compression_dicts[i - 1];
+    std::string& b = compression_dicts[i];
+    size_t alen = a.size();
+    size_t blen = b.size();
+    ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
+  }
+}
+
+static std::string CompressibleString(Random* rnd, int len) {
+  std::string r;
+  test::CompressibleString(rnd, 0.8, len, &r);
+  return r;
+}
+
+TEST_F(DBCompressionTest, DynamicLevelCompressionPerLevel) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  const int kNKeys = 120;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+
+  Random rnd(301);
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 20480;
+  options.write_buffer_size = 20480;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 20480;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 102400;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
+  options.statistics = CreateDBStatistics();
+
+  options.compression_per_level.resize(3);
+  // No compression for L0
+  options.compression_per_level[0] = kNoCompression;
+  // No compression for the Ln whre L0 is compacted to
+  options.compression_per_level[1] = kNoCompression;
+  // Snappy compression for Ln+1
+  options.compression_per_level[2] = kSnappyCompression;
+
+  OnFileDeletionListener* listener = new OnFileDeletionListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
+  // be compressed, so there shouldn't be any compression.
+  for (int i = 0; i < 20; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  ASSERT_TRUE(NumTableFilesAtLevel(0) > 0 || NumTableFilesAtLevel(4) > 0);
+
+  // Verify there was no compression
+  auto num_block_compressed =
+      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+  ASSERT_EQ(num_block_compressed, 0);
+
+  // Insert 400KB and there will be some files end up in L3. According to the
+  // above compression settings for each level, there will be some compression.
+  ASSERT_OK(options.statistics->Reset());
+  ASSERT_EQ(num_block_compressed, 0);
+  for (int i = 20; i < 120; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GE(NumTableFilesAtLevel(3), 1);
+  ASSERT_GE(NumTableFilesAtLevel(4), 1);
+
+  // Verify there was compression
+  num_block_compressed =
+      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+  ASSERT_GT(num_block_compressed, 0);
+
+  // Make sure data in files in L3 is not compacted by removing all files
+  // in L4 and calculate number of rows
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  // Ensure that L1+ files are non-overlapping and together with L0 encompass
+  // full key range between smallestkey and largestkey from CF file metadata.
+  int largestkey_in_prev_level = -1;
+  int keys_found = 0;
+  for (int level = (int)cf_meta.levels.size() - 1; level >= 0; level--) {
+    int files_in_level = (int)cf_meta.levels[level].files.size();
+    int largestkey_in_prev_file = -1;
+    for (int j = 0; j < files_in_level; j++) {
+      int smallestkey = IdFromKey(cf_meta.levels[level].files[j].smallestkey);
+      int largestkey = IdFromKey(cf_meta.levels[level].files[j].largestkey);
+      int num_entries = (int)cf_meta.levels[level].files[j].num_entries;
+      ASSERT_EQ(num_entries, largestkey - smallestkey + 1);
+      keys_found += num_entries;
+      if (level > 0) {
+        if (j == 0) {
+          ASSERT_GT(smallestkey, largestkey_in_prev_level);
+        }
+        if (j > 0) {
+          ASSERT_GT(smallestkey, largestkey_in_prev_file);
+        }
+        if (j == files_in_level - 1) {
+          largestkey_in_prev_level = largestkey;
+        }
+      }
+      largestkey_in_prev_file = largestkey;
+    }
+  }
+  ASSERT_EQ(keys_found, kNKeys);
+
+  for (const auto& file : cf_meta.levels[4].files) {
+    listener->SetExpectedFileName(dbname_ + file.name);
+    const RangeOpt ranges(file.smallestkey, file.largestkey);
+    // Given verification from above, we're guaranteed that by deleting all the
+    // files in [<smallestkey>, <largestkey>] range, we're effectively deleting
+    // that very single file and nothing more.
+    EXPECT_OK(dbfull()->DeleteFilesInRanges(dbfull()->DefaultColumnFamily(),
+                                            &ranges, true /* include_end */));
+  }
+  listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
+
+  int num_keys = 0;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    num_keys++;
+  }
+  ASSERT_OK(iter->status());
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GE(NumTableFilesAtLevel(3), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(4), 0);
+
+  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
+}
+
+TEST_F(DBCompressionTest, DynamicLevelCompressionPerLevel2) {
+  if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
+    return;
+  }
+  const int kNKeys = 500;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  RandomShuffle(std::begin(keys), std::end(keys));
+
+  Random rnd(301);
+  Options options;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 6000000;
+  options.write_buffer_size = 600000;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+  options.target_file_size_base = 20;
+  options.env = env_;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 200;
+  options.max_bytes_for_level_multiplier = 8;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
+  std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
+  options.table_factory = mtf;
+
+  options.compression_per_level.resize(3);
+  options.compression_per_level[0] = kNoCompression;
+  options.compression_per_level[1] = kLZ4Compression;
+  options.compression_per_level[2] = kZlibCompression;
+
+  DestroyAndReopen(options);
+  // When base level is L4, L4 is LZ4.
+  std::atomic<int> num_zlib(0);
+  std::atomic<int> num_lz4(0);
+  std::atomic<int> num_no(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = static_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4) {
+          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = static_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < 100; i++) {
+    std::string value = rnd.RandomString(200);
+    ASSERT_OK(Put(Key(keys[i]), value));
+    if (i % 25 == 24) {
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+  }
+
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), 0);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  int prev_num_files_l4 = NumTableFilesAtLevel(4);
+
+  // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib
+  num_lz4.store(0);
+  num_no.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = static_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4 && compaction->start_level() == 3) {
+          ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
+          num_zlib.fetch_add(1);
+        } else {
+          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = static_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 101; i < 500; i++) {
+    std::string value = rnd.RandomString(200);
+    ASSERT_OK(Put(Key(keys[i]), value));
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GT(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  ASSERT_GT(num_zlib.load(), 0);
+}
+
+class PresetCompressionDictTest
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<CompressionType, bool>> {
+ public:
+  PresetCompressionDictTest()
+      : DBTestBase("db_test2", false /* env_do_fsync */),
+        compression_type_(std::get<0>(GetParam())),
+        bottommost_(std::get<1>(GetParam())) {}
+
+ protected:
+  const CompressionType compression_type_;
+  const bool bottommost_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBCompressionTest, PresetCompressionDictTest,
+    ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()),
+                       ::testing::Bool()));
+
+TEST_P(PresetCompressionDictTest, Flush) {
+  // Verifies that dictionary is generated and written during flush only when
+  // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the
+  // size of the dictionary is within expectations according to the limit on
+  // buffering set by `CompressionOptions::max_dict_buffer_bytes`.
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile));
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (size_t i = 0; i <= kKeysPerFile; ++i) {
+    ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(kValueLen)));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+  // compression dictionary exists since dictionaries would be preloaded when
+  // the flush finishes.
+  if (bottommost_) {
+    // Flush is never considered bottommost. This should change in the future
+    // since flushed files may have nothing underneath them, like the one in
+    // this test case.
+    ASSERT_EQ(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        0);
+  } else {
+    ASSERT_GT(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        0);
+    ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD), 1);
+    // Although we stop buffering after `kBlockLen` bytes, there may be up to
+    // two blocks of data included in the dictionary since we only check limit
+    // after each block is built. And because block cache charges for bytes used
+    // by ZSTD's digested dictionary, we need a larger factor for the memory
+    // overheads in that case.
+    ASSERT_LE(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        (compression_type_ == kZSTD ? 10 : 2) * kBlockLen);
+  }
+}
+
+TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
+  // Verifies that dictionary is generated and written during compaction to
+  // non-bottommost level only when `ColumnFamilyOptions::compression` enables
+  // dictionary. Also verifies the size of the dictionary is within expectations
+  // according to the limit on buffering set by
+  // `CompressionOptions::max_dict_buffer_bytes`.
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (size_t j = 0; j <= kKeysPerFile; ++j) {
+    ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (size_t j = 0; j <= kKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("2,0,1", FilesPerLevel(0));
+
+  PopTicker(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  PopTicker(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+
+  // This L0->L1 compaction merges the two L0 files into L1. The produced L1
+  // file is not bottommost due to the existing L2 file covering the same key-
+  // range.
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_EQ("0,1,1", FilesPerLevel(0));
+  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+  // compression dictionary exists since dictionaries would be preloaded when
+  // the compaction finishes.
+  if (bottommost_) {
+    ASSERT_EQ(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        0);
+  } else {
+    ASSERT_GT(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        0);
+    ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD), 1);
+    // Although we stop buffering after `kBlockLen` bytes, there may be up to
+    // two blocks of data included in the dictionary since we only check limit
+    // after each block is built. And because block cache charges for bytes used
+    // by ZSTD's digested dictionary, we need a larger factor for the memory
+    // overheads in that case.
+    ASSERT_LE(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        (compression_type_ == kZSTD ? 10 : 2) * kBlockLen);
+  }
+}
+
+TEST_P(PresetCompressionDictTest, CompactBottommost) {
+  // Verifies that dictionary is generated and written during compaction to
+  // non-bottommost level only when either `ColumnFamilyOptions::compression` or
+  // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also
+  // verifies the size of the dictionary is within expectations according to the
+  // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`.
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    for (size_t j = 0; j <= kKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("2", FilesPerLevel(0));
+
+  PopTicker(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  PopTicker(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+
+  CompactRangeOptions cro;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_GT(
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+      0);
+  ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD), 1);
+  // Although we stop buffering after `kBlockLen` bytes, there may be up to
+  // two blocks of data included in the dictionary since we only check limit
+  // after each block is built. And because block cache charges for bytes used
+  // by ZSTD's digested dictionary, we need a larger factor for the memory
+  // overheads in that case.
+  ASSERT_LE(
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+      (compression_type_ == kZSTD ? 10 : 2) * kBlockLen);
+}
+
+class CompactionCompressionListener : public EventListener {
+ public:
+  explicit CompactionCompressionListener(Options* db_options)
+      : db_options_(db_options) {}
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+    // Figure out last level with files
+    int bottommost_level = 0;
+    for (int level = 0; level < db->NumberLevels(); level++) {
+      std::string files_at_level;
+      ASSERT_TRUE(
+          db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level),
+                          &files_at_level));
+      if (files_at_level != "0") {
+        bottommost_level = level;
+      }
+    }
+
+    if (db_options_->bottommost_compression != kDisableCompressionOption &&
+        ci.output_level == bottommost_level) {
+      ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
+    } else if (db_options_->compression_per_level.size() != 0) {
+      ASSERT_EQ(ci.compression,
+                db_options_->compression_per_level[ci.output_level]);
+    } else {
+      ASSERT_EQ(ci.compression, db_options_->compression);
+    }
+    max_level_checked = std::max(max_level_checked, ci.output_level);
+  }
+
+  int max_level_checked = 0;
+  const Options* db_options_;
+};
+
+enum CompressionFailureType {
+  kTestCompressionFail,
+  kTestDecompressionFail,
+  kTestDecompressionCorruption,
+  kTestStartOfFinishFail,
+};
+
+class CompressionFailuresTest
+    : public DBCompressionTest,
+      public testing::WithParamInterface<std::tuple<
+          CompressionFailureType, CompressionType, uint32_t, uint32_t>> {
+ public:
+  CompressionFailuresTest() {
+    std::tie(compression_failure_type_, compression_type_,
+             compression_max_dict_bytes_, compression_parallel_threads_) =
+        GetParam();
+  }
+
+  CompressionFailureType compression_failure_type_ = kTestCompressionFail;
+  CompressionType compression_type_ = kNoCompression;
+  uint32_t compression_max_dict_bytes_ = 0;
+  uint32_t compression_parallel_threads_ = 0;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBCompressionTest, CompressionFailuresTest,
+    ::testing::Combine(::testing::Values(kTestCompressionFail,
+                                         kTestDecompressionFail,
+                                         kTestDecompressionCorruption,
+                                         kTestStartOfFinishFail),
+                       ::testing::ValuesIn(GetSupportedCompressions()),
+                       ::testing::Values(0, 10), ::testing::Values(1, 4)));
+
+TEST_P(CompressionFailuresTest, CompressionFailures) {
+  if (compression_type_ == kNoCompression) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 1024;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 7;
+  options.max_background_compactions = 1;
+  options.target_file_size_base = 512;
+
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 512;
+  table_options.verify_compression = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  options.compression = compression_type_;
+  options.compression_opts.parallel_threads = compression_parallel_threads_;
+  options.compression_opts.max_dict_bytes = compression_max_dict_bytes_;
+  options.bottommost_compression_opts.parallel_threads =
+      compression_parallel_threads_;
+  options.bottommost_compression_opts.max_dict_bytes =
+      compression_max_dict_bytes_;
+
+  if (compression_failure_type_ == kTestCompressionFail) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTableBuilder::CompressAndVerifyBlock:TamperWithResultType",
+        [](void* arg) {
+          CompressionType* ret = static_cast<CompressionType*>(arg);
+          *ret = kNoCompression;
+        });
+  } else if (compression_failure_type_ == kTestDecompressionFail) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "DecompressBlockData:TamperWithReturnValue", [](void* arg) {
+          Status* ret = static_cast<Status*>(arg);
+          ASSERT_OK(*ret);
+          *ret = Status::Corruption("kTestDecompressionFail");
+        });
+  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "DecompressBlockData:TamperWithDecompressionOutput", [](void* arg) {
+          BlockContents* contents = static_cast<BlockContents*>(arg);
+          // Ensure uncompressed data != original data
+          const size_t len = contents->data.size() + 1;
+          std::unique_ptr<char[]> fake_data(new char[len]());
+          *contents = BlockContents(std::move(fake_data), len);
+        });
+  } else if (compression_failure_type_ == kTestStartOfFinishFail) {
+    if (compression_parallel_threads_ <= 1) {
+      // skip this configuration
+      return;
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTableBuilder::Finish:ParallelIOStatus", [&](void* arg) {
+          *static_cast<IOStatus*>(arg) = IOStatus::Corruption("Seeded failure");
+        });
+  } else {
+    abort();
+  }
+
+  std::map<std::string, std::string> key_value_written;
+
+  const int kKeySize = 5;
+  const int kValUnitSize = 16;
+  const int kValSize = 256;
+  Random rnd(405);
+
+  Status s = Status::OK();
+
+  DestroyAndReopen(options);
+  // Write 10 random files
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 5; j++) {
+      std::string key = rnd.RandomString(kKeySize);
+      // Ensure good compression ratio
+      std::string valueUnit = rnd.RandomString(kValUnitSize);
+      std::string value;
+      for (int k = 0; k < kValSize; k += kValUnitSize) {
+        value += valueUnit;
+      }
+      s = Put(key, value);
+      if (compression_failure_type_ == kTestCompressionFail) {
+        key_value_written[key] = value;
+        ASSERT_OK(s);
+      }
+    }
+    s = Flush();
+    if (compression_failure_type_ == kTestCompressionFail) {
+      ASSERT_OK(s);
+    }
+    s = dbfull()->TEST_WaitForCompact();
+    if (compression_failure_type_ == kTestCompressionFail) {
+      ASSERT_OK(s);
+    }
+    if (i == 4) {
+      // Make compression fail at the mid of table building
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    }
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  auto st = s.getState();
+  if (compression_failure_type_ == kTestCompressionFail) {
+    // Should be kNoCompression, check content consistency
+    std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+    for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+      std::string key = db_iter->key().ToString();
+      std::string value = db_iter->value().ToString();
+      ASSERT_NE(key_value_written.find(key), key_value_written.end());
+      ASSERT_EQ(key_value_written[key], value);
+      key_value_written.erase(key);
+    }
+    ASSERT_OK(db_iter->status());
+    ASSERT_EQ(0, key_value_written.size());
+  } else if (compression_failure_type_ == kTestDecompressionFail) {
+    ASSERT_EQ(s.code(), Status::kCorruption);
+    ASSERT_NE(st, nullptr);
+    ASSERT_EQ(std::string(st), "Could not decompress: kTestDecompressionFail");
+  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+    ASSERT_EQ(s.code(), Status::kCorruption);
+    ASSERT_NE(st, nullptr);
+    ASSERT_EQ(std::string(st),
+              "Decompressed block did not match pre-compression block");
+  } else if (compression_failure_type_ == kTestStartOfFinishFail) {
+    ASSERT_EQ(s.code(), Status::kCorruption);
+    ASSERT_NE(st, nullptr);
+    ASSERT_EQ(std::string(st), "Seeded failure");
+  }
+}
+
+TEST_F(DBCompressionTest, CompressionOptions) {
+  if (!Zlib_Supported() || !Snappy_Supported()) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 100;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 7;
+  options.max_background_compactions = 1;
+
+  CompactionCompressionListener* listener =
+      new CompactionCompressionListener(&options);
+  options.listeners.emplace_back(listener);
+
+  const int kKeySize = 5;
+  const int kValSize = 20;
+  Random rnd(301);
+
+  std::vector<uint32_t> compression_parallel_threads = {1, 4};
+
+  std::map<std::string, std::string> key_value_written;
+
+  for (int iter = 0; iter <= 2; iter++) {
+    listener->max_level_checked = 0;
+
+    if (iter == 0) {
+      // Use different compression algorithms for different levels but
+      // always use Zlib for bottommost level
+      options.compression_per_level = {kNoCompression,     kNoCompression,
+                                       kNoCompression,     kSnappyCompression,
+                                       kSnappyCompression, kSnappyCompression,
+                                       kZlibCompression};
+      options.compression = kNoCompression;
+      options.bottommost_compression = kZlibCompression;
+    } else if (iter == 1) {
+      // Use Snappy except for bottommost level use ZLib
+      options.compression_per_level = {};
+      options.compression = kSnappyCompression;
+      options.bottommost_compression = kZlibCompression;
+    } else if (iter == 2) {
+      // Use Snappy everywhere
+      options.compression_per_level = {};
+      options.compression = kSnappyCompression;
+      options.bottommost_compression = kDisableCompressionOption;
+    }
+
+    for (auto num_threads : compression_parallel_threads) {
+      options.compression_opts.parallel_threads = num_threads;
+      options.bottommost_compression_opts.parallel_threads = num_threads;
+
+      DestroyAndReopen(options);
+      // Write 10 random files
+      for (int i = 0; i < 10; i++) {
+        for (int j = 0; j < 5; j++) {
+          std::string key = rnd.RandomString(kKeySize);
+          std::string value = rnd.RandomString(kValSize);
+          key_value_written[key] = value;
+          ASSERT_OK(Put(key, value));
+        }
+        ASSERT_OK(Flush());
+        ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      }
+
+      // Make sure that we wrote enough to check all 7 levels
+      ASSERT_EQ(listener->max_level_checked, 6);
+
+      // Make sure database content is the same as key_value_written
+      std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+      for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+        std::string key = db_iter->key().ToString();
+        std::string value = db_iter->value().ToString();
+        ASSERT_NE(key_value_written.find(key), key_value_written.end());
+        ASSERT_EQ(key_value_written[key], value);
+        key_value_written.erase(key);
+      }
+      ASSERT_OK(db_iter->status());
+      ASSERT_EQ(0, key_value_written.size());
+    }
+  }
+}
+
+TEST_F(DBCompressionTest, RoundRobinManager) {
+  if (ZSTD_Supported()) {
+    auto mgr =
+        std::make_shared<RoundRobinManager>(GetBuiltinV2CompressionManager());
+
+    std::vector<std::string> values;
+    for (bool use_wrapper : {true}) {
+      SCOPED_TRACE((use_wrapper ? "With " : "No ") + std::string("wrapper"));
+
+      Options options = CurrentOptions();
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+      BlockBasedTableOptions bbto;
+      bbto.enable_index_compression = false;
+      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+      options.compression_manager = use_wrapper ? mgr : nullptr;
+      DestroyAndReopen(options);
+
+      Random rnd(301);
+      constexpr int kCount = 13;
+
+      // Highly compressible blocks, except 1 non-compressible. Half of the
+      // compressible are morked for bypass and 1 marked for rejection. Values
+      // are large enough to ensure just 1 k-v per block.
+      for (int i = 0; i < kCount; ++i) {
+        std::string value;
+        if (i == 6) {
+          // One non-compressible block
+          value = rnd.RandomBinaryString(20000);
+        } else {
+          test::CompressibleString(&rnd, 0.1, 20000, &value);
+        }
+        values.push_back(value);
+        ASSERT_OK(Put(Key(i), value));
+        ASSERT_EQ(Get(Key(i)), value);
+      }
+      ASSERT_OK(Flush());
+
+      // Ensure well-formed for reads
+      for (int i = 0; i < kCount; ++i) {
+        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+        ASSERT_EQ(Get(Key(i)), values[i]);
+      }
+      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+    }
+  }
+}
+
+TEST_F(DBCompressionTest, RandomMixedCompressionManager) {
+  if (ZSTD_Supported()) {
+    auto mgr = std::make_shared<RandomMixedCompressionManager>(
+        GetBuiltinV2CompressionManager());
+    std::vector<std::string> values;
+    for (bool use_wrapper : {true}) {
+      SCOPED_TRACE((use_wrapper ? "With " : "No ") + std::string("wrapper"));
+
+      Options options = CurrentOptions();
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+      BlockBasedTableOptions bbto;
+      bbto.enable_index_compression = false;
+      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+      options.compression_manager = use_wrapper ? mgr : nullptr;
+      DestroyAndReopen(options);
+
+      Random rnd(301);
+      constexpr int kCount = 13;
+
+      // Highly compressible blocks, except 1 non-compressible. Half of the
+      // compressible are morked for bypass and 1 marked for rejection. Values
+      // are large enough to ensure just 1 k-v per block.
+      for (int i = 0; i < kCount; ++i) {
+        std::string value;
+        if (i == 6) {
+          // One non-compressible block
+          value = rnd.RandomBinaryString(20000);
+        } else {
+          test::CompressibleString(&rnd, 0.1, 20000, &value);
+        }
+        values.push_back(value);
+        ASSERT_OK(Put(Key(i), value));
+        ASSERT_EQ(Get(Key(i)), value);
+      }
+      ASSERT_OK(Flush());
+
+      // Ensure well-formed for reads
+      for (int i = 0; i < kCount; ++i) {
+        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+        ASSERT_EQ(Get(Key(i)), values[i]);
+      }
+      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+    }
+  }
+}
+
+namespace {
+// Template parameter to distinguish data blocks vs. v4+ index blocks
+template <bool kIndexBlockV4>
+static Status ValidateRocksBlock(Slice data) {
+  const char* src = data.data();
+  size_t srcSize = data.size();
+  const char* const block_type_str =
+      kIndexBlockV4 ? "Index block" : "Data block";
+
+  // Minimum RocksDB block content size: at least 1 entry + restarts
+  if (srcSize < 8) {
+    return Status::Corruption(std::string(block_type_str) + " too small");
+  }
+
+  uint32_t numRestarts = DecodeFixed32(src + srcSize - sizeof(uint32_t));
+
+  // Sanity check: num_restarts should be reasonable
+  // TODO: also support data block hash index
+  if (numRestarts > srcSize / 4 || numRestarts == 0) {
+    return Status::Corruption(std::string("Invalid num_restarts in ") +
+                              block_type_str);
+  }
+
+  size_t restartsSize = numRestarts * sizeof(uint32_t) + sizeof(uint32_t);
+  if (srcSize < restartsSize) {
+    return Status::Corruption(std::string(block_type_str) +
+                              " too small for restarts array");
+  }
+
+  size_t entriesSize = srcSize - restartsSize;
+  const char* entriesEnd = src + entriesSize;
+
+  // Parse entries
+  const char* p = src;
+  while (p < entriesEnd) {
+    // Parse shared_bytes varint
+    uint32_t shared;
+    const char* next = GetVarint32Ptr(p, entriesEnd, &shared);
+    if (next == nullptr) {
+      return Status::Corruption(std::string("Invalid shared_bytes varint in ") +
+                                block_type_str);
+    }
+    p = next;
+
+    // Parse unshared_bytes varint
+    uint32_t unshared;
+    next = GetVarint32Ptr(p, entriesEnd, &unshared);
+    if (next == nullptr) {
+      return Status::Corruption(
+          std::string("Invalid unshared_bytes varint in ") + block_type_str);
+    }
+    p = next;
+
+    uint32_t valueLen = 0;
+    if constexpr (!kIndexBlockV4) {
+      // For data blocks, parse value_length varint
+      next = GetVarint32Ptr(p, entriesEnd, &valueLen);
+      if (next == nullptr) {
+        return Status::Corruption(
+            std::string("Invalid value_length varint in ") + block_type_str);
+      }
+      p = next;
+    }
+
+    // Validate key delta
+    if (p + unshared > entriesEnd) {
+      return Status::Corruption(
+          std::string("Key delta exceeds end of entries in ") + block_type_str);
+    }
+    p += unshared;
+
+    if constexpr (kIndexBlockV4) {
+      // For v4 index blocks, value is self-describing (varints)
+      // Parse first varint (always present)
+      uint32_t v1;
+      next = GetVarint32Ptr(p, entriesEnd, &v1);
+      if (next == nullptr) {
+        return Status::Corruption(std::string("Invalid value varint in ") +
+                                  block_type_str);
+      }
+      p = next;
+
+      // If shared_bytes == 0, there's a second varint
+      if (shared == 0) {
+        uint32_t v2;
+        next = GetVarint32Ptr(p, entriesEnd, &v2);
+        if (next == nullptr) {
+          return Status::Corruption(
+              std::string("Invalid second value varint in ") + block_type_str);
+        }
+        p = next;
+      }
+    } else {
+      // For data blocks, validate value
+      if (p + valueLen > entriesEnd) {
+        return Status::Corruption(
+            std::string("Value exceeds end of entries in ") + block_type_str);
+      }
+      p += valueLen;
+    }
+  }
+
+  return Status::OK();
+}
+}  // anonymous namespace
+
+class DBCompressionTestMaybeParallel
+    : public DBCompressionTest,
+      public testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+  DBCompressionTestMaybeParallel()
+      : DBCompressionTest(),
+        parallel_threads_(std::get<0>(GetParam())),
+        use_dict_(std::get<1>(GetParam())) {}
+
+ protected:
+  int parallel_threads_;
+  bool use_dict_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBCompressionTest, DBCompressionTestMaybeParallel,
+                        ::testing::Combine(::testing::Values(1, 4),
+                                           ::testing::Values(false, true)));
+
+TEST_P(DBCompressionTestMaybeParallel, CompressionManagerWrapper) {
+  // Test that we can use a custom CompressionManager to wrap the built-in
+  // CompressionManager, thus adopting a custom *strategy* based on existing
+  // algorithms. This will "mark" some blocks (in their contents) as "do not
+  // compress", i.e. no attempt to compress, and some blocks as "reject
+  // compression", i.e. compression attempted but rejected because of ratio
+  // or otherwise. These cases are distinguishable for statistics that
+  // approximate "wasted effort".
+  static std::string kDoNotCompress = "do_not_compress";
+  static std::string kRejectCompression = "reject_compression";
+
+  static RelaxedAtomic<int> dataCheckedCount{0};
+  static RelaxedAtomic<int> indexCheckedCount{0};
+  static RelaxedAtomic<int> compressCalledCount{0};
+
+  // We also have wrappers here to help verify that when RocksDB asks to
+  // specialize the Compressor for a particular kind of block, it only passes in
+  // that kind of block to ensure proper grouping of related data for
+  // compression. We check this by parsing the subtly distinct schemas of data
+  // blocks vs. v4+ index blocks. This also ensures that structure-aware
+  // compressions like OpenZL can parse the data block and index block formats.
+  struct CheckDataBlockCompressorWrapper : public CompressorWrapper {
+    using CompressorWrapper::CompressorWrapper;
+    const char* Name() const override { return "CheckDataBlockCompressor"; }
+
+    std::unique_ptr<Compressor> Clone() const override {
+      return std::make_unique<CheckDataBlockCompressorWrapper>(
+          wrapped_->Clone());
+    }
+
+    Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                         size_t* compressed_output_size,
+                         CompressionType* out_compression_type,
+                         ManagedWorkingArea* working_area) override {
+      dataCheckedCount.FetchAddRelaxed(1);
+      // Parse and validate data block format before compressing
+      Status s = ValidateRocksBlock</*kIndexBlockV4=*/false>(uncompressed_data);
+      if (!s.ok()) {
+        return s;
+      }
+      // Delegate to wrapped compressor on success
+      return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                     compressed_output_size,
+                                     out_compression_type, working_area);
+    }
+  };
+
+  struct CheckIndexBlockCompressorWrapper : public CompressorWrapper {
+    using CompressorWrapper::CompressorWrapper;
+    const char* Name() const override { return "CheckIndexBlockCompressor"; }
+
+    std::unique_ptr<Compressor> Clone() const override {
+      return std::make_unique<CheckIndexBlockCompressorWrapper>(
+          wrapped_->Clone());
+    }
+
+    Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                         size_t* compressed_output_size,
+                         CompressionType* out_compression_type,
+                         ManagedWorkingArea* working_area) override {
+      indexCheckedCount.FetchAddRelaxed(1);
+      // Parse and validate index block v4 format before compressing
+      Status s = ValidateRocksBlock</*kIndexBlockV4=*/true>(uncompressed_data);
+      if (!s.ok()) {
+        return s;
+      }
+      // Delegate to wrapped compressor on success
+      return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                     compressed_output_size,
+                                     out_compression_type, working_area);
+    }
+  };
+
+  struct MyCompressor : public CompressorWrapper {
+    using CompressorWrapper::CompressorWrapper;
+    const char* Name() const override { return "MyCompressor"; }
+
+    std::unique_ptr<Compressor> Clone() const override {
+      return std::make_unique<MyCompressor>(wrapped_->Clone());
+    }
+
+    Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                         size_t* compressed_output_size,
+                         CompressionType* out_compression_type,
+                         ManagedWorkingArea* working_area) override {
+      compressCalledCount.FetchAddRelaxed(1);
+      auto begin = uncompressed_data.data();
+      auto end = uncompressed_data.data() + uncompressed_data.size();
+      if (std::search(begin, end, kDoNotCompress.begin(),
+                      kDoNotCompress.end()) != end) {
+        // Do not attempt compression
+        *compressed_output_size = 0;
+        EXPECT_EQ(*out_compression_type, kNoCompression);
+        return Status::OK();
+      } else if (std::search(begin, end, kRejectCompression.begin(),
+                             kRejectCompression.end()) != end) {
+        // Simulate attempted & rejected compression
+        *compressed_output_size = 1;
+        EXPECT_EQ(*out_compression_type, kNoCompression);
+        return Status::OK();
+      } else {
+        return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                       compressed_output_size,
+                                       out_compression_type, working_area);
+      }
+    }
+
+    // Also check WorkingArea handling
+    struct MyWorkingArea : public WorkingArea {
+      explicit MyWorkingArea(ManagedWorkingArea&& wrapped)
+          : wrapped_(std::move(wrapped)) {}
+      ManagedWorkingArea wrapped_;
+    };
+
+    ManagedWorkingArea ObtainWorkingArea() override {
+      ManagedWorkingArea rv{
+          new MyWorkingArea{CompressorWrapper::ObtainWorkingArea()}, this};
+      if (GetPreferredCompressionType() == kZSTD) {
+        // ZSTD should always use WorkingArea, so this is our chance to ensure
+        // CompressorWrapper::ObtainWorkingArea() is properly connected
+        assert(rv.get() != nullptr);
+      }
+      return rv;
+    }
+
+    void ReleaseWorkingArea(WorkingArea* wa) override {
+      delete static_cast<MyWorkingArea*>(wa);
+    }
+
+    std::unique_ptr<Compressor> MaybeCloneSpecialized(
+        CacheEntryRole block_type,
+        DictConfigArgs&& dict_config) const override {
+      std::unique_ptr<Compressor> result = std::make_unique<MyCompressor>(
+          wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_config)));
+      if (block_type == CacheEntryRole::kDataBlock) {
+        result = std::make_unique<CheckDataBlockCompressorWrapper>(
+            std::move(result));
+      } else if (block_type == CacheEntryRole::kIndexBlock) {
+        result = std::make_unique<CheckIndexBlockCompressorWrapper>(
+            std::move(result));
+      }
+      return result;
+    }
+  };
+  struct MyManager : public CompressionManagerWrapper {
+    using CompressionManagerWrapper::CompressionManagerWrapper;
+    const char* Name() const override { return "MyManager"; }
+    std::unique_ptr<Compressor> GetCompressorForSST(
+        const FilterBuildingContext& context, const CompressionOptions& opts,
+        CompressionType preferred) override {
+      return std::make_unique<MyCompressor>(
+          wrapped_->GetCompressorForSST(context, opts, preferred));
+    }
+  };
+  auto mgr = std::make_shared<MyManager>(GetBuiltinV2CompressionManager());
+
+  for (CompressionType type : GetSupportedCompressions()) {
+    for (bool use_wrapper : {false, true}) {
+      if (type == kNoCompression) {
+        continue;
+      }
+      SCOPED_TRACE("Compression type: " + std::to_string(type) +
+                   (use_wrapper ? " with " : " no ") + "wrapper");
+
+      Options options = CurrentOptions();
+      options.compression = type;
+      options.compression_opts.parallel_threads = parallel_threads_;
+      options.compression_opts.max_dict_bytes = use_dict_ ? 4096 : 0;
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+      BlockBasedTableOptions bbto;
+      bbto.enable_index_compression = true;
+      bbto.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+      bbto.partition_filters = true;
+      bbto.filter_policy.reset(NewBloomFilterPolicy(5));
+      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+      options.compression_manager = use_wrapper ? mgr : nullptr;
+      DestroyAndReopen(options);
+
+      auto PopStat = [&](Tickers t) -> uint64_t {
+        return options.statistics->getAndResetTickerCount(t);
+      };
+
+      Random rnd(301);
+      constexpr int kCount = 13;
+
+      // Highly compressible blocks, except 1 non-compressible. Half of the
+      // compressible are morked for bypass and 1 marked for rejection. Values
+      // are large enough to ensure just 1 k-v per block.
+      for (int i = 0; i < kCount; ++i) {
+        std::string value;
+        if (i == 6) {
+          // One non-compressible block
+          value = rnd.RandomBinaryString(20000);
+        } else {
+          test::CompressibleString(&rnd, 0.1, 20000, &value);
+          if ((i % 2) == 0) {
+            // Half for bypass
+            value += kDoNotCompress;
+          } else if (i == 7) {
+            // One for rejection
+            value += kRejectCompression;
+          }
+        }
+        ASSERT_OK(Put(Key(i), value));
+      }
+      ASSERT_OK(Flush());
+
+      // Index partition is compressed
+      constexpr int kIdxComp = 1;
+      // Top level index block is rejected for compression
+      constexpr int kIdxRej = 1;
+
+      if (use_dict_) {
+        // FIXME: why don't the stats match? (for now, checking for crashes)
+      } else if (use_wrapper) {
+        EXPECT_EQ(kCount / 2 - 1 + kIdxComp, PopStat(NUMBER_BLOCK_COMPRESSED));
+        EXPECT_EQ(kCount / 2, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
+        EXPECT_EQ(1 + 1 + kIdxRej, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
+      } else {
+        EXPECT_EQ(kCount - 1 + kIdxComp, PopStat(NUMBER_BLOCK_COMPRESSED));
+        EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
+        EXPECT_EQ(1 + kIdxRej, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
+      }
+
+      // Ensure well-formed for reads
+      for (int i = 0; i < kCount; ++i) {
+        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+      }
+      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+
+      // Ensure expected checks were performed
+      EXPECT_EQ(indexCheckedCount.ExchangeRelaxed(0),
+                use_wrapper ? kIdxComp + kIdxRej : 0);
+      EXPECT_EQ(dataCheckedCount.ExchangeRelaxed(0), use_wrapper ? kCount : 0);
+      // And every use of MyCompressor went through either the data block
+      // checker or index block checker
+      EXPECT_EQ(compressCalledCount.ExchangeRelaxed(0),
+                use_wrapper ? kIdxComp + kIdxRej + kCount : 0);
+    }
+  }
+}
+
+namespace {
+std::string UniqueName(const std::string& base) {
+  static RelaxedAtomic<int> counter{0};
+  return base + std::to_string(counter.FetchAddRelaxed(1));
+}
+}  // anonymous namespace
+
+TEST_P(DBCompressionTestMaybeParallel, CompressionManagerCustomCompression) {
+  // Test that we can use a custom CompressionManager to implement custom
+  // compression algorithms, and that there are appropriate schema guard rails
+  // to ensure data is not processed by the wrong algorithm.
+  using Compressor8A = test::CompressorCustomAlg<kCustomCompression8A>;
+  using Compressor8B = test::CompressorCustomAlg<kCustomCompression8B>;
+  using Compressor8C = test::CompressorCustomAlg<kCustomCompression8C>;
+
+  if (!Compressor8A::Supported() || !LZ4_Supported()) {
+    fprintf(stderr,
+            "Prerequisite compression library not supported. Skipping\n");
+    return;
+  }
+
+  class MyManager : public CompressionManager {
+   public:
+    explicit MyManager(const std::string& compat_name)
+        : compat_name_(compat_name), name_("MyManager:" + compat_name_) {}
+    const char* Name() const override { return name_.c_str(); }
+    const char* CompatibilityName() const override {
+      return compat_name_.c_str();
+    }
+
+    bool SupportsCompressionType(CompressionType type) const override {
+      return type == kCustomCompression8A || type == kCustomCompression8B ||
+             type == kCustomCompression8C ||
+             GetBuiltinV2CompressionManager()->SupportsCompressionType(type);
+    }
+
+    int used_compressor8A_count_ = 0;
+    int used_compressor8B_count_ = 0;
+    int used_compressor8C_count_ = 0;
+
+    std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
+                                              CompressionType type) override {
+      switch (static_cast<unsigned char>(type)) {
+        case kCustomCompression8A:
+          used_compressor8A_count_++;
+          return std::make_unique<Compressor8A>();
+        case kCustomCompression8B:
+          used_compressor8B_count_++;
+          return std::make_unique<Compressor8B>();
+        case kCustomCompression8C:
+          used_compressor8C_count_++;
+          return std::make_unique<Compressor8C>();
+        // Also support built-in compression algorithms
+        default:
+          return GetBuiltinV2CompressionManager()->GetCompressor(opts, type);
+      }
+    }
+
+    std::shared_ptr<Decompressor> GetDecompressor() override {
+      return std::make_shared<test::DecompressorCustomAlg>();
+    }
+
+    RelaxedAtomic<CompressionType> last_specific_decompressor_type_{
+        kNoCompression};
+
+    std::shared_ptr<Decompressor> GetDecompressorForTypes(
+        const CompressionType* types_begin,
+        const CompressionType* types_end) override {
+      assert(types_end > types_begin);
+      last_specific_decompressor_type_.StoreRelaxed(*types_begin);
+      auto decomp = std::make_shared<test::DecompressorCustomAlg>();
+      decomp->SetAllowedTypes(types_begin, types_end);
+      return decomp;
+    }
+
+    void AddFriend(const std::shared_ptr<CompressionManager>& mgr) {
+      friends_[mgr->CompatibilityName()] = mgr;
+    }
+    std::shared_ptr<CompressionManager> FindCompatibleCompressionManager(
+        Slice compatibility_name) override {
+      std::shared_ptr<CompressionManager> rv =
+          CompressionManager::FindCompatibleCompressionManager(
+              compatibility_name);
+      if (!rv) {
+        auto it = friends_.find(compatibility_name.ToString());
+        if (it != friends_.end()) {
+          return it->second.lock();
+        }
+      }
+      return rv;
+    }
+
+   private:
+    std::string compat_name_;
+    std::string name_;
+    // weak_ptr to avoid cycles
+    std::map<std::string, std::weak_ptr<CompressionManager>> friends_;
+  };
+
+  // Although these compression managers are actually compatible, we must
+  // respect their distinct compatibility names and treat them as incompatible
+  // (or else risk processing data incorrectly)
+  // NOTE: these are not registered in ObjectRegistry to test what happens
+  // when the original CompressionManager might not be available, but
+  // mgr_bar will be registered during the test, with different names to
+  // prevent interference between iterations.
+  auto mgr_foo = std::make_shared<MyManager>("Foo");
+  auto mgr_bar = std::make_shared<MyManager>(UniqueName("Bar"));
+
+  // And this one claims to be fully compatible with the built-in compression
+  // manager when it's not fully compatible (for custom CompressionTypes)
+  auto mgr_claim_compatible = std::make_shared<MyManager>("BuiltinV2");
+
+  constexpr uint16_t kValueSize = 10000;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 20;
+  BlockBasedTableOptions bbto;
+  bbto.enable_index_compression = false;
+  bbto.format_version = 6;  // Before custom compression alg support
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  // Claims not to use custom compression (and doesn't unless setting a custom
+  // CompressionType)
+  options.compression_manager = mgr_claim_compatible;
+  // Use a built-in compression type with dictionary support
+  options.compression = kLZ4Compression;
+  options.compression_opts.max_dict_bytes = use_dict_ ? kValueSize / 2 : 0;
+  options.compression_opts.parallel_threads = parallel_threads_;
+  DestroyAndReopen(options);
+
+  Random rnd(404);
+  std::string value;
+  ASSERT_OK(Put("a", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+
+  // That data should be readable without access to the original compression
+  // manager, because it used the built-in CompatibilityName and a built-in
+  // CompressionType
+  options.compression_manager = nullptr;
+  Reopen(options);
+  ASSERT_EQ(Get("a"), value);
+
+  // Verify it was compressed
+  Range r = {"a", "a0"};
+  TablePropertiesCollection tables_properties;
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "LZ4");
+
+  // Disallow setting a custom CompressionType with a CompressionManager
+  // claiming to be built-in compatible.
+  options.compression_manager = mgr_claim_compatible;
+  options.compression = kCustomCompression8A;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+  options.compression_manager = nullptr;
+  options.compression = kCustomCompressionFE;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+  options.compression =
+      static_cast<CompressionType>(kLastBuiltinCompression + 1);
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+  // Custom compression schema (different CompatibilityName) not supported
+  // before format_version=7
+  options.compression_manager = mgr_foo;
+  options.compression = kLZ4Compression;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+  // Set format version supporting custom compression
+  bbto.format_version = 7;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  // Custom compression type not supported with built-in schema name, even
+  // with format_version=7
+  options.compression_manager = mgr_claim_compatible;
+  options.compression = kCustomCompression8B;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+  // Custom compression schema, but specifying a custom compression type it
+  // doesn't support.
+  options.compression_manager = mgr_foo;
+  options.compression = kCustomCompressionF0;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+  // Using a built-in compression type with fv=7 but named custom schema
+  options.compression = kLZ4Compression;
+  Reopen(options);
+  ASSERT_OK(Put("b", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  ASSERT_EQ(Get("b"), value);
+
+  // Verify it was compressed with LZ4
+  r = {"b", "b0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  // Uses new format for "compression_name" property
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+            kLZ4Compression);
+
+  // Custom compression type
+  options.compression = kCustomCompression8A;
+  Reopen(options);
+  ASSERT_OK(Put("c", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  EXPECT_EQ(mgr_foo->used_compressor8A_count_, 0);
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+  ASSERT_EQ(Get("c"), value);
+  EXPECT_EQ(mgr_foo->used_compressor8A_count_, 1);
+
+  // Verify it was compressed with custom format
+  r = {"c", "c0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8A;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+            kCustomCompression8A);
+
+  // Also dynamically changeable, because the compression manager will respect
+  // the current setting as reported under the legacy logic
+  ASSERT_OK(dbfull()->SetOptions({{"compression", "kLZ4Compression"}}));
+  ASSERT_OK(Put("d", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+  ASSERT_EQ(Get("d"), value);
+
+  // Verify it was compressed with LZ4
+  r = {"d", "d0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+            kLZ4Compression);
+
+  // Dynamically changeable to custom compressions also
+  ASSERT_OK(dbfull()->SetOptions({{"compression", "kCustomCompression8B"}}));
+  ASSERT_OK(Put("e", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+  ASSERT_EQ(Get("e"), value);
+
+  // Verify it was compressed with custom format
+  r = {"e", "e0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8B;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+            kCustomCompression8B);
+
+  // Fails to re-open with incompatible compression manager (can't find
+  // compression manager Foo because it's not registered nor known by Bar)
+  options.compression_manager = mgr_bar;
+  options.compression = kLZ4Compression;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+  // But should re-open if we make Bar aware of the Foo compression manager
+  mgr_bar->AddFriend(mgr_foo);
+  Reopen(options);
+
+  // Can still read everything
+  ASSERT_EQ(Get("a").size(), kValueSize);
+  ASSERT_EQ(Get("b").size(), kValueSize);
+  ASSERT_EQ(Get("c").size(), kValueSize);
+  ASSERT_EQ(Get("d").size(), kValueSize);
+  ASSERT_EQ(Get("e").size(), kValueSize);
+
+  // Add a file using mgr_bar
+  ASSERT_OK(Put("f", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 6);
+  ASSERT_EQ(Get("f"), value);
+
+  // Verify it was compressed appropriately
+  r = {"f", "f0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(mgr_bar->last_specific_decompressor_type_.LoadRelaxed(),
+            kLZ4Compression);
+
+  // Fails to re-open with incompatible compression manager (can't find
+  // compression manager Bar because it's not registered nor known by Foo)
+  options.compression_manager = mgr_foo;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+  // Register and re-open
+  auto& library = *ObjectLibrary::Default();
+  library.AddFactory<CompressionManager>(
+      mgr_bar->CompatibilityName(),
+      [mgr_bar](const std::string& /*uri*/,
+                std::unique_ptr<CompressionManager>* guard,
+                std::string* /*errmsg*/) {
+        *guard = std::make_unique<MyManager>(mgr_bar->CompatibilityName());
+        return guard->get();
+      });
+  Reopen(options);
+
+  // Can still read everything
+  ASSERT_EQ(Get("a").size(), kValueSize);
+  ASSERT_EQ(Get("b").size(), kValueSize);
+  ASSERT_EQ(Get("c").size(), kValueSize);
+  ASSERT_EQ(Get("d").size(), kValueSize);
+  ASSERT_EQ(Get("e").size(), kValueSize);
+  ASSERT_EQ(Get("f").size(), kValueSize);
+
+  // TODO: test old version of a compression manager unable to read a
+  // compression type
+}
+
+TEST_F(DBCompressionTest, FailWhenCompressionNotSupportedTest) {
+  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+                                    kLZ4Compression, kLZ4HCCompression,
+                                    kXpressCompression};
+  for (auto comp : compressions) {
+    if (!CompressionTypeSupported(comp)) {
+      // not supported, we should fail the Open()
+      Options options = CurrentOptions();
+      options.compression = comp;
+      ASSERT_TRUE(!TryReopen(options).ok());
+      // Try if CreateColumnFamily also fails
+      options.compression = kNoCompression;
+      ASSERT_OK(TryReopen(options));
+      ColumnFamilyOptions cf_options(options);
+      cf_options.compression = comp;
+      ColumnFamilyHandle* handle;
+      ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
+    }
+  }
+}
+
+class AutoSkipTestFlushBlockPolicy : public FlushBlockPolicy {
+ public:
+  explicit AutoSkipTestFlushBlockPolicy(const int window,
+                                        const BlockBuilder& data_block_builder,
+                                        std::shared_ptr<Statistics> statistics)
+      : window_(window),
+        num_keys_(0),
+        data_block_builder_(data_block_builder),
+        statistics_(statistics) {}
+
+  bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+    auto nth_window = num_keys_ / window_;
+    if (data_block_builder_.empty()) {
+      // First key in this block
+      return false;
+    }
+    // Check every window
+    if (num_keys_ % window_ == 0) {
+      auto set_exploration = [&](void* arg) {
+        bool* exploration = static_cast<bool*>(arg);
+        *exploration = true;
+      };
+      auto unset_exploration = [&](void* arg) {
+        bool* exploration = static_cast<bool*>(arg);
+        *exploration = false;
+      };
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+      // We force exploration to set the predicted rejection ratio for odd
+      // window and then test that the prediction is exploited in the even
+      // window
+      if (nth_window % 2 == 0) {
+        SyncPoint::GetInstance()->SetCallBack(
+            "AutoSkipCompressorWrapper::CompressBlock::exploitOrExplore",
+            set_exploration);
+      } else {
+        SyncPoint::GetInstance()->SetCallBack(
+            "AutoSkipCompressorWrapper::CompressBlock::exploitOrExplore",
+            unset_exploration);
+      }
+      SyncPoint::GetInstance()->EnableProcessing();
+
+      auto compressed_count = PopStat(NUMBER_BLOCK_COMPRESSED);
+      auto bypassed_count = PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED);
+      auto rejected_count = PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED);
+      auto total = compressed_count + rejected_count + bypassed_count;
+      int rejection_percentage, bypassed_percentage, compressed_percentage;
+      if (total != 0) {
+        rejection_percentage = static_cast<int>(rejected_count * 100 / total);
+        bypassed_percentage = static_cast<int>(bypassed_count * 100 / total);
+        compressed_percentage =
+            static_cast<int>(compressed_count * 100 / total);
+        // use nth window to detect test cases and set the expected
+        switch (nth_window) {
+          case 1:
+            // In first window we only explore and thus here we verify that the
+            // correct prediction has been made by the end of the window
+            // Since 6 of 10 blocks are compression unfriendly, the predicted
+            // rejection ratio should be 60%
+            EXPECT_EQ(rejection_percentage, 60);
+            EXPECT_EQ(bypassed_percentage, 0);
+            EXPECT_EQ(compressed_percentage, 40);
+            break;
+          case 2:
+            // With the rejection ratio set to 0.6 all the blocks should be
+            // bypassed in next window
+            EXPECT_EQ(rejection_percentage, 0);
+            EXPECT_EQ(bypassed_percentage, 100);
+            EXPECT_EQ(compressed_percentage, 0);
+            break;
+          case 3:
+            // In third window we only explore and verify that the correct
+            // prediction has been made by the end of the window
+            // since 4 of 10 blocks are compression ufriendly, the predicted
+            // rejection ratio should be 40%
+            EXPECT_EQ(rejection_percentage, 40);
+            EXPECT_EQ(bypassed_percentage, 0);
+            EXPECT_EQ(compressed_percentage, 60);
+            break;
+          case 4:
+            // With the rejection ratio set to 0.4 all the blocks should be
+            // attempted to be compressed
+            // 6 of 10 blocks are compression unfriendly and thus should be
+            // rejected 4 of 10 blocks are compression friendly and thus should
+            // be compressed
+            EXPECT_EQ(rejection_percentage, 60);
+            EXPECT_EQ(bypassed_percentage, 0);
+            EXPECT_EQ(compressed_percentage, 40);
+        }
+      }
+    }
+    num_keys_++;
+    return true;
+  }
+  uint64_t PopStat(Tickers t) { return statistics_->getAndResetTickerCount(t); }
+
+ private:
+  int window_;
+  int num_keys_;
+  const BlockBuilder& data_block_builder_;
+  std::shared_ptr<Statistics> statistics_;
+};
+
+class AutoSkipTestFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  explicit AutoSkipTestFlushBlockPolicyFactory(
+      const int window, std::shared_ptr<Statistics> statistics)
+      : window_(window), statistics_(statistics) {}
+
+  virtual const char* Name() const override {
+    return "AutoSkipTestFlushBlockPolicyFactory";
+  }
+
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& /*table_options*/,
+      const BlockBuilder& data_block_builder) const override {
+    (void)data_block_builder;
+    return new AutoSkipTestFlushBlockPolicy(window_, data_block_builder,
+                                            statistics_);
+  }
+
+ private:
+  int window_;
+  std::shared_ptr<Statistics> statistics_;
+};
+
+class DBAutoSkip : public DBTestBase {
+ public:
+  Options options;
+  Random rnd_;
+  int key_index_;
+  DBAutoSkip()
+      : DBTestBase("db_auto_skip", /*env_do_fsync=*/true),
+        options(CurrentOptions()),
+        rnd_(231),
+        key_index_(0) {
+    options.compression_manager = CreateAutoSkipCompressionManager();
+    auto statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.statistics = statistics;
+    options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+    BlockBasedTableOptions bbto;
+    bbto.enable_index_compression = false;
+    bbto.flush_block_policy_factory.reset(
+        new AutoSkipTestFlushBlockPolicyFactory(10, statistics));
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  }
+
+  bool CompressionFriendlyPut(const int no_of_kvs, const int size_of_value) {
+    auto value = std::string(size_of_value, 'A');
+    for (int i = 0; i < no_of_kvs; ++i) {
+      auto status = Put(Key(key_index_), value);
+      EXPECT_EQ(status.ok(), true);
+      key_index_++;
+    }
+    return true;
+  }
+  bool CompressionUnfriendlyPut(const int no_of_kvs, const int size_of_value) {
+    auto value = rnd_.RandomBinaryString(size_of_value);
+    for (int i = 0; i < no_of_kvs; ++i) {
+      auto status = Put(Key(key_index_), value);
+      EXPECT_EQ(status.ok(), true);
+      key_index_++;
+    }
+    return true;
+  }
+};
+
+TEST_F(DBAutoSkip, AutoSkipCompressionManager) {
+  for (uint32_t max_dict_bytes : {0, 10000}) {
+    for (auto type : GetSupportedCompressions()) {
+      if (type == kNoCompression) {
+        continue;
+      }
+      options.compression = type;
+      options.bottommost_compression = type;
+      options.compression_opts.max_dict_bytes = max_dict_bytes;
+      DestroyAndReopen(options);
+      const int kValueSize = 20000;
+      // This will set the rejection ratio to 60%
+      CompressionUnfriendlyPut(6, kValueSize);
+      CompressionFriendlyPut(4, kValueSize);
+      // This will verify all the data block compressions are bypassed based on
+      // previous prediction
+      CompressionUnfriendlyPut(6, kValueSize);
+      CompressionFriendlyPut(4, kValueSize);
+      // This will set the rejection ratio to 40%
+      CompressionUnfriendlyPut(4, kValueSize);
+      CompressionFriendlyPut(6, kValueSize);
+      // This will verify all the data block compression are attempted based on
+      // previous prediction
+      // Compression will be rejected for 6 compression unfriendly blocks
+      // Compression will be accepted for 4 compression friendly blocks
+      CompressionUnfriendlyPut(6, kValueSize);
+      CompressionFriendlyPut(4, kValueSize);
+      // Extra block write to ensure that the all above cases are checked
+      CompressionFriendlyPut(6, kValueSize);
+      CompressionFriendlyPut(4, kValueSize);
+      ASSERT_OK(Flush());
+    }
+  }
+}
+class CostAwareTestFlushBlockPolicy : public FlushBlockPolicy {
+ public:
+  explicit CostAwareTestFlushBlockPolicy(const int window,
+                                         const BlockBuilder& data_block_builder)
+      : window_(window),
+        num_keys_(0),
+        data_block_builder_(data_block_builder) {}
+
+  bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+    auto nth_window = num_keys_ / window_;
+    if (data_block_builder_.empty()) {
+      // First key in this block
+      return false;
+    }
+    // Check every window
+    if (num_keys_ % window_ == 0) {
+      auto get_predictor = [&](void* arg) {
+        // gets the predictor and sets the mocked cpu and io cost
+        predictor_ = static_cast<IOCPUCostPredictor*>(arg);
+        predictor_->CPUPredictor.SetPrediction(1000);
+        predictor_->IOPredictor.SetPrediction(100);
+      };
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+
+      // Add syncpoint to get the cpu and io cost
+      SyncPoint::GetInstance()->SetCallBack(
+          "CostAwareCompressor::CompressBlockAndRecord::"
+          "GetPredictor",
+          get_predictor);
+      SyncPoint::GetInstance()->EnableProcessing();
+      // use nth window to detect test cases and set the expected
+      switch (nth_window) {
+        case 0:
+          break;
+        case 1:
+          // Verify that the Mocked cpu cost and io cost are predicted correctly
+          auto predicted_cpu_time = predictor_->CPUPredictor.Predict();
+          auto predicted_io_bytes = predictor_->IOPredictor.Predict();
+          EXPECT_EQ(predicted_io_bytes, 100);
+          EXPECT_EQ(predicted_cpu_time, 1000);
+          break;
+      }
+    }
+    num_keys_++;
+    return true;
+  }
+
+ private:
+  int window_;
+  int num_keys_;
+  const BlockBuilder& data_block_builder_;
+  IOCPUCostPredictor* predictor_;
+};
+class CostAwareTestFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  explicit CostAwareTestFlushBlockPolicyFactory(const int window)
+      : window_(window) {}
+
+  virtual const char* Name() const override {
+    return "CostAwareTestFlushBlockPolicyFactory";
+  }
+
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& /*table_options*/,
+      const BlockBuilder& data_block_builder) const override {
+    (void)data_block_builder;
+    return new CostAwareTestFlushBlockPolicy(window_, data_block_builder);
+  }
+
+ private:
+  int window_;
+};
+class DBCompressionCostPredictor : public DBTestBase {
+ public:
+  Options options;
+  DBCompressionCostPredictor()
+      : DBTestBase("db_cpuio_skip", /*env_do_fsync=*/true),
+        options(CurrentOptions()) {
+    options.compression_manager = CreateCostAwareCompressionManager();
+    auto statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.statistics = statistics;
+    options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+    BlockBasedTableOptions bbto;
+    bbto.enable_index_compression = false;
+    bbto.flush_block_policy_factory.reset(
+        new CostAwareTestFlushBlockPolicyFactory(10));
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+  }
+};
+TEST_F(DBCompressionCostPredictor, CostAwareCompressorManager) {
+  // making sure that the compression is supported
+  if (!ZSTD_Supported()) {
+    return;
+  }
+  const int kValueSize = 20000;
+  int next_key = 0;
+  Random rnd(231);
+  auto value = rnd.RandomBinaryString(kValueSize);
+  int window_size = 10;
+  auto WindowWrite = [&]() {
+    for (auto i = 0; i < window_size; ++i) {
+      auto status = Put(Key(next_key), value);
+      EXPECT_OK(status);
+      next_key++;
+    }
+  };
+  // This denotes the first window
+  // Mocked to have specific cpu utilization and io cost
+  WindowWrite();
+  // check the predictor is predicting the correct cpu and io cost
+  WindowWrite();
+  ASSERT_OK(Flush());
+}
+
+// Test pre-defined dictionary compression with a custom CompressionManager
+TEST_F(DBCompressionTest, PreDefinedDictionaryCompression) {
+  if (!ZSTD_Supported()) {
+    ROCKSDB_GTEST_BYPASS("ZSTD compression not supported");
+    return;
+  }
+
+  // A custom compressor that returns a pre-defined dictionary
+  class PreDefinedDictCompressor : public CompressorWrapper {
+   public:
+    explicit PreDefinedDictCompressor(std::unique_ptr<Compressor> wrapped,
+                                      std::string dict_data)
+        : CompressorWrapper(std::move(wrapped)),
+          predefined_dict_(std::move(dict_data)) {}
+
+    const char* Name() const override { return "PreDefinedDictCompressor"; }
+
+    DictConfig GetDictGuidance(CacheEntryRole block_type) const override {
+      if (block_type == CacheEntryRole::kDataBlock &&
+          !predefined_dict_.empty()) {
+        return DictPreDefined{/*copy*/ predefined_dict_};
+      }
+      return DictDisabled{};
+    }
+
+    std::unique_ptr<Compressor> Clone() const override {
+      return std::make_unique<PreDefinedDictCompressor>(wrapped_->Clone(),
+                                                        predefined_dict_);
+    }
+
+    std::unique_ptr<Compressor> MaybeCloneSpecialized(
+        CacheEntryRole block_type,
+        DictConfigArgs&& dict_config) const override {
+      // Delegate to wrapped compressor for dictionary handling
+      auto specialized =
+          wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_config));
+      if (specialized) {
+        return specialized;
+      }
+      return nullptr;
+    }
+
+   private:
+    std::string predefined_dict_;
+  };
+
+  // Custom CompatibilityName so the builtin compression manager won't be used
+  static const char* kTestCompatibilityName = "PreDefinedDictTest";
+
+  class PreDefinedDictManager : public CompressionManagerWrapper {
+   public:
+    explicit PreDefinedDictManager(std::shared_ptr<CompressionManager> wrapped,
+                                   std::string dict_data)
+        : CompressionManagerWrapper(std::move(wrapped)),
+          predefined_dict_(std::move(dict_data)) {}
+
+    const char* Name() const override { return "PreDefinedDictManager"; }
+
+    const char* CompatibilityName() const override {
+      return kTestCompatibilityName;
+    }
+
+    std::unique_ptr<Compressor> GetCompressorForSST(
+        const FilterBuildingContext& context, const CompressionOptions& opts,
+        CompressionType preferred) override {
+      auto base = wrapped_->GetCompressorForSST(context, opts, preferred);
+      if (base) {
+        return std::make_unique<PreDefinedDictCompressor>(std::move(base),
+                                                          predefined_dict_);
+      }
+      return nullptr;
+    }
+
+   private:
+    std::string predefined_dict_;
+  };
+
+  // A broken manager that ignores the dictionary when decompressing.
+  // This simulates a buggy decompressor that doesn't properly apply the
+  // dictionary, causing ZSTD to produce wrong output when decompressing
+  // dictionary-compressed data.
+  class BrokenDictManager : public CompressionManagerWrapper {
+   public:
+    explicit BrokenDictManager(std::shared_ptr<CompressionManager> wrapped)
+        : CompressionManagerWrapper(std::move(wrapped)) {}
+
+    const char* Name() const override { return "BrokenDictManager"; }
+
+    const char* CompatibilityName() const override {
+      return kTestCompatibilityName;
+    }
+
+    std::shared_ptr<Decompressor> GetDecompressor() override {
+      return std::make_shared<IgnoreDictDecompressor>(
+          wrapped_->GetDecompressor());
+    }
+
+    std::shared_ptr<Decompressor> GetDecompressorOptimizeFor(
+        CompressionType optimize_for_type) override {
+      return std::make_shared<IgnoreDictDecompressor>(
+          wrapped_->GetDecompressorOptimizeFor(optimize_for_type));
+    }
+
+    std::shared_ptr<Decompressor> GetDecompressorForTypes(
+        const CompressionType* types_begin,
+        const CompressionType* types_end) override {
+      return std::make_shared<IgnoreDictDecompressor>(
+          wrapped_->GetDecompressorForTypes(types_begin, types_end));
+    }
+
+   private:
+    // A decompressor that stores the dictionary (for GetSerializedDict) but
+    // ignores it during decompression, causing ZSTD to produce garbage
+    class IgnoreDictDecompressor : public DecompressorWrapper {
+     public:
+      explicit IgnoreDictDecompressor(std::shared_ptr<Decompressor> wrapped)
+          : DecompressorWrapper(std::move(wrapped)) {}
+
+      IgnoreDictDecompressor(std::shared_ptr<Decompressor> wrapped,
+                             std::string dict)
+          : DecompressorWrapper(std::move(wrapped)),
+            dict_(std::move(dict)),
+            dict_slice_(dict_) {}
+
+      const char* Name() const override { return "IgnoreDictDecompressor"; }
+
+      const Slice& GetSerializedDict() const override { return dict_slice_; }
+
+      Status MaybeCloneForDict(const Slice& serialized_dict,
+                               std::unique_ptr<Decompressor>* out) override {
+        // Store the dict but don't actually use it for decompression
+        *out = std::make_unique<IgnoreDictDecompressor>(
+            wrapped_,
+            std::string(serialized_dict.data(), serialized_dict.size()));
+        return Status::OK();
+      }
+
+     private:
+      std::string dict_;
+      Slice dict_slice_;
+    };
+  };
+
+  // Create a dictionary that will be heavily referenced. The key insight is
+  // that ZSTD dictionary compression works by finding matches between the input
+  // data and the dictionary content. To force ZSTD to create dictionary
+  // references, we need to use data that contains exact copies of dictionary
+  // content.
+  Random rnd(42);
+
+  // Create a dictionary with recognizable patterns
+  std::string predefined_dict;
+  std::vector<std::string> dict_patterns;
+  for (int i = 0; i < 50; i++) {
+    std::string pattern = rnd.RandomString(200);
+    dict_patterns.push_back(pattern);
+    predefined_dict += pattern;
+  }
+  // Total dict size: 50 * 200 = 10000 bytes
+  size_t kDictSize = predefined_dict.size();
+
+  auto mgr = std::make_shared<PreDefinedDictManager>(
+      GetBuiltinV2CompressionManager(), predefined_dict);
+
+  Options options = CurrentOptions();
+  options.compression = kZSTD;
+  options.compression_opts.max_dict_bytes = static_cast<int>(kDictSize);
+  options.compression_manager = mgr;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.enable_index_compression = true;
+  // Need format_version >= 7 for custom CompatibilityName
+  bbto.format_version = 7;
+  // Need dictionary block load statistics
+  bbto.block_cache = NewLRUCache(1 << 20);
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Write data that uses the same patterns from the dictionary.
+  // This forces ZSTD to create back-references to the dictionary.
+  std::vector<std::string> expected_values;
+  for (int i = 0; i < 100; i++) {
+    std::string value;
+    // Compose value from random dictionary patterns - same content as dict
+    for (int j = 0; j < 5; j++) {
+      value +=
+          dict_patterns[rnd.Uniform(static_cast<int>(dict_patterns.size()))];
+    }
+    expected_values.push_back(value);
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+
+  // Verify dictionary was used by checking that dict bytes were inserted
+  ASSERT_GE(
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+      predefined_dict.size());
+
+  // Read back data and verify correctness
+  for (int i = 0; i < 100; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    ASSERT_EQ(value, expected_values[i]);
+  }
+
+  // Now re-open with a broken decompressor that ignores dictionary.
+  // This should result in corruption on read because ZSTD will fail to
+  // decompress data that references the missing dictionary content.
+  Close();
+  auto broken_mgr =
+      std::make_shared<BrokenDictManager>(GetBuiltinV2CompressionManager());
+  options.compression_manager = broken_mgr;
+  // New block cache to ensure dictionary is re-loaded, because the
+  // dictionary block in cache is actually associated with a decompressor
+  bbto.block_cache = NewLRUCache(1 << 20);
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  ASSERT_OK(TryReopen(options));
+
+  // Read should fail with corruption because the decompressor ignores
+  // the dictionary, causing ZSTD to produce garbage output
+  std::string value;
+  ASSERT_EQ(db_->Get(ReadOptions(), Key(0), &value).code(),
+            Status::kCorruption);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc
index f9753c0aba9b..47c22c030fc5 100644
--- a/util/crc32c_arm64.cc
+++ b/util/crc32c_arm64.cc
@@ -10,7 +10,7 @@
 #if defined(__linux__)
 #include <asm/hwcap.h>
 #endif
-#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
 #include <sys/auxv.h>
 #endif
 #ifndef HWCAP_CRC32
@@ -113,10 +113,9 @@ __attribute__((__no_sanitize__("alignment")))
 __attribute__((__no_sanitize_undefined__))
 #endif
 #endif
-uint32_t
-crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len) {
-  const uint8_t *buf8;
-  const uint64_t *buf64 = (uint64_t *)data;
+uint32_t crc32c_arm64(uint32_t crc, unsigned char const* data, size_t len) {
+  const uint8_t* buf8;
+  const uint64_t* buf64 = (uint64_t*)data;
   int length = (int)len;
   crc ^= 0xffffffff;
 
@@ -148,7 +147,7 @@ crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len) {
       uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
 
       /* Prefetch data for following block to avoid cache miss */
-      PREF1KL1((uint8_t *)buf64, 1024);
+      PREF1KL1((uint8_t*)buf64, 1024);
 
       /* First 8 byte for better pipelining */
       crc0 = crc32c_u64(crc, *buf64++);
@@ -184,22 +183,22 @@ crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len) {
 #endif
   }  // if Pmull runtime check here
 
-  buf8 = (const uint8_t *)buf64;
+  buf8 = (const uint8_t*)buf64;
   while (length >= 8) {
-    crc = crc32c_u64(crc, *(const uint64_t *)buf8);
+    crc = crc32c_u64(crc, *(const uint64_t*)buf8);
     buf8 += 8;
     length -= 8;
   }
 
   /* The following is more efficient than the straight loop */
   if (length >= 4) {
-    crc = crc32c_u32(crc, *(const uint32_t *)buf8);
+    crc = crc32c_u32(crc, *(const uint32_t*)buf8);
     buf8 += 4;
     length -= 4;
   }
 
   if (length >= 2) {
-    crc = crc32c_u16(crc, *(const uint16_t *)buf8);
+    crc = crc32c_u16(crc, *(const uint16_t*)buf8);
     buf8 += 2;
     length -= 2;
   }
diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h
index 5df3fa8d9deb..d2cfab3c7507 100644
--- a/util/crc32c_arm64.h
+++ b/util/crc32c_arm64.h
@@ -36,7 +36,7 @@
   PREF4X64L1(buffer, (PREF_OFFSET), 8) \
   PREF4X64L1(buffer, (PREF_OFFSET), 12)
 
-uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len);
+uint32_t crc32c_arm64(uint32_t crc, unsigned char const* data, size_t len);
 uint32_t crc32c_runtime_check(void);
 bool crc32c_pmull_runtime_check(void);
 
diff --git a/util/crc32c_ppc.h b/util/crc32c_ppc.h
index 365ba2c427a1..a3cfc63705f1 100644
--- a/util/crc32c_ppc.h
+++ b/util/crc32c_ppc.h
@@ -14,7 +14,7 @@
 extern "C" {
 #endif
 
-uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer, size_t len);
+uint32_t crc32c_ppc(uint32_t crc, unsigned char const* buffer, size_t len);
 
 #ifdef __cplusplus
 }
diff --git a/util/data_structure.cc b/util/data_structure.cc
index 04d0442a5fa9..6987168eebfa 100644
--- a/util/data_structure.cc
+++ b/util/data_structure.cc
@@ -13,4 +13,6 @@ int CountTrailingZeroBitsForSmallEnumSet(uint64_t v) {
   return CountTrailingZeroBits(v);
 }
 
+int BitsSetToOneForSmallEnumSet(uint64_t v) { return BitsSetToOne(v); }
+
 }  // namespace ROCKSDB_NAMESPACE::detail
diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc
index 0ff3b4a758eb..96e1e0f4367c 100644
--- a/util/dynamic_bloom.cc
+++ b/util/dynamic_bloom.cc
@@ -62,9 +62,9 @@ DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits,
     // Align on block_bytes boundary
     raw += block_bytes - block_offset;
   }
-  static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
+  static_assert(sizeof(RelaxedAtomic<uint64_t>) == sizeof(uint64_t),
                 "Expecting zero-space-overhead atomic");
-  data_ = reinterpret_cast<std::atomic<uint64_t>*>(raw);
+  data_ = reinterpret_cast<RelaxedAtomic<uint64_t>*>(raw);
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h
index 0ff1053ca6a8..e478a60d4102 100644
--- a/util/dynamic_bloom.h
+++ b/util/dynamic_bloom.h
@@ -7,12 +7,10 @@
 
 #include <array>
 #include <atomic>
-#include <memory>
-#include <string>
 
-#include "port/port.h"
 #include "rocksdb/slice.h"
 #include "table/multiget_context.h"
+#include "util/atomic.h"
 #include "util/hash.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -50,16 +48,20 @@ class DynamicBloom {
 
   ~DynamicBloom() {}
 
-  // Assuming single threaded access to this function.
+  // Assuming single thread adding to the DynamicBloom
   void Add(const Slice& key);
 
-  // Like Add, but may be called concurrent with other functions.
+  // Like Add, but may be called concurrently with other functions. Does not
+  // establish happens-before relationship with other functions so requires some
+  // external mechanism to ensure other threads can see the change.
   void AddConcurrently(const Slice& key);
 
   // Assuming single threaded access to this function.
   void AddHash(uint32_t hash);
 
-  // Like AddHash, but may be called concurrent with other functions.
+  // Like AddHash, but may be called concurrently with other functions. Does not
+  // establish happens-before relationship with other functions so requires some
+  // external mechanism to ensure other threads can see the change.
   void AddHashConcurrently(uint32_t hash);
 
   // Multithreaded access to this function is OK
@@ -80,7 +82,7 @@ class DynamicBloom {
   // this stores k/2, the number of words to double-probe.
   const uint32_t kNumDoubleProbes;
 
-  std::atomic<uint64_t>* data_;
+  RelaxedAtomic<uint64_t>* data_;
 
   // or_func(ptr, mask) should effect *ptr |= mask with the appropriate
   // concurrency safety, working with bytes.
@@ -97,21 +99,20 @@ inline void DynamicBloom::AddConcurrently(const Slice& key) {
 }
 
 inline void DynamicBloom::AddHash(uint32_t hash) {
-  AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
-    ptr->store(ptr->load(std::memory_order_relaxed) | mask,
-               std::memory_order_relaxed);
+  AddHash(hash, [](RelaxedAtomic<uint64_t>* ptr, uint64_t mask) {
+    ptr->StoreRelaxed(ptr->LoadRelaxed() | mask);
   });
 }
 
 inline void DynamicBloom::AddHashConcurrently(uint32_t hash) {
-  AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
+  AddHash(hash, [](RelaxedAtomic<uint64_t>* ptr, uint64_t mask) {
     // Happens-before between AddHash and MaybeContains is handled by
     // access to versions_->LastSequence(), so all we have to do here is
     // avoid races (so we don't give the compiler a license to mess up
     // our code) and not lose bits.  std::memory_order_relaxed is enough
     // for that.
-    if ((mask & ptr->load(std::memory_order_relaxed)) != mask) {
-      ptr->fetch_or(mask, std::memory_order_relaxed);
+    if ((mask & ptr->LoadRelaxed()) != mask) {
+      ptr->FetchOrRelaxed(mask);
     }
   });
 }
@@ -183,7 +184,7 @@ inline bool DynamicBloom::DoubleProbe(uint32_t h32, size_t byte_offset) const {
     // Two bit probes per uint64_t probe
     uint64_t mask =
         ((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63));
-    uint64_t val = data_[byte_offset ^ i].load(std::memory_order_relaxed);
+    uint64_t val = data_[byte_offset ^ i].LoadRelaxed();
     if (i + 1 >= kNumDoubleProbes) {
       return (val & mask) == mask;
     } else if ((val & mask) != mask) {
diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc
index 949ab8f76bb1..6b35214a9eca 100644
--- a/util/dynamic_bloom_test.cc
+++ b/util/dynamic_bloom_test.cc
@@ -43,13 +43,13 @@ struct KeyMaker {
   // Sequential, within a hash function block
   inline Slice Seq(uint64_t i) {
     a = i;
-    return Slice(reinterpret_cast<char *>(&a), sizeof(a));
+    return Slice(reinterpret_cast<char*>(&a), sizeof(a));
   }
   // Not quite sequential, varies across hash function blocks
   inline Slice Nonseq(uint64_t i) {
     a = i;
     b = i * 123;
-    return Slice(reinterpret_cast<char *>(this), sizeof(*this));
+    return Slice(reinterpret_cast<char*>(this), sizeof(*this));
   }
   inline Slice Key(uint64_t i, bool nonseq) {
     return nonseq ? Nonseq(i) : Seq(i);
@@ -315,7 +315,7 @@ TEST_F(DynamicBloomTest, concurrent_with_perf) {
 
 }  // namespace ROCKSDB_NAMESPACE
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   ParseCommandLineFlags(&argc, &argv, true);
diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc
index 35708aa7d61b..3ac8b9fe782b 100644
--- a/util/file_reader_writer_test.cc
+++ b/util/file_reader_writer_test.cc
@@ -1118,7 +1118,7 @@ TEST_F(WritableFileWriterIOPriorityTest, Append) {
 }
 
 TEST_F(WritableFileWriterIOPriorityTest, Pad) {
-  ASSERT_OK(writer_->Pad(IOOptions(), 500));
+  ASSERT_OK(writer_->Pad(IOOptions(), 500, kDefaultPageSize));
 }
 
 TEST_F(WritableFileWriterIOPriorityTest, Flush) {
diff --git a/util/filter_bench.cc b/util/filter_bench.cc
index c94d58194c39..7938d20953a5 100644
--- a/util/filter_bench.cc
+++ b/util/filter_bench.cc
@@ -126,7 +126,7 @@ DEFINE_bool(legend, false,
 
 DEFINE_uint32(runs, 1, "Number of times to rebuild and run benchmark tests");
 
-void _always_assert_fail(int line, const char *file, const char *expr) {
+void _always_assert_fail(int line, const char* file, const char* expr) {
   fprintf(stderr, "%s: %d: Assertion %s failed\n", file, line, expr);
   abort();
 }
@@ -145,7 +145,6 @@ using ROCKSDB_NAMESPACE::BlockContents;
 using ROCKSDB_NAMESPACE::BloomFilterPolicy;
 using ROCKSDB_NAMESPACE::BloomHash;
 using ROCKSDB_NAMESPACE::BloomLikeFilterPolicy;
-using ROCKSDB_NAMESPACE::BuiltinFilterBitsBuilder;
 using ROCKSDB_NAMESPACE::CachableEntry;
 using ROCKSDB_NAMESPACE::Cache;
 using ROCKSDB_NAMESPACE::CacheEntryRole;
@@ -153,6 +152,7 @@ using ROCKSDB_NAMESPACE::CacheEntryRoleOptions;
 using ROCKSDB_NAMESPACE::EncodeFixed32;
 using ROCKSDB_NAMESPACE::Env;
 using ROCKSDB_NAMESPACE::FastRange32;
+using ROCKSDB_NAMESPACE::FilterBitsBuilder;
 using ROCKSDB_NAMESPACE::FilterBitsReader;
 using ROCKSDB_NAMESPACE::FilterBuildingContext;
 using ROCKSDB_NAMESPACE::FilterPolicy;
@@ -195,7 +195,7 @@ struct KeyMaker {
       len += FastRange32(
           (val_num >> FLAGS_vary_key_size_log2_interval) * 1234567891, 5);
     }
-    char *data = buf_.get() + start;
+    char* data = buf_.get() + start;
     // Populate key data such that all data makes it into a key of at
     // least 8 bytes. We also don't want all the within-filter key
     // variance confined to a contiguous 32 bits, because then a 32 bit
@@ -220,7 +220,7 @@ void PrintWarnings() {
 #endif
 }
 
-void PrintError(const char *error) { fprintf(stderr, "ERROR: %s\n", error); }
+void PrintError(const char* error) { fprintf(stderr, "ERROR: %s\n", error); }
 
 struct FilterInfo {
   uint32_t filter_id_ = 0;
@@ -258,7 +258,7 @@ static const std::vector<TestMode> bestCaseTestModes = {
     kSingleFilter,
 };
 
-const char *TestModeToString(TestMode tm) {
+const char* TestModeToString(TestMode tm) {
   switch (tm) {
     case kSingleFilter:
       return "Single filter";
@@ -278,7 +278,7 @@ const char *TestModeToString(TestMode tm) {
 
 // Do just enough to keep some data dependence for the
 // compiler / CPU
-static uint32_t DryRunNoHash(Slice &s) {
+static uint32_t DryRunNoHash(Slice& s) {
   uint32_t sz = static_cast<uint32_t>(s.size());
   if (sz >= 4) {
     return sz + s.data()[3];
@@ -287,16 +287,16 @@ static uint32_t DryRunNoHash(Slice &s) {
   }
 }
 
-static uint32_t DryRunHash32(Slice &s) {
+static uint32_t DryRunHash32(Slice& s) {
   // Same perf characteristics as GetSliceHash()
   return BloomHash(s);
 }
 
-static uint32_t DryRunHash64(Slice &s) {
+static uint32_t DryRunHash64(Slice& s) {
   return Lower32of64(GetSliceHash64(s));
 }
 
-const std::shared_ptr<const FilterPolicy> &GetPolicy() {
+const std::shared_ptr<const FilterPolicy>& GetPolicy() {
   static std::shared_ptr<const FilterPolicy> policy;
   if (!policy) {
     policy = BloomLikeFilterPolicy::Create(
@@ -378,7 +378,7 @@ void FilterBench::Go() {
                                     FLAGS_average_keys_per_filter);
   const uint32_t variance_offset = variance_range / 2;
 
-  const std::vector<TestMode> &testModes = FLAGS_best_case ? bestCaseTestModes
+  const std::vector<TestMode>& testModes = FLAGS_best_case ? bestCaseTestModes
                                            : FLAGS_quick   ? quickTestModes
                                                            : allTestModes;
 
@@ -393,7 +393,7 @@ void FilterBench::Go() {
 
   std::cout << "Building..." << std::endl;
 
-  std::unique_ptr<BuiltinFilterBitsBuilder> builder;
+  std::unique_ptr<FilterBitsBuilder> builder;
 
   size_t total_memory_used = 0;
   size_t total_size = 0;
@@ -425,7 +425,7 @@ void FilterBench::Go() {
       keys_to_add = static_cast<uint32_t>(max_total_keys - total_keys_added);
     }
     infos_.emplace_back();
-    FilterInfo &info = infos_.back();
+    FilterInfo& info = infos_.back();
     info.filter_id_ = filter_id;
     info.keys_added_ = keys_to_add;
     if (FLAGS_use_plain_table_bloom) {
@@ -440,8 +440,7 @@ void FilterBench::Go() {
       info.filter_ = info.plain_table_bloom_->GetRawData();
     } else {
       if (!builder) {
-        builder.reset(
-            static_cast_with_check<BuiltinFilterBitsBuilder>(GetBuilder()));
+        builder.reset(GetBuilder());
       }
       for (uint32_t i = 0; i < keys_to_add; ++i) {
         builder->AddKey(kms_[0].Get(filter_id, i));
@@ -476,7 +475,7 @@ void FilterBench::Go() {
     total_size += info.filter_.size();
 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
     total_memory_used +=
-        malloc_usable_size(const_cast<char *>(info.filter_.data()));
+        malloc_usable_size(const_cast<char*>(info.filter_.data()));
 #endif  // ROCKSDB_MALLOC_USABLE_SIZE
     total_keys_added += keys_to_add;
   }
@@ -514,7 +513,7 @@ void FilterBench::Go() {
         static_cast<uint32_t>(m_queries_ * 1000000 / infos_.size());
     uint64_t fps = 0;
     for (uint32_t i = 0; i < infos_.size(); ++i) {
-      FilterInfo &info = infos_[i];
+      FilterInfo& info = infos_[i];
       for (uint32_t j = 0; j < info.keys_added_; ++j) {
         if (FLAGS_use_plain_table_bloom) {
           uint32_t hash = GetSliceHash(kms_[0].Get(info.filter_id_, j));
@@ -594,7 +593,7 @@ void FilterBench::Go() {
 
 double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
                                     TestMode mode) {
-  for (auto &info : infos_) {
+  for (auto& info : infos_) {
     info.outside_queries_ = 0;
     info.false_positives_ = 0;
   }
@@ -645,14 +644,14 @@ double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
   }
   uint32_t batch_size = 1;
   std::unique_ptr<Slice[]> batch_slices;
-  std::unique_ptr<Slice *[]> batch_slice_ptrs;
+  std::unique_ptr<Slice*[]> batch_slice_ptrs;
   std::unique_ptr<bool[]> batch_results;
   if (mode == kBatchPrepared || mode == kBatchUnprepared) {
     batch_size = static_cast<uint32_t>(kms_.size());
   }
 
   batch_slices.reset(new Slice[batch_size]);
-  batch_slice_ptrs.reset(new Slice *[batch_size]);
+  batch_slice_ptrs.reset(new Slice*[batch_size]);
   batch_results.reset(new bool[batch_size]);
   for (uint32_t i = 0; i < batch_size; ++i) {
     batch_results[i] = false;
@@ -673,7 +672,7 @@ double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
       filter_index = num_primary_filters +
                      random_.Uniformish(num_infos - num_primary_filters);
     }
-    FilterInfo &info = infos_[filter_index];
+    FilterInfo& info = infos_[filter_index];
     for (uint32_t i = 0; i < batch_size; ++i) {
       if (inside_this_time) {
         batch_slices[i] =
@@ -768,7 +767,7 @@ double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
     uint64_t fp = 0;
     double worst_fp_rate = 0.0;
     double best_fp_rate = 1.0;
-    for (auto &info : infos_) {
+    for (auto& info : infos_) {
       q += info.outside_queries_;
       fp += info.false_positives_;
       if (info.outside_queries_ > 0) {
@@ -790,7 +789,7 @@ double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
   return ns;
 }
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                   " [-quick] [OTHER OPTIONS]...");
diff --git a/util/gflags_compat.h b/util/gflags_compat.h
index 8f4a30b0d661..a38273fd7492 100644
--- a/util/gflags_compat.h
+++ b/util/gflags_compat.h
@@ -22,8 +22,8 @@
   namespace gflags_compat {           \
   DEFINE_int32(name, val, txt);       \
   }                                   \
-  uint32_t &FLAGS_##name =            \
-      *reinterpret_cast<uint32_t *>(&gflags_compat::FLAGS_##name);
+  uint32_t& FLAGS_##name =            \
+      *reinterpret_cast<uint32_t*>(&gflags_compat::FLAGS_##name);
 
-#define DECLARE_uint32(name) extern uint32_t &FLAGS_##name;
+#define DECLARE_uint32(name) extern uint32_t& FLAGS_##name;
 #endif  // !DEFINE_uint32
diff --git a/util/hash_test.cc b/util/hash_test.cc
index ccc283a24376..2b3f5a4ae856 100644
--- a/util/hash_test.cc
+++ b/util/hash_test.cc
@@ -233,8 +233,8 @@ TEST(HashTest, Hash64SmallValueSchema) {
             uint64_t{10551812464348219044u});
 }
 
-std::string Hash64TestDescriptor(const char *repeat, size_t limit) {
-  const char *mod61_encode =
+std::string Hash64TestDescriptor(const char* repeat, size_t limit) {
+  const char* mod61_encode =
       "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 
   std::string input;
@@ -388,8 +388,8 @@ TEST(HashTest, Hash128Trivial) {
   }
 }
 
-std::string Hash128TestDescriptor(const char *repeat, size_t limit) {
-  const char *mod61_encode =
+std::string Hash128TestDescriptor(const char* repeat, size_t limit) {
+  const char* mod61_encode =
       "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 
   std::string input;
@@ -615,6 +615,13 @@ static void test_BitOps() {
 
     // BottomNBits
     {
+      // build the mask the extremely slow way
+      T bottom_n_mask = 0x00;
+      for (int j = 0; j < i; j++) {
+        bottom_n_mask <<= 1;
+        bottom_n_mask |= 0x1;
+      }
+
       // An essentially full length value
       T x = everyOtherBit;
       if (i > 2) {
@@ -623,6 +630,11 @@ static void test_BitOps() {
       }
       auto a = BottomNBits(x, i);
       auto b = BottomNBits(~x, i);
+
+      // check that a and b match the expected values
+      EXPECT_EQ(a, x & bottom_n_mask);
+      EXPECT_EQ(b, (~x) & bottom_n_mask);
+
       EXPECT_EQ(x | a, x);
       EXPECT_EQ(a | b, vm1);
       EXPECT_EQ(a & b, T{0});
@@ -838,7 +850,7 @@ TEST(MathTest, Math128) {
 }
 
 TEST(MathTest, Coding128) {
-  const char *in = "_1234567890123456";
+  const char* in = "_1234567890123456";
   // Note: in + 1 is likely unaligned
   Unsigned128 decoded = DecodeFixed128(in + 1);
   EXPECT_EQ(Lower64of128(decoded), 0x3837363534333231U);
@@ -851,7 +863,7 @@ TEST(MathTest, Coding128) {
 }
 
 TEST(MathTest, CodingGeneric) {
-  const char *in = "_1234567890123456";
+  const char* in = "_1234567890123456";
   // Decode
   // Note: in + 1 is likely unaligned
   Unsigned128 decoded128 = DecodeFixedGeneric<Unsigned128>(in + 1);
@@ -887,7 +899,7 @@ TEST(MathTest, CodingGeneric) {
   EXPECT_EQ(std::string("_12"), std::string(out));
 }
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   fprintf(stderr, "NPHash64 id: %x\n",
           static_cast<int>(ROCKSDB_NAMESPACE::GetSliceNPHash64("RocksDB")));
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
diff --git a/util/interval_test.cc b/util/interval_test.cc
new file mode 100644
index 000000000000..caa102df577e
--- /dev/null
+++ b/util/interval_test.cc
@@ -0,0 +1,102 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "rocksdb/data_structure.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IntervalSetTest : public testing::Test {};
+
+TEST_F(IntervalSetTest, BasicTest) {
+  IntervalSet<int> set;
+  set.insert({2, 15});
+  EXPECT_EQ(set.size(), 1);
+  set.insert({5, 9});
+  EXPECT_EQ(set.size(), 1);
+  set.insert({0, 10});
+  EXPECT_EQ(set.size(), 1);
+  set.insert({25, 30});
+  EXPECT_EQ(set.size(), 2);
+  set.insert({16, 25});
+  EXPECT_EQ(set.size(), 2);
+  set.insert({45, 85});
+  ASSERT_EQ(set.size(), 3);
+  auto iter = set.begin();
+  ASSERT_EQ(*iter, Interval<int>(0, 15));
+  iter++;
+  ASSERT_EQ(*iter, Interval<int>(16, 30));
+  iter++;
+  ASSERT_EQ(*iter, Interval<int>(45, 85));
+  set.insert({31});
+  iter = set.begin();
+  ASSERT_EQ(*iter, Interval<int>(0, 15));
+  iter++;
+  ASSERT_EQ(*iter, Interval<int>(16, 30));
+  iter++;
+  ASSERT_EQ(*iter, Interval<int>(31));
+}
+
+TEST_F(IntervalSetTest, SliceTest) {
+  IntervalSet<Slice, Comparator> set(BytewiseComparator());
+  EXPECT_TRUE(set.insert("k00", "k10"));
+  // Should do nothing
+  EXPECT_TRUE(set.insert("k02", "k08"));
+  auto iter = set.begin();
+  ASSERT_EQ(iter->start().ToString(), "k00");
+  ASSERT_EQ(iter->end().ToString(), "k10");
+  ASSERT_EQ(set.size(), 1);
+  iter++;
+  ASSERT_EQ(iter, set.end());
+  EXPECT_TRUE(set.insert("k15", "k20"));
+  EXPECT_TRUE(set.insert("k16"));
+  ASSERT_EQ(set.size(), 2);
+  iter = set.begin();
+  ASSERT_EQ(iter->start().ToString(), "k00");
+  ASSERT_EQ(iter->end().ToString(), "k10");
+  iter++;
+  ASSERT_EQ(iter->start().ToString(), "k15");
+  ASSERT_EQ(iter->has_end(), false);
+  //
+}
+
+TEST_F(IntervalSetTest, PropModeTest) {
+  IntervalSet<Slice, Comparator> set(BytewiseComparator(), true);
+  EXPECT_TRUE(set.insert("k00", "k10"));
+  // Should do nothing
+  EXPECT_FALSE(set.insert("k02", "k08"));
+  EXPECT_EQ(set.size(), 1);
+  EXPECT_TRUE(set.insert("k15", "k20"));
+  EXPECT_EQ(set.size(), 2);
+  EXPECT_FALSE(set.insert("k16"));
+  ASSERT_EQ(set.size(), 2);
+  auto iter = set.begin();
+  ASSERT_EQ(iter->start().ToString(), "k00");
+  ASSERT_EQ(iter->end().ToString(), "k10");
+  iter++;
+  ASSERT_EQ(iter->start().ToString(), "k15");
+  ASSERT_EQ(iter->end().ToString(), "k20");
+  EXPECT_TRUE(set.insert("k12", "k14"));
+  iter = set.begin();
+  ASSERT_EQ(set.size(), 3);
+  ASSERT_EQ(iter->start().ToString(), "k00");
+  ASSERT_EQ(iter->end().ToString(), "k10");
+  iter++;
+  ASSERT_EQ(iter->start().ToString(), "k12");
+  ASSERT_EQ(iter->end().ToString(), "k14");
+  iter++;
+  ASSERT_EQ(iter->start().ToString(), "k15");
+  ASSERT_EQ(iter->end().ToString(), "k20");
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/util/io_dispatcher_imp.cc b/util/io_dispatcher_imp.cc
new file mode 100644
index 000000000000..2789414860c7
--- /dev/null
+++ b/util/io_dispatcher_imp.cc
@@ -0,0 +1,1063 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/io_dispatcher_imp.h"
+
+#include <deque>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "monitoring/statistics_impl.h"
+#include "port/port.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_dispatcher.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/reader_common.h"
+#include "table/format.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// IODispatcherImplData is the base that provides ReleaseMemory interface
+// for ReadSets to call back when releasing blocks. Defined here so it's
+// visible to ReadSet methods.
+struct IODispatcherImplData {
+  virtual ~IODispatcherImplData() = default;
+  virtual void ReleaseMemory(size_t bytes) = 0;
+};
+
+// Helper function to create and pin a block from a buffer
+// Used by both ReadSet::PollAndProcessAsyncIO and IODispatcherImpl::Impl
+static Status CreateAndPinBlockFromBuffer(
+    const std::shared_ptr<IOJob>& job, const BlockHandle& block,
+    uint64_t buffer_start_offset, const Slice& buffer_data,
+    CachableEntry<Block>& pinned_block_entry) {
+  auto* rep = job->table->get_rep();
+
+  // Get decompressor
+  UnownedPtr<Decompressor> decompressor = rep->decompressor.get();
+  CachableEntry<DecompressorDict> cached_dict;
+
+  if (rep->uncompression_dict_reader) {
+    Status s = rep->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+        nullptr, job->job_options.read_options, nullptr, nullptr, &cached_dict);
+    if (!s.ok()) {
+      return s;
+    }
+    if (cached_dict.GetValue()) {
+      decompressor = cached_dict.GetValue()->decompressor_.get();
+    }
+  }
+
+  // Create block from buffer data
+  const auto block_size_with_trailer =
+      BlockBasedTable::BlockSizeWithTrailer(block);
+  const auto block_offset_in_buffer = block.offset() - buffer_start_offset;
+
+  CacheAllocationPtr data = AllocateBlock(
+      block_size_with_trailer, GetMemoryAllocator(rep->table_options));
+  memcpy(data.get(), buffer_data.data() + block_offset_in_buffer,
+         block_size_with_trailer);
+  BlockContents tmp_contents(std::move(data), block.size());
+
+#ifndef NDEBUG
+  tmp_contents.has_trailer = rep->footer.GetBlockTrailerSize() > 0;
+#endif
+
+  return job->table->CreateAndPinBlockInCache<Block_kData>(
+      job->job_options.read_options, block, decompressor, &tmp_contents,
+      &pinned_block_entry.As<Block_kData>());
+}
+
+// State for async IO operations (implementation detail)
+struct AsyncIOState {
+  AsyncIOState() : offset(static_cast<uint64_t>(-1)) {}
+  ~AsyncIOState() { read_req.status.PermitUncheckedError(); }
+
+  AsyncIOState(const AsyncIOState&) = delete;
+  AsyncIOState& operator=(const AsyncIOState&) = delete;
+  AsyncIOState(AsyncIOState&&) = default;
+  AsyncIOState& operator=(AsyncIOState&&) = default;
+
+  std::unique_ptr<char[]> buf;
+  AlignedBuf aligned_buf;
+  void* io_handle = nullptr;
+  IOHandleDeleter del_fn;
+  uint64_t offset;
+  std::vector<size_t> block_indices;
+  std::vector<BlockHandle> blocks;
+  FSReadRequest read_req;
+};
+
+// ReadSet destructor - clean up IO handles
+// Must call AbortIO before deleting handles to avoid use-after-free when
+// io_uring completions arrive for deleted handles.
+ReadSet::~ReadSet() {
+  // Release memory for any blocks still pinned
+  // Note: block_sizes_[i] is only set for async IO reads where memory
+  // limiting applies. For sync reads, block_sizes_ remains 0, so this
+  // loop is effectively a no-op for sync reads.
+  if (auto dispatcher_data = dispatcher_data_.lock()) {
+    for (size_t i = 0; i < block_sizes_.size(); ++i) {
+      if (block_sizes_[i] > 0 && pinned_blocks_[i].GetValue()) {
+        dispatcher_data->ReleaseMemory(block_sizes_[i]);
+      }
+    }
+  }
+
+  if (async_io_map_.empty()) {
+    return;
+  }
+
+  // Collect unique pending IO handles (multiple block indices may share the
+  // same async_state due to coalescing)
+  std::vector<void*> pending_handles;
+  std::unordered_set<void*> seen_handles;
+  for (auto& pair : async_io_map_) {
+    auto& async_state = pair.second;
+    if (async_state->io_handle != nullptr &&
+        seen_handles.find(async_state->io_handle) == seen_handles.end()) {
+      pending_handles.push_back(async_state->io_handle);
+      seen_handles.insert(async_state->io_handle);
+    }
+  }
+
+  // Abort all pending IO operations before deleting handles
+  if (!pending_handles.empty() && fs_) {
+    // AbortIO cancels pending requests and waits for completions
+    IOStatus s = fs_->AbortIO(pending_handles);
+    (void)s;  // Ignore errors in destructor
+  }
+
+  // Now safe to delete the handles
+  for (auto& pair : async_io_map_) {
+    auto& async_state = pair.second;
+    if (async_state->io_handle != nullptr && async_state->del_fn != nullptr) {
+      async_state->del_fn(async_state->io_handle);
+      async_state->io_handle = nullptr;
+    }
+  }
+}
+
+// Main Read() method - transparently handles cache, async IO, and sync reads
+Status ReadSet::ReadIndex(size_t block_index, CachableEntry<Block>* out) {
+  // Bounds check
+  if (block_index >= pinned_blocks_.size()) {
+    return Status::InvalidArgument("Block index out of range");
+  }
+
+  // Case 1: Block is already available (from cache or sync read during
+  // SubmitJob)
+  if (pinned_blocks_[block_index].GetValue()) {
+    *out = std::move(pinned_blocks_[block_index]);
+    // Note: Statistics for this block were already counted during SubmitJob
+    // (either as cache hit or sync read)
+    return Status::OK();
+  }
+
+  // Case 2: Block has async IO in progress - poll and process
+  if (job_->job_options.read_options.async_io) {
+    auto it = async_io_map_.find(block_index);
+    if (it != async_io_map_.end()) {
+      // Get the number of blocks in this coalesced async request BEFORE polling
+      // (since PollAndProcessAsyncIO will remove entries from the map)
+      size_t num_blocks_in_request = it->second->block_indices.size();
+
+      if (Status s = PollAndProcessAsyncIO(it->second); !s.ok()) {
+        return s;
+      }
+      // Count all blocks that were read in this async request
+      num_async_reads_ += num_blocks_in_request;
+
+      // After polling, the block should be in pinned_blocks_
+      if (pinned_blocks_[block_index].GetValue()) {
+        *out = std::move(pinned_blocks_[block_index]);
+        return Status::OK();
+      }
+
+      return Status::IOError("Failed to process async IO result");
+    }
+  }
+
+  // Case 3: Block needs synchronous read
+  // If this block was pending prefetch, remove it since we're reading it now
+  RemoveFromPending(block_index);
+
+  Status s = SyncRead(block_index);
+  if (s.ok()) {
+    *out = std::move(pinned_blocks_[block_index]);
+    num_sync_reads_++;
+  }
+  return s;
+}
+
+Status ReadSet::ReadOffset(size_t offset, CachableEntry<Block>* out) {
+  if (sorted_block_indices_.empty()) {
+    return Status::InvalidArgument("ReadSet not initialized");
+  }
+
+  // Use binary search on the sorted index to find the block containing offset.
+  // sorted_block_indices_ contains original indices sorted by block offset.
+  const auto& block_handles = job_->block_handles;
+
+  // Binary search for the first block whose offset is > offset, then back up
+  auto it = std::upper_bound(sorted_block_indices_.begin(),
+                             sorted_block_indices_.end(), offset,
+                             [&block_handles](size_t off, size_t idx) {
+                               return off < block_handles[idx].offset();
+                             });
+
+  // If it == begin(), offset is before all blocks
+  if (it == sorted_block_indices_.begin()) {
+    return Status::InvalidArgument("Offset not found in any block");
+  }
+
+  // Back up to the candidate block (largest offset <= our offset)
+  --it;
+  size_t candidate_idx = *it;
+  const auto& handle = block_handles[candidate_idx];
+
+  // Check if offset falls within this block
+  if (offset >= handle.offset() && offset < (handle.offset() + handle.size())) {
+    return ReadIndex(candidate_idx, out);
+  }
+
+  return Status::InvalidArgument("Offset not found in any block");
+}
+
+void ReadSet::ReleaseBlock(size_t block_index) {
+  if (block_index >= pinned_blocks_.size()) {
+    return;
+  }
+
+  // Remove from pending if applicable
+  RemoveFromPending(block_index);
+
+  // Release memory BEFORE unpinning
+  // Note: block_sizes_[idx] is only set for async IO reads where memory
+  // limiting applies. For sync reads, block_sizes_ remains 0, so this
+  // check implicitly skips ReleaseMemory for sync reads.
+  if (pinned_blocks_[block_index].GetValue() &&
+      block_index < block_sizes_.size() && block_sizes_[block_index] > 0) {
+    if (auto dispatcher_data = dispatcher_data_.lock()) {
+      dispatcher_data->ReleaseMemory(block_sizes_[block_index]);
+    }
+    block_sizes_[block_index] = 0;  // Prevent double-release
+  }
+
+  // Unpin the block from cache
+  pinned_blocks_[block_index].Reset();
+  // Clean up any pending async IO for this block
+  async_io_map_.erase(block_index);
+}
+
+bool ReadSet::IsBlockAvailable(size_t block_index) const {
+  if (block_index >= pinned_blocks_.size()) {
+    return false;
+  }
+  // Block is available if it hasn't been released (still has a value or
+  // has pending async IO)
+  return pinned_blocks_[block_index].GetValue() != nullptr ||
+         async_io_map_.find(block_index) != async_io_map_.end();
+}
+
+// Poll and process async IO for a specific block
+Status ReadSet::PollAndProcessAsyncIO(
+    const std::shared_ptr<AsyncIOState>& async_state) {
+  auto* rep = job_->table->get_rep();
+
+  // Poll for IO completion using FileSystem Poll API
+  std::vector<void*> io_handles = {async_state->io_handle};
+  IOStatus io_s = rep->ioptions.env->GetFileSystem()->Poll(io_handles, 1);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  // Check for read errors
+  if (!async_state->read_req.status.ok()) {
+    return async_state->read_req.status;
+  }
+
+  // Use the result slice from the callback which has been correctly set
+  // with any necessary alignment adjustments for direct IO
+  const Slice& buffer_data = async_state->read_req.result;
+
+  // Process all blocks in this async request
+  for (size_t i = 0; i < async_state->block_indices.size(); ++i) {
+    const size_t idx = async_state->block_indices[i];
+    const auto& block_handle = async_state->blocks[i];
+
+    Status s =
+        CreateAndPinBlockFromBuffer(job_, block_handle, async_state->offset,
+                                    buffer_data, pinned_blocks_[idx]);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Clean up IO handle
+  if (async_state->io_handle != nullptr && async_state->del_fn != nullptr) {
+    async_state->del_fn(async_state->io_handle);
+    async_state->io_handle = nullptr;
+  }
+
+  // Remove from map - all blocks in this request have been processed
+  // Store indices in a temporary vector to avoid iterator invalidation
+  std::vector<size_t> indices_to_remove = async_state->block_indices;
+  for (const auto idx : indices_to_remove) {
+    async_io_map_.erase(idx);
+  }
+
+  return Status::OK();
+}
+
+// Perform synchronous read for a specific block
+// This performs a direct synchronous read from disk when the block is not in
+// cache
+Status ReadSet::SyncRead(size_t block_index) {
+  const auto& block_handle = job_->block_handles[block_index];
+  auto* rep = job_->table->get_rep();
+
+  return job_->table->RetrieveBlock<Block_kData>(
+      /*prefetch_buffer=*/nullptr, job_->job_options.read_options, block_handle,
+      rep->decompressor.get(), &pinned_blocks_[block_index].As<Block_kData>(),
+      /*get_context=*/nullptr, /*lookup_context=*/nullptr,
+      /*for_compaction=*/false, /*use_cache=*/true,
+      /*async_read=*/false, /*use_block_cache_for_lookup=*/true);
+}
+
+// A pre-coalesced group of blocks for prefetching
+struct CoalescedPrefetchGroup {
+  std::vector<size_t> block_indices;  // Blocks in this group (sorted by offset)
+  size_t total_bytes = 0;             // Total bytes for this IO
+};
+
+// State for a pending memory request waiting to be granted
+// Groups are pre-coalesced at queue time for efficient dispatch
+struct PendingPrefetchRequest {
+  std::weak_ptr<ReadSet> read_set;
+  std::shared_ptr<IOJob> job;
+
+  // Pre-coalesced groups ready for dispatch (ordered by first block index)
+  std::deque<CoalescedPrefetchGroup> coalesced_groups;
+
+  // Individual block indices still pending (for RemoveFromPending lookup)
+  std::unordered_set<size_t> block_indices_to_prefetch;
+
+  std::atomic<size_t> pending_bytes_{0};  // Track remaining bytes
+  mutable port::Mutex groups_mutex_;  // Protects groups and set modifications
+};
+
+// Remove a block from pending prefetch (called when block is read or released)
+void ReadSet::RemoveFromPending(size_t block_index) {
+  if (!pending_prefetch_flags_ || block_index >= pending_prefetch_flags_size_) {
+    return;
+  }
+
+  // Atomic exchange - returns true only if it was previously true
+  if (!pending_prefetch_flags_[block_index].exchange(false)) {
+    return;  // Already removed or never pending
+  }
+
+  if (pending_request_) {
+    MutexLock lock(&pending_request_->groups_mutex_);
+    pending_request_->block_indices_to_prefetch.erase(block_index);
+    pending_request_->pending_bytes_ -= block_sizes_[block_index];
+  }
+}
+
+// IODispatcherImpl::Impl inherits from IODispatcherImplData
+struct IODispatcherImpl::Impl : public IODispatcherImplData,
+                                public std::enable_shared_from_this<Impl> {
+  explicit Impl(const IODispatcherOptions& options);
+  ~Impl() override;
+
+  // Non-copyable and non-movable
+  Impl(const Impl&) = delete;
+  Impl& operator=(const Impl&) = delete;
+  Impl(Impl&&) = delete;
+  Impl& operator=(Impl&&) = delete;
+
+  Status SubmitJob(const std::shared_ptr<IOJob>& job,
+                   std::shared_ptr<ReadSet>* read_set);
+
+  // Memory management methods - non-blocking
+  bool TryAcquireMemory(size_t bytes);
+  void ReleaseMemory(size_t bytes) override;
+
+  // Memory limiting state
+  size_t max_prefetch_memory_bytes_ = 0;
+  std::atomic<size_t> memory_used_{0};  // Atomic for lock-free accounting
+  std::atomic<bool> has_pending_requests_{false};  // Fast-path check
+  port::Mutex memory_mutex_;  // Only for pending_prefetch_queue_ access
+  std::deque<std::shared_ptr<PendingPrefetchRequest>> pending_prefetch_queue_;
+  Statistics* statistics_ = nullptr;
+
+ private:
+  void PrepareIORequests(
+      const std::shared_ptr<IOJob>& job,
+      const std::vector<size_t>& block_indices_to_read,
+      const std::vector<BlockHandle>& block_handles,
+      std::vector<FSReadRequest>* read_reqs,
+      std::vector<std::vector<size_t>>* coalesced_block_indices);
+
+  // Surface actual async IO errors to caller, but allow fallback for
+  // unsupported cases. Returns block indices that need sync fallback.
+  std::vector<size_t> ExecuteAsyncIO(
+      const std::shared_ptr<IOJob>& job,
+      const std::shared_ptr<ReadSet>& read_set,
+      std::vector<FSReadRequest>& read_reqs,
+      const std::vector<std::vector<size_t>>& coalesced_block_indices,
+      Status* out_status);
+
+  Status ExecuteSyncIO(
+      const std::shared_ptr<IOJob>& job,
+      const std::shared_ptr<ReadSet>& read_set,
+      std::vector<FSReadRequest>& read_reqs,
+      const std::vector<std::vector<size_t>>& coalesced_block_indices);
+
+  // Try to dispatch pending prefetch requests when memory becomes available
+  void TryDispatchPendingPrefetches();
+
+  // Dispatch prefetch for a specific ReadSet (called when memory is available)
+  void DispatchPrefetch(const std::shared_ptr<ReadSet>& read_set,
+                        const std::shared_ptr<IOJob>& job,
+                        const std::vector<size_t>& block_indices);
+
+  // Pre-coalesce blocks into groups, respecting max_group_bytes size limit.
+  // Returns groups ordered by first block index (earlier blocks first).
+  std::vector<CoalescedPrefetchGroup> PreCoalesceBlocks(
+      const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& rs,
+      const std::vector<size_t>& block_indices, size_t max_group_bytes);
+};
+
+IODispatcherImpl::Impl::Impl(const IODispatcherOptions& options)
+    : max_prefetch_memory_bytes_(options.max_prefetch_memory_bytes),
+      statistics_(options.statistics) {}
+
+IODispatcherImpl::Impl::~Impl() {}
+
+bool IODispatcherImpl::Impl::TryAcquireMemory(size_t bytes) {
+  if (max_prefetch_memory_bytes_ == 0) {
+    return true;  // No limit configured
+  }
+
+  // Lock-free memory acquisition using compare-exchange
+  size_t current = memory_used_.load(std::memory_order_relaxed);
+  while (true) {
+    if (current + bytes > max_prefetch_memory_bytes_) {
+      // Not enough memory - caller should queue for later
+      RecordTick(statistics_, PREFETCH_MEMORY_REQUESTS_BLOCKED);
+      return false;
+    }
+    if (memory_used_.compare_exchange_weak(current, current + bytes,
+                                           std::memory_order_release,
+                                           std::memory_order_relaxed)) {
+      RecordTick(statistics_, PREFETCH_MEMORY_BYTES_GRANTED, bytes);
+      return true;
+    }
+    // current is updated by compare_exchange_weak on failure, retry
+  }
+}
+
+void IODispatcherImpl::Impl::ReleaseMemory(size_t bytes) {
+  if (max_prefetch_memory_bytes_ == 0) {
+    return;  // No limit configured
+  }
+
+  // Lock-free memory release using atomic fetch_sub
+  size_t old_val = memory_used_.fetch_sub(bytes, std::memory_order_release);
+  assert(old_val >= bytes);
+  (void)old_val;  // Suppress unused warning in release builds
+  RecordTick(statistics_, PREFETCH_MEMORY_BYTES_RELEASED, bytes);
+
+  // Fast-path: skip dispatch attempt if no pending requests
+  // This avoids mutex contention in the common single-threaded iterator case
+  if (!has_pending_requests_.load(std::memory_order_acquire)) {
+    return;
+  }
+
+  // Try to dispatch pending prefetches now that memory is available
+  TryDispatchPendingPrefetches();
+}
+
+void IODispatcherImpl::Impl::TryDispatchPendingPrefetches() {
+  // Process pending prefetch requests - dispatch entire coalesced groups
+  while (true) {
+    std::shared_ptr<PendingPrefetchRequest> pending;
+
+    {
+      MutexLock lock(&memory_mutex_);
+      if (pending_prefetch_queue_.empty()) {
+        has_pending_requests_.store(false, std::memory_order_release);
+        return;
+      }
+
+      // Get the next pending request
+      pending = std::move(pending_prefetch_queue_.front());
+      pending_prefetch_queue_.pop_front();
+    }
+
+    // Check if the ReadSet is still alive
+    auto read_set = pending->read_set.lock();
+    if (!read_set) {
+      continue;  // ReadSet was destroyed, skip this request
+    }
+
+    // Try to acquire memory for coalesced groups (entire groups at a time)
+    std::vector<size_t> blocks_to_dispatch;
+    bool has_remaining_groups = false;
+
+    {
+      MutexLock lock(&pending->groups_mutex_);
+
+      while (!pending->coalesced_groups.empty()) {
+        auto& group = pending->coalesced_groups.front();
+
+        // Filter out blocks that were already read (not in pending set anymore)
+        std::vector<size_t> remaining_blocks;
+        size_t remaining_bytes = 0;
+        for (size_t idx : group.block_indices) {
+          if (pending->block_indices_to_prefetch.count(idx) > 0) {
+            remaining_blocks.push_back(idx);
+            remaining_bytes += read_set->block_sizes_[idx];
+          }
+        }
+
+        // Skip empty groups (all blocks were already read)
+        if (remaining_blocks.empty()) {
+          pending->coalesced_groups.pop_front();
+          continue;
+        }
+
+        // Try to acquire memory for remaining blocks only
+        if (TryAcquireMemory(remaining_bytes)) {
+          // Add all remaining blocks from this group to dispatch
+          for (size_t idx : remaining_blocks) {
+            blocks_to_dispatch.push_back(idx);
+            pending->block_indices_to_prefetch.erase(idx);
+          }
+          pending->pending_bytes_ -= remaining_bytes;
+          pending->coalesced_groups.pop_front();
+        } else {
+          // Not enough memory for this group - update with remaining blocks
+          group.block_indices = std::move(remaining_blocks);
+          group.total_bytes = remaining_bytes;
+          has_remaining_groups = true;
+          break;
+        }
+      }
+    }
+
+    // Save job before potential move of pending
+    auto job = pending->job;
+
+    // Requeue if groups remain
+    if (has_remaining_groups) {
+      MutexLock lock(&memory_mutex_);
+      pending_prefetch_queue_.push_front(std::move(pending));
+    } else {
+      // All groups dispatched, clear pending state
+      read_set->pending_request_.reset();
+    }
+
+    // Clear pending flags for dispatched blocks
+    if (read_set->pending_prefetch_flags_) {
+      for (size_t idx : blocks_to_dispatch) {
+        if (idx < read_set->pending_prefetch_flags_size_) {
+          read_set->pending_prefetch_flags_[idx].store(false);
+        }
+      }
+    }
+
+    // Dispatch acquired blocks
+    if (!blocks_to_dispatch.empty()) {
+      DispatchPrefetch(read_set, job, blocks_to_dispatch);
+    }
+
+    // If we dispatched nothing, stop (no memory available for any group)
+    if (blocks_to_dispatch.empty()) {
+      return;
+    }
+  }
+}
+
+void IODispatcherImpl::Impl::DispatchPrefetch(
+    const std::shared_ptr<ReadSet>& read_set, const std::shared_ptr<IOJob>& job,
+    const std::vector<size_t>& block_indices) {
+  // Sync point for testing partial prefetch - passes number of blocks being
+  // dispatched
+  TEST_SYNC_POINT_CALLBACK("IODispatcherImpl::DispatchPrefetch:BlockCount",
+                           const_cast<std::vector<size_t>*>(&block_indices));
+
+  // Prepare and execute IO for the given blocks
+  std::vector<FSReadRequest> read_reqs;
+  std::vector<std::vector<size_t>> coalesced_block_indices;
+  PrepareIORequests(job, block_indices, job->block_handles, &read_reqs,
+                    &coalesced_block_indices);
+
+  if (job->job_options.read_options.async_io) {
+    Status async_status;
+    std::vector<size_t> fallback_indices = ExecuteAsyncIO(
+        job, read_set, read_reqs, coalesced_block_indices, &async_status);
+
+    // For blocks where async is not supported, do sync IO
+    if (!fallback_indices.empty()) {
+      std::vector<FSReadRequest> sync_read_reqs;
+      std::vector<std::vector<size_t>> sync_coalesced_indices;
+      PrepareIORequests(job, fallback_indices, job->block_handles,
+                        &sync_read_reqs, &sync_coalesced_indices);
+      // Prefetch errors are ignored - user will get the error when reading
+      Status s =
+          ExecuteSyncIO(job, read_set, sync_read_reqs, sync_coalesced_indices);
+      s.PermitUncheckedError();
+      read_set->num_sync_reads_ += fallback_indices.size();
+    }
+    // Async errors are also ignored - user will get the error when reading
+    async_status.PermitUncheckedError();
+  } else {
+    // Prefetch errors are ignored - user will get the error when reading
+    Status s = ExecuteSyncIO(job, read_set, read_reqs, coalesced_block_indices);
+    s.PermitUncheckedError();
+    read_set->num_sync_reads_ += block_indices.size();
+  }
+}
+
+Status IODispatcherImpl::Impl::SubmitJob(const std::shared_ptr<IOJob>& job,
+                                         std::shared_ptr<ReadSet>* read_set) {
+  if (!read_set) {
+    return Status::InvalidArgument("read_set output parameter is null");
+  }
+
+  auto rs = std::make_shared<ReadSet>();
+
+  // Initialize ReadSet
+  rs->job_ = job;
+  rs->fs_ = job->table->get_rep()->ioptions.env->GetFileSystem();
+  rs->pinned_blocks_.resize(job->block_handles.size());
+  rs->block_sizes_.resize(job->block_handles.size(), 0);
+
+  // Build sorted index for O(log n) ReadOffset lookups via binary search.
+  // sorted_block_indices_[i] = original index of i-th smallest block by offset.
+  rs->sorted_block_indices_.resize(job->block_handles.size());
+  for (size_t i = 0; i < job->block_handles.size(); ++i) {
+    rs->sorted_block_indices_[i] = i;
+  }
+  std::sort(rs->sorted_block_indices_.begin(), rs->sorted_block_indices_.end(),
+            [&job](size_t a, size_t b) {
+              return job->block_handles[a].offset() <
+                     job->block_handles[b].offset();
+            });
+
+  // Step 1: Check cache and pin cached blocks
+  std::vector<size_t> block_indices_to_read;
+
+  for (size_t i = 0; i < job->block_handles.size(); ++i) {
+    const auto& data_block_handle = job->block_handles[i];
+
+    // Lookup and pin block in cache
+    Status s = job->table->LookupAndPinBlocksInCache<Block_kData>(
+        job->job_options.read_options, data_block_handle,
+        &(rs->pinned_blocks_)[i].As<Block_kData>());
+
+    if (!s.ok()) {
+      continue;
+    }
+
+    if (!(rs->pinned_blocks_)[i].GetValue()) {
+      // Block not in cache - needs to be read from disk
+      block_indices_to_read.emplace_back(i);
+    }
+  }
+
+  // Step 2: Prepare IO requests for blocks not in cache
+  if (block_indices_to_read.empty()) {
+    // All blocks found in cache - count them as cache hits
+    rs->num_cache_hits_ = job->block_handles.size();
+    *read_set = std::move(rs);
+    return Status::OK();
+  }
+
+  // Count cache hits (blocks that were found in cache during lookup above)
+  rs->num_cache_hits_ =
+      job->block_handles.size() - block_indices_to_read.size();
+
+  // Calculate block sizes for uncached blocks
+  for (const auto& idx : block_indices_to_read) {
+    size_t block_size =
+        BlockBasedTable::BlockSizeWithTrailer(job->block_handles[idx]);
+    rs->block_sizes_[idx] = block_size;
+  }
+
+  // Store dispatcher reference for release callbacks
+  rs->dispatcher_data_ = shared_from_this();
+
+  // Pre-coalesce blocks into groups, respecting memory budget per group
+  // This ensures we dispatch meaningful IO sizes, not tiny single-block IOs
+  // Both memory-limited and non-memory-limited paths use the same coalescing
+  auto coalesced_groups = PreCoalesceBlocks(job, rs, block_indices_to_read,
+                                            max_prefetch_memory_bytes_);
+
+  std::vector<size_t> blocks_to_dispatch;
+  std::deque<CoalescedPrefetchGroup> groups_to_queue;
+
+  // Try to acquire memory for entire coalesced groups
+  for (auto& group : coalesced_groups) {
+    if (TryAcquireMemory(group.total_bytes)) {
+      // Add all blocks from this group to dispatch
+      for (size_t idx : group.block_indices) {
+        blocks_to_dispatch.push_back(idx);
+      }
+    } else {
+      // Queue this group for later
+      groups_to_queue.push_back(std::move(group));
+    }
+  }
+
+  // Dispatch acquired blocks immediately
+  if (!blocks_to_dispatch.empty()) {
+    DispatchPrefetch(rs, job, blocks_to_dispatch);
+  }
+
+  // Queue remaining groups for later (only applies when memory limiting)
+  if (!groups_to_queue.empty()) {
+    auto pending = std::make_shared<PendingPrefetchRequest>();
+    pending->read_set = rs;
+    pending->job = job;
+
+    size_t pending_bytes = 0;
+    for (const auto& group : groups_to_queue) {
+      for (size_t idx : group.block_indices) {
+        pending->block_indices_to_prefetch.insert(idx);
+      }
+      pending_bytes += group.total_bytes;
+    }
+    pending->coalesced_groups = std::move(groups_to_queue);
+    pending->pending_bytes_ = pending_bytes;
+
+    // Set up pending flags for queued blocks only
+    size_t num_blocks = job->block_handles.size();
+    rs->pending_prefetch_flags_ =
+        std::make_unique<std::atomic<bool>[]>(num_blocks);
+    rs->pending_prefetch_flags_size_ = num_blocks;
+    for (size_t idx : pending->block_indices_to_prefetch) {
+      rs->pending_prefetch_flags_[idx].store(true);
+    }
+    rs->pending_request_ = pending;
+
+    {
+      MutexLock lock(&memory_mutex_);
+      pending_prefetch_queue_.push_back(std::move(pending));
+      has_pending_requests_.store(true, std::memory_order_release);
+    }
+  }
+
+  *read_set = std::move(rs);
+  return Status::OK();
+}
+
+void IODispatcherImpl::Impl::PrepareIORequests(
+    const std::shared_ptr<IOJob>& job,
+    const std::vector<size_t>& block_indices_to_read,
+    const std::vector<BlockHandle>& block_handles,
+    std::vector<FSReadRequest>* read_reqs,
+    std::vector<std::vector<size_t>>* coalesced_block_indices) {
+  // This is necessary because block handles may not be in sorted order
+  std::vector<size_t> sorted_block_indices = block_indices_to_read;
+  std::sort(sorted_block_indices.begin(), sorted_block_indices.end(),
+            [&block_handles](size_t a, size_t b) {
+              return block_handles[a].offset() < block_handles[b].offset();
+            });
+
+  assert(coalesced_block_indices->empty());
+  coalesced_block_indices->resize(1);
+
+  for (const auto& block_idx : sorted_block_indices) {
+    if (!coalesced_block_indices->back().empty()) {
+      // Check if we can coalesce with previous block
+      const auto& last_block_handle =
+          block_handles[coalesced_block_indices->back().back()];
+      uint64_t last_block_end =
+          last_block_handle.offset() +
+          BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
+      uint64_t current_start = block_handles[block_idx].offset();
+
+      if (current_start >
+          last_block_end + job->job_options.io_coalesce_threshold) {
+        // Gap too large - start new IO request
+        coalesced_block_indices->emplace_back();
+      }
+    }
+    coalesced_block_indices->back().emplace_back(block_idx);
+  }
+
+  // Create FSReadRequest for each coalesced group
+  assert(read_reqs->empty());
+  read_reqs->reserve(coalesced_block_indices->size());
+
+  for (const auto& block_indices : *coalesced_block_indices) {
+    assert(!block_indices.empty());
+
+    // Find the min and max offsets in this coalesced group
+    // Since blocks are now sorted, first has min offset and last has max
+    const auto& first_block_handle = block_handles[block_indices[0]];
+    const auto& last_block_handle = block_handles[block_indices.back()];
+
+    const auto start_offset = first_block_handle.offset();
+    const auto end_offset =
+        last_block_handle.offset() +
+        BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
+
+    assert(end_offset > start_offset);
+
+    read_reqs->emplace_back();
+    read_reqs->back().offset = start_offset;
+    read_reqs->back().len = end_offset - start_offset;
+    read_reqs->back().scratch = nullptr;
+  }
+}
+
+std::vector<CoalescedPrefetchGroup> IODispatcherImpl::Impl::PreCoalesceBlocks(
+    const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& rs,
+    const std::vector<size_t>& block_indices, size_t max_group_bytes) {
+  std::vector<CoalescedPrefetchGroup> groups;
+
+  if (block_indices.empty()) {
+    return groups;
+  }
+
+  const auto& block_handles = job->block_handles;
+  const uint64_t coalesce_threshold = job->job_options.io_coalesce_threshold;
+
+  // Sort block indices by offset for coalescing
+  std::vector<size_t> sorted_indices = block_indices;
+  std::sort(sorted_indices.begin(), sorted_indices.end(),
+            [&block_handles](size_t a, size_t b) {
+              return block_handles[a].offset() < block_handles[b].offset();
+            });
+
+  // Build coalesced groups respecting max_group_bytes
+  groups.emplace_back();
+
+  for (size_t idx : sorted_indices) {
+    size_t block_size = rs->block_sizes_[idx];
+
+    // Skip blocks that are individually larger than the memory budget
+    // These will be read synchronously when needed (via ReadIndex fallback)
+    if (max_group_bytes > 0 && block_size > max_group_bytes) {
+      continue;
+    }
+
+    // Check if we need to start a new group
+    bool start_new_group = false;
+
+    if (!groups.back().block_indices.empty()) {
+      // Check gap with previous block
+      size_t last_idx = groups.back().block_indices.back();
+      const auto& last_handle = block_handles[last_idx];
+      uint64_t last_end = last_handle.offset() +
+                          BlockBasedTable::BlockSizeWithTrailer(last_handle);
+      uint64_t current_start = block_handles[idx].offset();
+
+      if (current_start > last_end + coalesce_threshold) {
+        start_new_group = true;  // Gap too large
+      } else if (max_group_bytes > 0 &&
+                 groups.back().total_bytes + block_size > max_group_bytes) {
+        start_new_group = true;  // Would exceed size limit
+      }
+    }
+
+    if (start_new_group) {
+      groups.emplace_back();
+    }
+
+    groups.back().block_indices.push_back(idx);
+    groups.back().total_bytes += block_size;
+  }
+
+  return groups;
+}
+
+std::vector<size_t> IODispatcherImpl::Impl::ExecuteAsyncIO(
+    const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& read_set,
+    std::vector<FSReadRequest>& read_reqs,
+    const std::vector<std::vector<size_t>>& coalesced_block_indices,
+    Status* out_status) {
+  std::vector<size_t> fallback_block_indices;
+  *out_status = Status::OK();
+
+  // Get file and IO options
+  auto* rep = job->table->get_rep();
+  IOOptions io_opts;
+  Status s =
+      rep->file->PrepareIOOptions(job->job_options.read_options, io_opts);
+  if (!s.ok()) {
+    *out_status = s;
+    return fallback_block_indices;
+  }
+
+  const bool direct_io = rep->file->use_direct_io();
+
+  // Submit async read requests and store them in the ReadSet
+  for (size_t i = 0; i < read_reqs.size(); ++i) {
+    auto async_state = std::make_shared<AsyncIOState>();
+
+    async_state->offset = read_reqs[i].offset;
+    async_state->block_indices = coalesced_block_indices[i];
+    async_state->read_req = std::move(read_reqs[i]);
+
+    for (const auto idx : coalesced_block_indices[i]) {
+      async_state->blocks.emplace_back(job->block_handles[idx]);
+    }
+
+    if (direct_io) {
+      async_state->read_req.scratch = nullptr;
+    } else {
+      async_state->buf.reset(new char[async_state->read_req.len]);
+      async_state->read_req.scratch = async_state->buf.get();
+    }
+
+    // Callback for async read completion
+    // Store the result slice and status back into async_state so we can access
+    // them after Poll() completes.
+    auto cb = [](const FSReadRequest& req, void* cb_arg) {
+      auto* state = static_cast<AsyncIOState*>(cb_arg);
+      state->read_req.result = req.result;
+      state->read_req.status = req.status;
+    };
+
+    s = rep->file->ReadAsync(async_state->read_req, io_opts, cb,
+                             async_state.get(), &async_state->io_handle,
+                             &async_state->del_fn,
+                             direct_io ? &async_state->aligned_buf : nullptr);
+
+    if (!s.ok()) {
+      // Actual error - surface to caller
+      *out_status = s;
+      return fallback_block_indices;
+    }
+
+    if (async_state->io_handle == nullptr) {
+      // Async IO not supported - add to fallback list for sync IO
+      for (const auto idx : coalesced_block_indices[i]) {
+        fallback_block_indices.push_back(idx);
+      }
+      continue;
+    }
+
+    // Add async state to map for all blocks in this request
+    for (const auto idx : async_state->block_indices) {
+      read_set->async_io_map_[idx] = async_state;
+    }
+  }
+
+  return fallback_block_indices;
+}
+
+Status IODispatcherImpl::Impl::ExecuteSyncIO(
+    const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& read_set,
+    std::vector<FSReadRequest>& read_reqs,
+    const std::vector<std::vector<size_t>>& coalesced_block_indices) {
+  // Get file and IO options
+  auto* rep = job->table->get_rep();
+  IOOptions io_opts;
+  if (Status s =
+          rep->file->PrepareIOOptions(job->job_options.read_options, io_opts);
+      !s.ok()) {
+    return s;
+  }
+
+  const bool direct_io = rep->file->use_direct_io();
+
+  // Setup scratch buffers for MultiRead
+  std::unique_ptr<char[]> buf;
+
+  if (direct_io) {
+    for (auto& read_req : read_reqs) {
+      read_req.scratch = nullptr;
+    }
+  } else {
+    // Allocate a single contiguous buffer for all requests
+    size_t total_len = 0;
+    for (const auto& req : read_reqs) {
+      total_len += req.len;
+    }
+    buf.reset(new char[total_len]);
+    size_t offset = 0;
+    for (auto& read_req : read_reqs) {
+      read_req.scratch = buf.get() + offset;
+      offset += read_req.len;
+    }
+  }
+
+  // Execute MultiRead
+  AlignedBuf aligned_buf;
+  if (Status s =
+          rep->file->MultiRead(io_opts, read_reqs.data(), read_reqs.size(),
+                               direct_io ? &aligned_buf : nullptr);
+      !s.ok()) {
+    return s;
+  }
+
+  for (const auto& rq : read_reqs) {
+    if (!rq.status.ok()) {
+      return rq.status;
+    }
+  }
+
+  // Process all blocks from the MultiRead results
+  for (size_t i = 0; i < coalesced_block_indices.size(); ++i) {
+    const auto& read_req = read_reqs[i];
+    for (const auto& block_idx : coalesced_block_indices[i]) {
+      const auto& block_handle = job->block_handles[block_idx];
+
+      Status create_status = CreateAndPinBlockFromBuffer(
+          job, block_handle, read_req.offset, read_req.result,
+          read_set->pinned_blocks_[block_idx]);
+      if (!create_status.ok()) {
+        return create_status;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+IODispatcherImpl::IODispatcherImpl()
+    : impl_(std::make_shared<Impl>(IODispatcherOptions())) {}
+
+IODispatcherImpl::IODispatcherImpl(const IODispatcherOptions& options)
+    : impl_(std::make_shared<Impl>(options)) {}
+
+IODispatcherImpl::~IODispatcherImpl() = default;
+
+Status IODispatcherImpl::SubmitJob(const std::shared_ptr<IOJob>& job,
+                                   std::shared_ptr<ReadSet>* read_set) {
+  return impl_->SubmitJob(job, read_set);
+}
+
+IODispatcher* NewIODispatcher() { return new IODispatcherImpl(); }
+
+IODispatcher* NewIODispatcher(const IODispatcherOptions& options) {
+  return new IODispatcherImpl(options);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/io_dispatcher_imp.h b/util/io_dispatcher_imp.h
new file mode 100644
index 000000000000..c4e52b86d546
--- /dev/null
+++ b/util/io_dispatcher_imp.h
@@ -0,0 +1,36 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/io_dispatcher.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IODispatcherImpl : public IODispatcher {
+ public:
+  IODispatcherImpl();
+  explicit IODispatcherImpl(const IODispatcherOptions& options);
+  ~IODispatcherImpl() override;
+
+  Status SubmitJob(const std::shared_ptr<IOJob>& job,
+                   std::shared_ptr<ReadSet>* read_set) override;
+
+ private:
+  struct Impl;
+  std::shared_ptr<Impl> impl_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/io_dispatcher_test.cc b/util/io_dispatcher_test.cc
new file mode 100644
index 000000000000..89624ac5bcd4
--- /dev/null
+++ b/util/io_dispatcher_test.cc
@@ -0,0 +1,1800 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/io_dispatcher.h"
+
+#include <memory>
+#include <mutex>
+#include <thread>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "test_util/sync_point.h"
+
+// Enable io_uring support for this test
+extern "C" bool RocksDbIOUringEnable() { return true; }
+
+// Check if io_uring is available at compile time
+#ifdef ROCKSDB_IOURING_PRESENT
+static constexpr bool kIOUringPresent = true;
+#else
+static constexpr bool kIOUringPresent = false;
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+// Represents a single read operation recorded by the tracking file system
+struct ReadOp {
+  enum Type { kMultiRead, kReadAsync };
+  Type type;
+  // For MultiRead: contains all (offset, len) pairs in the request
+  // For ReadAsync: contains a single (offset, len) pair
+  std::vector<std::pair<uint64_t, size_t>> requests;
+};
+
+// Forward declaration
+class ReadTrackingFS;
+
+// Wrapper around FSRandomAccessFile that tracks read operations
+class ReadTrackingRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+  ReadTrackingRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& file,
+                               ReadTrackingFS* fs)
+      : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {}
+
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     IODebugContext* dbg) override;
+
+ private:
+  ReadTrackingFS* fs_;
+};
+
+// FileSystem wrapper that tracks all read operations for verification
+class ReadTrackingFS : public FileSystemWrapper {
+ public:
+  explicit ReadTrackingFS(const std::shared_ptr<FileSystem>& target)
+      : FileSystemWrapper(target) {}
+
+  static const char* kClassName() { return "ReadTrackingFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    std::unique_ptr<FSRandomAccessFile> file;
+    IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
+    if (s.ok()) {
+      result->reset(new ReadTrackingRandomAccessFile(std::move(file), this));
+    }
+    return s;
+  }
+
+  // Record a MultiRead operation
+  void RecordMultiRead(const std::vector<std::pair<uint64_t, size_t>>& reqs) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    ReadOp op;
+    op.type = ReadOp::kMultiRead;
+    op.requests = reqs;
+    read_ops_.push_back(std::move(op));
+  }
+
+  // Record a ReadAsync operation
+  void RecordReadAsync(uint64_t offset, size_t len) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    ReadOp op;
+    op.type = ReadOp::kReadAsync;
+    op.requests.push_back({offset, len});
+    read_ops_.push_back(std::move(op));
+  }
+
+  // Get all recorded read operations
+  std::vector<ReadOp> GetReadOps() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return read_ops_;
+  }
+
+  // Clear recorded read operations
+  void ClearReadOps() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    read_ops_.clear();
+  }
+
+  // Get count of MultiRead operations
+  size_t GetMultiReadCount() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    size_t count = 0;
+    for (const auto& op : read_ops_) {
+      if (op.type == ReadOp::kMultiRead) {
+        count++;
+      }
+    }
+    return count;
+  }
+
+  // Get count of ReadAsync operations
+  size_t GetReadAsyncCount() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    size_t count = 0;
+    for (const auto& op : read_ops_) {
+      if (op.type == ReadOp::kReadAsync) {
+        count++;
+      }
+    }
+    return count;
+  }
+
+ private:
+  mutable std::mutex mutex_;
+  std::vector<ReadOp> read_ops_;
+};
+
+IOStatus ReadTrackingRandomAccessFile::MultiRead(FSReadRequest* reqs,
+                                                 size_t num_reqs,
+                                                 const IOOptions& options,
+                                                 IODebugContext* dbg) {
+  // Record the read operation before executing it
+  std::vector<std::pair<uint64_t, size_t>> recorded_reqs;
+  recorded_reqs.reserve(num_reqs);
+  for (size_t i = 0; i < num_reqs; i++) {
+    recorded_reqs.push_back({reqs[i].offset, reqs[i].len});
+  }
+  fs_->RecordMultiRead(recorded_reqs);
+
+  // Delegate to underlying file
+  return target()->MultiRead(reqs, num_reqs, options, dbg);
+}
+
+IOStatus ReadTrackingRandomAccessFile::ReadAsync(
+    FSReadRequest& req, const IOOptions& opts,
+    std::function<void(FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
+  // Record the read operation before executing it
+  fs_->RecordReadAsync(req.offset, req.len);
+
+  // Delegate to underlying file
+  return target()->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, dbg);
+}
+
+class IODispatcherTest : public DBTestBase {
+ public:
+  IODispatcherTest()
+      : DBTestBase("io_dispatcher_test", /*env_do_fsync=*/false) {}
+
+  ~IODispatcherTest() override {
+    // Close any open tables
+    for (auto& table : tables_) {
+      table.reset();
+    }
+    tables_.clear();
+  }
+
+  // Helper to collect block handles from a table
+  // We use TEST_GetDataBlockHandle to get handles for specific keys
+  // Since we know the keys we inserted, we can collect their block handles
+  Status CollectBlockHandles(BlockBasedTable* table, size_t num_keys,
+                             std::vector<BlockHandle>* block_handles_out) {
+    block_handles_out->clear();
+
+    ReadOptions read_options;
+    std::unordered_set<uint64_t> seen_offsets;
+
+    // Iterate through all keys and get their block handles
+    // We collect unique block handles (same block might contain multiple keys)
+    IndexBlockIter iiter_on_stack;
+    BlockCacheLookupContext context{TableReaderCaller::kUserVerifyChecksum};
+    auto iiter = table->NewIndexIterator(read_options, false, &iiter_on_stack,
+                                         nullptr, &context);
+    std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+    if (iiter != &iiter_on_stack) {
+      iiter_unique_ptr.reset(iiter);
+    }
+
+    // Position the iterator at the first entry
+    iiter->SeekToFirst();
+
+    while (iiter->Valid()) {
+      auto handle = iiter->value().handle;
+      if (seen_offsets.find(handle.offset()) == seen_offsets.end()) {
+        block_handles_out->push_back(handle);
+        seen_offsets.insert(handle.offset());
+        if (block_handles_out->size() >= num_keys) {
+          break;
+        }
+      }
+      iiter->Next();
+    }
+
+    return Status::OK();
+  }
+
+  std::string test_dir_{};
+  Env* env_{};
+  std::shared_ptr<FileSystem> base_fs_;
+  std::shared_ptr<ReadTrackingFS> tracking_fs_;
+
+  std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+
+  void SetUp() override {
+    SetupSyncPointsToMockDirectIO();
+    test_dir_ = test::PerThreadDBPath("block_based_table_reader_test");
+    env_ = Env::Default();
+    base_fs_ = FileSystem::Default();
+    tracking_fs_ = std::make_shared<ReadTrackingFS>(base_fs_);
+    ASSERT_OK(base_fs_->CreateDir(test_dir_, IOOptions(), nullptr));
+  }
+
+  void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
+
+  void NewFileWriter(const std::string& filename,
+                     std::unique_ptr<WritableFileWriter>* writer) {
+    std::string path = Path(filename);
+    EnvOptions env_options;
+    FileOptions foptions;
+    std::unique_ptr<FSWritableFile> file;
+    ASSERT_OK(base_fs_->NewWritableFile(path, foptions, &file, nullptr));
+    writer->reset(new WritableFileWriter(std::move(file), path, env_options));
+  }
+
+  void NewFileReader(const std::string& filename, const FileOptions& opt,
+                     std::unique_ptr<RandomAccessFileReader>* reader,
+                     Statistics* stats = nullptr) {
+    std::string path = Path(filename);
+    std::unique_ptr<FSRandomAccessFile> f;
+    // Use tracking_fs_ to record read operations
+    ASSERT_OK(tracking_fs_->NewRandomAccessFile(path, opt, &f, nullptr));
+    reader->reset(new RandomAccessFileReader(std::move(f), path,
+                                             env_->GetSystemClock().get(),
+                                             /*io_tracer=*/nullptr,
+                                             /*stats=*/stats));
+  }
+
+  std::vector<std::shared_ptr<Statistics>> all_stats_;
+  std::vector<std::unique_ptr<BlockBasedTable>> tables_;
+
+  // Options must be stored as member variables to avoid use-after-scope
+  // The BlockBasedTable keeps references to these options
+  std::vector<std::unique_ptr<ImmutableOptions>> all_ioptions_;
+  std::vector<std::unique_ptr<EnvOptions>> all_env_options_;
+
+  // Helper to create an SST file and open it as a table
+  // Following pattern from table_test.cc TableConstructor
+  Status CreateAndOpenSST(int num_blocks,
+                          std::unique_ptr<BlockBasedTable>* table,
+                          std::vector<BlockHandle>* block_handles_out) {
+    // Create options - store in member variables to avoid use-after-scope
+    // The BlockBasedTable will keep references to these options
+    Options options{};
+    options.statistics = nullptr;
+    BlockBasedTableOptions table_options;
+    table_options.block_cache = NewLRUCache(8 * 1024 * 1024);
+    table_options.block_size = 16 * 1024;
+    table_options.no_block_cache = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    // Store these in member variables so they outlive the function
+    auto ioptions = std::make_unique<ImmutableOptions>(options);
+    auto moptions = MutableCFOptions{options};
+    InternalKeyComparator internal_comparator(options.comparator);
+
+    // Create in-memory file using StringSink (like table_test.cc)
+    auto table_name = "test_table";
+    std::unique_ptr<WritableFileWriter> file_writer;
+    NewFileWriter(table_name, &file_writer);
+
+    // Create table builder
+    std::string column_family_name;
+    const ReadOptions read_options;
+    const WriteOptions write_options;
+    std::vector<std::unique_ptr<InternalTblPropCollFactory>>
+        int_tbl_prop_coll_factories;
+    TableBuilderOptions builder_options(
+        *ioptions, moptions, read_options, write_options, internal_comparator,
+        &int_tbl_prop_coll_factories, kNoCompression, options.compression_opts,
+        0 /* column_family_id */, column_family_name, -1 /* level */,
+        kUnknownNewestKeyTime);
+
+    std::unique_ptr<TableBuilder> builder(
+        options.table_factory->NewTableBuilder(builder_options,
+                                               file_writer.get()));
+
+    Status s;
+    auto rnd = Random::GetTLSInstance();
+    // Add keys to the table
+    // 10k * 1Kib = ~10MiB
+    for (int i = 0; i < 10000; i++) {
+      std::string value = rnd->RandomString(2 << 10);
+      InternalKey ikey(Key(i), i, kTypeValue);
+      builder->Add(ikey.Encode(), value);
+    }
+    s = builder->Finish();
+    if (!s.ok()) {
+      return s;
+    }
+
+    uint64_t file_size = builder->FileSize();
+
+    IOOptions io_options;
+    s = file_writer->Flush(io_options);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // Now open the file for reading using StringSource (like table_test.cc)
+    std::unique_ptr<RandomAccessFileReader> file;
+    FileOptions foptions;
+    foptions.use_direct_reads = false;
+
+    NewFileReader(table_name, foptions, &file, nullptr);
+
+    // Store EnvOptions and InternalKeyComparator to avoid use-after-scope
+    auto soptions = std::make_unique<EnvOptions>();
+    BlockCacheTracer block_cache_tracer;
+    std::unique_ptr<TableReader> table_reader;
+
+    auto ikc = InternalKeyComparator(options.comparator);
+    TableReaderOptions reader_options(*ioptions, moptions.prefix_extractor,
+                                      moptions.compression_manager.get(),
+                                      *soptions, ikc,
+                                      0 /* block_protection_bytes_per_key */);
+
+    s = options.table_factory->NewTableReader(reader_options, std::move(file),
+                                              file_size, &table_reader);
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    table->reset(static_cast<BlockBasedTable*>(table_reader.release()));
+
+    // Collect actual block handles from the table's index
+    // This is similar to how block_based_table_iterator.cc CollectBlockHandles
+    // works
+    s = CollectBlockHandles(table->get(), num_blocks, block_handles_out);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // Store all options in member variables to keep them alive
+    all_ioptions_.push_back(std::move(ioptions));
+    all_env_options_.push_back(std::move(soptions));
+
+    return Status::OK();
+  }
+
+  static uint64_t cur_file_num_;
+};
+
+uint64_t IODispatcherTest::cur_file_num_ = 1;
+
+TEST_F(IODispatcherTest, BasicSSTRead) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GT(block_handles.size(), 0);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  ReadOptions read_options;
+  // Only use async IO when io_uring is available
+  job->job_options.read_options.async_io = kIOUringPresent;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read blocks using the new ReadSet API and verify they are valid
+  // ReadIndex will poll for async IO completion internally, no need to sleep
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+
+    // Verify the block has reasonable content
+    const Block* block_ptr = block.GetValue();
+    ASSERT_GT(block_ptr->size(), 0);
+  }
+
+  // Verify statistics - some blocks should have been read asynchronously
+  // Note: actual counts depend on cache behavior and IO completion
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  ASSERT_EQ(total_reads, block_handles.size());
+}
+
+TEST_F(IODispatcherTest, MultipleSSTFiles) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::vector<std::shared_ptr<ReadSet>> read_sets;
+  std::vector<std::vector<BlockHandle>> all_block_handles;
+
+  // Create and submit jobs for multiple SST files
+  for (int i = 0; i < 3; i++) {
+    std::unique_ptr<BlockBasedTable> table;
+    std::vector<BlockHandle> block_handles;
+
+    Status s = CreateAndOpenSST(30 + i * 10, &table, &block_handles);
+    ASSERT_OK(s);
+
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    tables_.push_back(std::move(table));
+
+    all_block_handles.push_back(block_handles);
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    read_sets.push_back(read_set);
+  }
+
+  // Verify all ReadSets can read their blocks successfully
+  // ReadIndex will poll for async IO completion internally, no need to sleep
+  for (size_t i = 0; i < read_sets.size(); ++i) {
+    for (size_t j = 0; j < all_block_handles[i].size(); ++j) {
+      CachableEntry<Block> block;
+      Status read_status = read_sets[i]->ReadIndex(j, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+  }
+}
+
+TEST_F(IODispatcherTest, StatisticsTracking) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GT(block_handles.size(), 0);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  // Only use async IO when io_uring is available
+  job->job_options.read_options.async_io = kIOUringPresent;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read all blocks - ReadIndex handles polling for async IO completion
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+
+  // Read the same blocks again - should all be cache hits now
+  std::shared_ptr<ReadSet> read_set2;
+  s = dispatcher->SubmitJob(job, &read_set2);
+  ASSERT_OK(s);
+
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set2->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+
+  // After reading all blocks, verify statistics
+  uint64_t num_sync = read_set->GetNumSyncReads();
+  uint64_t num_async = read_set->GetNumAsyncReads();
+  uint64_t num_cache = read_set->GetNumCacheHits();
+
+  // Total reads should equal number of blocks
+  uint64_t total_reads = num_sync + num_async + num_cache;
+  ASSERT_EQ(total_reads, block_handles.size());
+}
+TEST_F(IODispatcherTest, AsyncAndSyncRead) {
+  // This test verifies the difference between async_io=true and async_io=false
+  // by checking the statistics after reading all blocks.
+  // Only test async_io=true when io_uring is available.
+  std::vector<bool> async_modes = {false};
+  if (kIOUringPresent) {
+    async_modes.push_back(true);
+  }
+
+  for (auto async : async_modes) {
+    std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+    std::unique_ptr<BlockBasedTable> table;
+    std::vector<BlockHandle> block_handles;
+    Status s = CreateAndOpenSST(40, &table, &block_handles);
+    ASSERT_OK(s);
+    ASSERT_NE(table, nullptr);
+    ASSERT_GT(block_handles.size(), 0);
+
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    ReadOptions read_options;
+    // Ensure we don't use cache for this test - we want fresh reads
+    read_options.fill_cache = false;
+    job->job_options.read_options.async_io = async;
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    ASSERT_NE(read_set, nullptr);
+
+    // Read all blocks - ReadIndex handles polling for async IO internally
+    for (size_t i = 0; i < block_handles.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+
+      // Verify the block has reasonable content
+      const Block* block_ptr = block.GetValue();
+      ASSERT_GT(block_ptr->size(), 0);
+    }
+
+    // Verify statistics
+    uint64_t num_sync = read_set->GetNumSyncReads();
+    uint64_t num_async = read_set->GetNumAsyncReads();
+    uint64_t num_cache = read_set->GetNumCacheHits();
+
+    // Total reads should equal number of blocks
+    uint64_t total_reads = num_sync + num_async + num_cache;
+    EXPECT_EQ(total_reads, block_handles.size());
+
+    // When async_io is false, we always expect sync reads
+    if (!async) {
+      EXPECT_GT(num_sync, 0) << "Expected sync reads when async_io=false";
+      EXPECT_EQ(num_async, 0) << "Expected no async reads when async_io=false";
+    }
+    // When async_io is true:
+    // - If io_uring is available, we expect async reads
+    // - If io_uring is NOT available, ReadAsync returns NotSupported and
+    //   we fall back to sync reads. This is valid behavior.
+    // So we only verify that ALL blocks were read (checked above).
+  }
+}
+
+TEST_F(IODispatcherTest, VerifyBlockContent) {
+  // Test that blocks retrieved through ReadSet contain the correct data
+  // that was written to the SST file
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GT(block_handles.size(), 0);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  ReadOptions read_options;
+  job->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read each block and verify its content
+  int t = 0;
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block_entry;
+    Status read_status = read_set->ReadIndex(i, &block_entry);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block_entry.GetValue(), nullptr);
+
+    Block* block = block_entry.GetValue();
+    ASSERT_GT(block->size(), 0);
+
+    // Create an iterator to walk through the block's keys
+    // We use InternalKeyComparator for data blocks
+    InternalKeyComparator internal_comparator(BytewiseComparator());
+    std::unique_ptr<DataBlockIter> iter(block->NewDataIterator(
+        internal_comparator.user_comparator(), kDisableGlobalSequenceNumber));
+
+    // Iterate through all keys in this block
+    size_t num_keys_in_block = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      num_keys_in_block++;
+
+      // Verify key is not empty
+      ASSERT_GT(iter->key().size(), 0)
+          << "Block " << i << " contains empty key";
+
+      // Verify value is not empty (we wrote 1KB values)
+      ASSERT_GT(iter->value().size(), 2 ^ 10)
+          << "Block " << i << " contains empty value";
+
+      // Parse the internal key
+      ParsedInternalKey parsed_key;
+      Status parse_status =
+          ParseInternalKey(iter->key(), &parsed_key, true /* log_err */);
+      ASSERT_OK(parse_status) << "Failed to parse internal key in block " << i;
+
+      // Verify the key matches the expected format from CreateAndOpenSST
+      // Keys are created with Key(i) which generates keys like "key000000"
+      std::string user_key = parsed_key.user_key.ToString();
+      auto check = Key(t);
+      t++;
+      ASSERT_TRUE(user_key.find("key") == 0)
+          << "Unexpected key format in block " << i << ": " << user_key;
+
+      ASSERT_EQ(check.c_str(), user_key);
+
+      // Verify value type is correct (should be kTypeValue)
+      ASSERT_EQ(parsed_key.type, kTypeValue)
+          << "Unexpected value type in block " << i;
+    }
+
+    // Verify iterator status after iteration
+    ASSERT_OK(iter->status()) << "Iterator error in block " << i;
+
+    // Each block should contain at least one key
+    ASSERT_GT(num_keys_in_block, 0) << "Block " << i << " contains no keys";
+  }
+}
+
+// We want to test here that even when we DONT read from the readset that all
+// pinned blocks will be unpinned.
+TEST_F(IODispatcherTest, ReadSetDestroysUnpinsBlocks) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_EQ(block_handles.size(), 30);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  ReadOptions read_options;
+  job->job_options.read_options.async_io =
+      false;  // Use sync IO so blocks are pinned immediately
+
+  auto* rep = table->get_rep();
+  auto cache = rep->table_options.block_cache.get();
+  ASSERT_NE(cache, nullptr);
+
+  auto initial_pinned_usage = cache->GetPinnedUsage();
+  ASSERT_EQ(initial_pinned_usage, 0);
+
+  {
+    std::shared_ptr<ReadSet> read_set;
+    Status t = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(t);
+    ASSERT_NE(read_set, nullptr);
+
+    // With sync IO, blocks are already pinned in read_set->pinned_blocks_
+    // We do NOT call read_set->Read() - blocks should remain in pinned_blocks_
+
+    // At this point, blocks should be pinned in the ReadSet
+    auto pinned_usage_with_blocks = cache->GetPinnedUsage();
+    ASSERT_GT(pinned_usage_with_blocks, initial_pinned_usage)
+        << "Expected pinned usage to increase after SubmitJob, but "
+        << "initial=" << initial_pinned_usage
+        << " current=" << pinned_usage_with_blocks;
+
+    // ReadSet goes out of scope here, its destructor should unpin all blocks
+  }
+
+  // ReadSet destroyed - all blocks should be unpinned
+  auto final_pinned_usage = cache->GetPinnedUsage();
+  ASSERT_EQ(final_pinned_usage, initial_pinned_usage)
+      << "Expected pinned usage to return to initial value after ReadSet "
+      << "destruction, but initial=" << initial_pinned_usage
+      << " final=" << final_pinned_usage;
+}
+
+// Test that verifies the coalescing logic: adjacent blocks within the
+// coalesce threshold should be combined into a single read request.
+TEST_F(IODispatcherTest, VerifyCoalescing) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  // Get many blocks so we can test coalescing behavior
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GE(block_handles.size(), 20);
+
+  tracking_fs_->ClearReadOps();
+
+  // Test coalescing with sync reads (uses MultiRead)
+  {
+    auto job = std::make_shared<IOJob>();
+    // Use a subset of adjacent blocks
+    std::vector<BlockHandle> adjacent_blocks;
+    for (size_t i = 0; i < 10 && i < block_handles.size(); ++i) {
+      adjacent_blocks.push_back(block_handles[i]);
+    }
+    job->block_handles = adjacent_blocks;
+    job->table = table.get();
+    job->job_options.read_options.async_io = false;
+    // Set a large coalesce threshold so all adjacent blocks are combined
+    job->job_options.io_coalesce_threshold = 1024 * 1024;  // 1MB
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+
+    for (size_t i = 0; i < adjacent_blocks.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+
+    // With a large coalesce threshold and adjacent blocks, we expect
+    // all blocks to be coalesced into a single MultiRead request
+    auto read_ops = tracking_fs_->GetReadOps();
+    size_t multiread_count = 0;
+    size_t total_requests_in_multireads = 0;
+    for (const auto& op : read_ops) {
+      if (op.type == ReadOp::kMultiRead) {
+        multiread_count++;
+        total_requests_in_multireads += op.requests.size();
+      }
+    }
+
+    // Adjacent blocks should be coalesced into a single read request
+    // (assuming they're within the coalesce threshold)
+    EXPECT_EQ(multiread_count, 1)
+        << "Expected 1 MultiRead call with coalesced blocks";
+    EXPECT_EQ(total_requests_in_multireads, 1)
+        << "Expected all adjacent blocks to be coalesced into 1 request";
+  }
+
+  tracking_fs_->ClearReadOps();
+
+  // Test with zero coalesce threshold and non-adjacent blocks
+  // Non-adjacent blocks (with gaps) should NOT be coalesced with threshold=0
+  {
+    // Create new table to avoid cache hits
+    std::unique_ptr<BlockBasedTable> table2;
+    std::vector<BlockHandle> block_handles2;
+    s = CreateAndOpenSST(50, &table2, &block_handles2);
+    ASSERT_OK(s);
+    ASSERT_GE(block_handles2.size(), 20);
+
+    tracking_fs_->ClearReadOps();
+
+    auto job = std::make_shared<IOJob>();
+    // Skip every other block to create gaps between requested blocks
+    // This ensures there are gaps that won't be bridged with threshold=0
+    std::vector<BlockHandle> non_adjacent_blocks;
+    for (size_t i = 0;
+         i < block_handles2.size() && non_adjacent_blocks.size() < 5; i += 2) {
+      non_adjacent_blocks.push_back(block_handles2[i]);
+    }
+    job->block_handles = non_adjacent_blocks;
+    job->table = table2.get();
+    job->job_options.read_options.async_io = false;
+    // Set zero coalesce threshold - blocks with gaps should not be coalesced
+    job->job_options.io_coalesce_threshold = 0;
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+
+    for (size_t i = 0; i < non_adjacent_blocks.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+
+    // With zero coalesce threshold and non-adjacent blocks (with gaps),
+    // each block should be a separate request
+    auto read_ops = tracking_fs_->GetReadOps();
+    size_t total_requests_in_multireads = 0;
+    for (const auto& op : read_ops) {
+      if (op.type == ReadOp::kMultiRead) {
+        total_requests_in_multireads += op.requests.size();
+      }
+    }
+
+    // Each non-adjacent block should be a separate request since there are
+    // gaps between them and threshold=0 means no gap tolerance
+    EXPECT_EQ(total_requests_in_multireads, non_adjacent_blocks.size())
+        << "Expected each non-adjacent block to be a separate request with "
+           "zero coalesce threshold";
+  }
+}
+
+// Test that verifies the read request offsets and lengths match the
+// expected block handles.
+TEST_F(IODispatcherTest, VerifyReadRequestDetails) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GE(block_handles.size(), 5);
+
+  tracking_fs_->ClearReadOps();
+
+  // Use just a few non-adjacent blocks to avoid coalescing
+  std::vector<BlockHandle> test_blocks;
+  // Pick every other block to ensure they're not adjacent
+  for (size_t i = 0; i < block_handles.size(); i += 2) {
+    test_blocks.push_back(block_handles[i]);
+  }
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = test_blocks;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;
+  // Small coalesce threshold to minimize coalescing for this test
+  job->job_options.io_coalesce_threshold = 0;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+
+  for (size_t i = 0; i < test_blocks.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+  }
+
+  // Verify the read requests match the block handles
+  auto read_ops = tracking_fs_->GetReadOps();
+  std::unordered_set<uint64_t> expected_offsets;
+  for (const auto& handle : test_blocks) {
+    expected_offsets.insert(handle.offset());
+  }
+
+  std::unordered_set<uint64_t> actual_offsets;
+  for (const auto& op : read_ops) {
+    if (op.type == ReadOp::kMultiRead) {
+      for (const auto& req : op.requests) {
+        actual_offsets.insert(req.first);
+      }
+    }
+  }
+
+  // Verify all expected offsets were read
+  for (const auto& expected : expected_offsets) {
+    EXPECT_TRUE(actual_offsets.count(expected) > 0)
+        << "Expected read at offset " << expected << " but it was not found";
+  }
+}
+
+// Test that memory limiting blocks when the limit is exceeded
+TEST_F(IODispatcherTest, MemoryLimitBlocksWhenExceeded) {
+  // Create dispatcher with a small memory limit (1MB)
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 1 * 1024 * 1024;  // 1MB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GT(block_handles.size(), 0);
+
+  // Submit a job - should succeed immediately (non-blocking)
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read all blocks - they may be read synchronously if prefetch was deferred
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that SubmitJob never blocks even when memory is exhausted
+TEST_F(IODispatcherTest, SubmitJobNeverBlocks) {
+  // Create dispatcher with a tiny memory limit
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 1024;  // 1KB - very small
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GT(block_handles.size(), 0);
+
+  // Submit first job - uses up all memory
+  auto job1 = std::make_shared<IOJob>();
+  job1->block_handles = block_handles;
+  job1->table = table.get();
+  job1->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set1;
+  s = dispatcher->SubmitJob(job1, &read_set1);
+  ASSERT_OK(s);  // Should succeed immediately
+
+  // Submit second job - should also succeed immediately (not block)
+  std::unique_ptr<BlockBasedTable> table2;
+  std::vector<BlockHandle> block_handles2;
+  s = CreateAndOpenSST(30, &table2, &block_handles2);
+  ASSERT_OK(s);
+
+  auto job2 = std::make_shared<IOJob>();
+  job2->block_handles = block_handles2;
+  job2->table = table2.get();
+  job2->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set2;
+  s = dispatcher->SubmitJob(job2, &read_set2);
+  ASSERT_OK(s);  // Should succeed immediately - prefetch is just deferred
+
+  // Reads work - blocks are fetched synchronously on demand
+  for (size_t i = 0; i < block_handles2.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set2->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that releasing blocks triggers pending prefetches
+TEST_F(IODispatcherTest, BlockReleaseTriggersWaitingJob) {
+  // Create dispatcher with a small memory limit
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 100 * 1024;  // 100KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GT(block_handles.size(), 0);
+
+  // Submit first job
+  auto job1 = std::make_shared<IOJob>();
+  job1->block_handles = block_handles;
+  job1->table = table.get();
+  job1->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set1;
+  s = dispatcher->SubmitJob(job1, &read_set1);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set1, nullptr);
+
+  // Read all blocks from first job
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set1->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+  }
+
+  // Submit second job - prefetch will be deferred due to memory limit
+  std::unique_ptr<BlockBasedTable> table2;
+  std::vector<BlockHandle> block_handles2;
+  s = CreateAndOpenSST(20, &table2, &block_handles2);
+  ASSERT_OK(s);
+
+  auto job2 = std::make_shared<IOJob>();
+  job2->block_handles = block_handles2;
+  job2->table = table2.get();
+  job2->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set2;
+  s = dispatcher->SubmitJob(job2, &read_set2);
+  ASSERT_OK(s);  // Should succeed immediately
+  ASSERT_NE(read_set2, nullptr);
+
+  // Release blocks from first job - this should trigger pending prefetches
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    read_set1->ReleaseBlock(i);
+  }
+
+  // Read all blocks from second job - should work
+  for (size_t i = 0; i < block_handles2.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set2->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that multiple ReadSets share the memory budget
+TEST_F(IODispatcherTest, MultipleReadSetsShareMemoryBudget) {
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 10 * 1024 * 1024;  // 10MB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::vector<std::shared_ptr<ReadSet>> read_sets;
+  std::vector<std::vector<BlockHandle>> all_block_handles;
+
+  // Create and submit multiple jobs
+  for (int i = 0; i < 3; i++) {
+    std::unique_ptr<BlockBasedTable> table;
+    std::vector<BlockHandle> block_handles;
+
+    Status s = CreateAndOpenSST(20 + i * 5, &table, &block_handles);
+    ASSERT_OK(s);
+
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    job->job_options.read_options.async_io = false;
+    tables_.push_back(std::move(table));
+
+    all_block_handles.push_back(block_handles);
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    read_sets.push_back(read_set);
+  }
+
+  // Verify all ReadSets can read their blocks
+  for (size_t i = 0; i < read_sets.size(); ++i) {
+    for (size_t j = 0; j < all_block_handles[i].size(); ++j) {
+      CachableEntry<Block> block;
+      Status read_status = read_sets[i]->ReadIndex(j, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+  }
+
+  // Release all blocks from first ReadSet
+  for (size_t i = 0; i < all_block_handles[0].size(); ++i) {
+    read_sets[0]->ReleaseBlock(i);
+  }
+
+  // Create another job - should work because first ReadSet released memory
+  std::unique_ptr<BlockBasedTable> table_new;
+  std::vector<BlockHandle> block_handles_new;
+  Status s = CreateAndOpenSST(25, &table_new, &block_handles_new);
+  ASSERT_OK(s);
+
+  auto job_new = std::make_shared<IOJob>();
+  job_new->block_handles = block_handles_new;
+  job_new->table = table_new.get();
+  job_new->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set_new;
+  s = dispatcher->SubmitJob(job_new, &read_set_new);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set_new, nullptr);
+
+  for (size_t i = 0; i < block_handles_new.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set_new->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that no memory limiting is applied when max_prefetch_memory_bytes is 0
+TEST_F(IODispatcherTest, NoMemoryLimitWhenZero) {
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 0;  // No limit
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test memory release on ReadSet destruction triggers pending prefetches
+TEST_F(IODispatcherTest, MemoryReleasedOnReadSetDestruction) {
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 100 * 1024;  // 100KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  // Create table outside the scope so it outlives the ReadSet
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+
+  // Second table - created now so it's available after first ReadSet is
+  // destroyed
+  std::unique_ptr<BlockBasedTable> table2;
+  std::vector<BlockHandle> block_handles2;
+  s = CreateAndOpenSST(30, &table2, &block_handles2);
+  ASSERT_OK(s);
+
+  std::shared_ptr<ReadSet> read_set2;
+
+  {
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    job->job_options.read_options.async_io = false;
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    ASSERT_NE(read_set, nullptr);
+
+    // Submit second job while first is still alive - prefetch will be deferred
+    auto job2 = std::make_shared<IOJob>();
+    job2->block_handles = block_handles2;
+    job2->table = table2.get();
+    job2->job_options.read_options.async_io = false;
+
+    s = dispatcher->SubmitJob(job2, &read_set2);
+    ASSERT_OK(s);  // Should succeed immediately
+    ASSERT_NE(read_set2, nullptr);
+
+    // First ReadSet goes out of scope here and should release all memory,
+    // which triggers pending prefetches for second ReadSet
+  }
+
+  // Read all blocks from second job - should work because first ReadSet
+  // released its memory on destruction
+  for (size_t i = 0; i < block_handles2.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set2->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that partial prefetch dispatches as many blocks as memory allows
+// and queues the rest for later dispatch
+TEST_F(IODispatcherTest, PartialPrefetchDispatchesWhatFits) {
+  // Skip this test if io_uring is not available since partial prefetch
+  // only applies to async IO
+  if (!kIOUringPresent) {
+    return;  // io_uring not available, skip async IO test
+  }
+
+  // Create dispatcher with memory limit that allows only some blocks
+  // Each block is ~16KB, so 50KB allows roughly 3 blocks
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 50 * 1024;  // 50KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  // Create 10 blocks - only ~3 should fit in memory
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 5);
+
+  // Use sync point to count blocks dispatched during SubmitJob
+  size_t blocks_dispatched_on_submit = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+        auto* indices = static_cast<std::vector<size_t>*>(arg);
+        blocks_dispatched_on_submit += indices->size();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;  // Use async IO
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // With partial prefetch, we expect SOME blocks to have been dispatched
+  // (the ones that fit in memory), but not ALL blocks
+  // This is the key assertion: partial prefetch means > 0 blocks dispatched
+  // even though total memory needed exceeds the limit
+  EXPECT_GT(blocks_dispatched_on_submit, 0)
+      << "Expected some blocks to be dispatched with partial prefetch";
+  EXPECT_LT(blocks_dispatched_on_submit, block_handles.size())
+      << "Expected not all blocks to be dispatched (memory limit should apply)";
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Now read all blocks - remaining blocks will be fetched on demand
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+
+  // Verify all blocks were ultimately read
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  EXPECT_EQ(total_reads, block_handles.size());
+}
+
+// Test that earlier block indices are prioritized in partial prefetch
+TEST_F(IODispatcherTest, PartialPrefetchPrioritizesEarlierIndices) {
+  // Skip this test if io_uring is not available
+  if (!kIOUringPresent) {
+    return;  // io_uring not available, skip async IO test
+  }
+
+  // Create dispatcher with memory limit that allows only 1-2 blocks
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 20 * 1024;  // 20KB - room for ~1 block
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 5);
+
+  tracking_fs_->ClearReadOps();
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+
+  // Get the async reads that were dispatched
+  auto read_ops = tracking_fs_->GetReadOps();
+
+  // Find the offset of the first async read
+  uint64_t first_async_offset = UINT64_MAX;
+  for (const auto& op : read_ops) {
+    if (op.type == ReadOp::kReadAsync && !op.requests.empty()) {
+      first_async_offset = std::min(first_async_offset, op.requests[0].first);
+    }
+  }
+
+  // The first async read should be for the first block (lowest offset)
+  // This verifies that earlier indices are prioritized
+  if (first_async_offset != UINT64_MAX) {
+    EXPECT_EQ(first_async_offset, block_handles[0].offset())
+        << "Expected first async read to be for the first block (earliest "
+           "index)";
+  }
+
+  // Read all blocks to complete the test
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that blocks larger than the memory budget are excluded from prefetch
+// and fall back to synchronous read
+TEST_F(IODispatcherTest, OversizedBlocksFallbackToSyncRead) {
+  // Skip this test if io_uring is not available since we need async IO
+  if (!kIOUringPresent) {
+    return;
+  }
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 3);
+
+  // Calculate the size of a single block
+  size_t single_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+
+  // Create dispatcher with memory limit smaller than a single block
+  // This means ALL blocks are "oversized" and should fall back to sync read
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = single_block_size / 2;  // Half a block
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  // Track dispatches - with oversized blocks, nothing should be dispatched
+  size_t blocks_dispatched = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+        auto* indices = static_cast<std::vector<size_t>*>(arg);
+        blocks_dispatched += indices->size();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // No blocks should have been dispatched since they're all oversized
+  EXPECT_EQ(blocks_dispatched, 0)
+      << "Expected no blocks to be dispatched when all blocks are oversized";
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // All blocks should still be readable via sync fallback
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+
+  // All reads should be sync since blocks couldn't be prefetched
+  EXPECT_GT(read_set->GetNumSyncReads(), 0)
+      << "Expected sync reads for oversized blocks";
+}
+
+// Test that reading blocks before prefetch dispatch correctly updates
+// memory accounting for coalesced groups
+TEST_F(IODispatcherTest, PartialReadsUpdateCoalescedGroups) {
+  // Skip this test if io_uring is not available
+  if (!kIOUringPresent) {
+    return;
+  }
+
+  // Create dispatcher with memory limit that allows only some blocks
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 50 * 1024;  // 50KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(20, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read some blocks directly (simulating on-demand access before prefetch)
+  // This removes them from pending and should update coalesced group accounting
+  for (size_t i = 0; i < 5 && i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+
+  // Release the blocks we read - this frees memory
+  for (size_t i = 0; i < 5 && i < block_handles.size(); ++i) {
+    read_set->ReleaseBlock(i);
+  }
+
+  // Now read the remaining blocks - these should work correctly
+  // The key test: memory accounting should be correct even though some blocks
+  // were removed from pending groups before dispatch
+  for (size_t i = 5; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+  }
+
+  // Verify all remaining blocks were read successfully
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  // We read 5 blocks initially, then the remaining blocks
+  EXPECT_GE(total_reads, block_handles.size() - 5)
+      << "Expected at least the remaining blocks to be counted";
+}
+
+// Test that a mix of oversized and normal blocks works correctly
+TEST_F(IODispatcherTest, MixedOversizedAndNormalBlocks) {
+  // Skip this test if io_uring is not available
+  if (!kIOUringPresent) {
+    return;
+  }
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 5);
+
+  // Calculate the size of a typical block
+  size_t typical_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+
+  // Create dispatcher with memory limit that allows exactly 2 typical blocks
+  // This means groups of 3+ blocks become "oversized" as a group
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = typical_block_size * 2;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // All blocks should be readable regardless of prefetch status
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+  }
+
+  // Verify total reads match
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  EXPECT_EQ(total_reads, block_handles.size());
+}
+
+// Test that memory is properly accounted when groups are partially consumed
+TEST_F(IODispatcherTest, MemoryAccountingWithPartialGroupConsumption) {
+  // Skip this test if io_uring is not available
+  if (!kIOUringPresent) {
+    return;
+  }
+
+  // Create dispatcher with a specific memory limit
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 100 * 1024;  // 100KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read blocks one at a time and release them
+  // This tests that RemoveFromPending correctly updates pending state
+  // and that TryDispatchPendingPrefetches filters correctly
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+
+    // Release the block immediately after reading
+    read_set->ReleaseBlock(i);
+  }
+
+  // Verify total reads match
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  EXPECT_EQ(total_reads, block_handles.size());
+}
+
+// Test that sync prefetching respects memory limits
+TEST_F(IODispatcherTest, SyncPrefetchWithMemoryLimit) {
+  // Create dispatcher with a small memory limit
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 50 * 1024;  // 50KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(20, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;  // Sync IO
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // All blocks should be readable even with memory limits
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+  }
+
+  // Verify all were sync reads
+  EXPECT_GT(read_set->GetNumSyncReads(), 0)
+      << "Expected sync reads with async_io=false";
+  EXPECT_EQ(read_set->GetNumAsyncReads(), 0)
+      << "Expected no async reads with async_io=false";
+}
+
+// Test that oversized blocks work correctly with sync IO
+TEST_F(IODispatcherTest, OversizedBlocksWithSyncIO) {
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 3);
+
+  // Calculate the size of a single block
+  size_t single_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+
+  // Create dispatcher with memory limit smaller than a single block
+  // This means ALL blocks are "oversized"
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = single_block_size / 2;  // Half a block
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;  // Sync IO
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // All blocks should still be readable via sync fallback
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+  }
+
+  // All reads should be sync
+  EXPECT_GT(read_set->GetNumSyncReads(), 0)
+      << "Expected sync reads for oversized blocks";
+}
+
+// Test that a single block larger than total memory budget still works
+TEST_F(IODispatcherTest, SingleBlockLargerThanTotalMemory) {
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(5, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 1);
+
+  // Set memory limit to 1 byte - smaller than any block
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 1;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  // Test with both sync and async modes
+  for (bool async : {false, true}) {
+    // Skip async if io_uring not available
+    if (async && !kIOUringPresent) {
+      continue;
+    }
+
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    job->job_options.read_options.async_io = async;
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s) << "SubmitJob failed with async=" << async;
+    ASSERT_NE(read_set, nullptr);
+
+    // All blocks should be readable
+    for (size_t i = 0; i < block_handles.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status)
+          << "Failed to read block " << i << " with async=" << async;
+      ASSERT_NE(block.GetValue(), nullptr)
+          << "Block " << i << " is null with async=" << async;
+    }
+  }
+}
+
+// Test that sync prefetching defers later groups and dispatches them
+// when memory is released
+TEST_F(IODispatcherTest, SyncPrefetchDefersAndDispatchesLaterGroups) {
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  // Create 10+ blocks so we have enough to test deferred dispatch
+  Status s = CreateAndOpenSST(20, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+
+  // Calculate typical block size
+  size_t typical_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+
+  // Set memory limit to fit approximately 3 blocks
+  // This should cause groups to be split and some deferred
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = typical_block_size * 3;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  // Track dispatch calls
+  std::vector<size_t> dispatch_counts;
+  SyncPoint::GetInstance()->SetCallBack(
+      "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+        auto* indices = static_cast<std::vector<size_t>*>(arg);
+        dispatch_counts.push_back(indices->size());
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;  // Sync IO
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // After SubmitJob, some blocks should have been dispatched (first group)
+  // and remaining groups should be queued
+  size_t initial_dispatch_count = dispatch_counts.size();
+  EXPECT_GT(initial_dispatch_count, 0)
+      << "Expected at least one dispatch during SubmitJob";
+
+  // Read and release first few blocks - this should trigger deferred dispatch
+  for (size_t i = 0; i < 3 && i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+    // Release to free memory
+    read_set->ReleaseBlock(i);
+  }
+
+  // After releasing blocks, more dispatches should have occurred
+  // as the pending queue gets processed
+  size_t dispatch_count_after_release = dispatch_counts.size();
+  EXPECT_GE(dispatch_count_after_release, initial_dispatch_count)
+      << "Expected more dispatches after releasing blocks";
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // All remaining blocks should still be readable
+  for (size_t i = 3; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+  }
+}
+
+// Test that coalesced groups are properly split based on memory budget
+TEST_F(IODispatcherTest, CoalescedGroupsSplitByMemoryBudget) {
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(15, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+
+  // Calculate typical block size
+  size_t typical_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+
+  // Set memory limit to fit exactly 5 blocks
+  // With 10+ blocks, we should get at least 2 groups
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = typical_block_size * 5;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  // Track how many blocks are in each dispatch call
+  std::vector<size_t> blocks_per_dispatch;
+  SyncPoint::GetInstance()->SetCallBack(
+      "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+        auto* indices = static_cast<std::vector<size_t>*>(arg);
+        blocks_per_dispatch.push_back(indices->size());
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+
+  // First dispatch should have at most 5 blocks (memory limit)
+  ASSERT_GT(blocks_per_dispatch.size(), 0);
+  EXPECT_LE(blocks_per_dispatch[0], 5)
+      << "First dispatch should be limited by memory budget";
+
+  // Read and release all blocks to trigger remaining dispatches
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    read_set->ReleaseBlock(i);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Verify each dispatch was limited by memory budget
+  for (size_t i = 0; i < blocks_per_dispatch.size(); ++i) {
+    EXPECT_LE(blocks_per_dispatch[i], 5)
+        << "Dispatch " << i << " exceeded memory budget";
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/util/math.h b/util/math.h
index e1948e0a313e..112a54f9ffb6 100644
--- a/util/math.h
+++ b/util/math.h
@@ -41,7 +41,9 @@ inline T BottomNBits(T v, int nbits) {
 #endif
   // Newer compilers compile this down to bzhi on x86, but some older
   // ones don't, thus the need for the intrinsic above.
-  return static_cast<T>(v & ((T{1} << nbits) - 1));
+  using UnsignedT = std::make_unsigned_t<T>;
+  UnsignedT mask = (static_cast<UnsignedT>(1) << nbits) - 1;
+  return static_cast<T>(static_cast<UnsignedT>(v) & mask);
 }
 
 // Fast implementation of floor(log2(v)). Undefined for 0 or negative
diff --git a/util/mutexlock.h b/util/mutexlock.h
index aecd4f21cb4f..b142bde320f2 100644
--- a/util/mutexlock.h
+++ b/util/mutexlock.h
@@ -34,15 +34,15 @@ namespace ROCKSDB_NAMESPACE {
 
 class MutexLock {
  public:
-  explicit MutexLock(port::Mutex *mu) : mu_(mu) { this->mu_->Lock(); }
+  explicit MutexLock(port::Mutex* mu) : mu_(mu) { this->mu_->Lock(); }
   // No copying allowed
-  MutexLock(const MutexLock &) = delete;
-  void operator=(const MutexLock &) = delete;
+  MutexLock(const MutexLock&) = delete;
+  void operator=(const MutexLock&) = delete;
 
   ~MutexLock() { this->mu_->Unlock(); }
 
  private:
-  port::Mutex *const mu_;
+  port::Mutex* const mu_;
 };
 
 //
@@ -52,15 +52,15 @@ class MutexLock {
 //
 class ReadLock {
  public:
-  explicit ReadLock(port::RWMutex *mu) : mu_(mu) { this->mu_->ReadLock(); }
+  explicit ReadLock(port::RWMutex* mu) : mu_(mu) { this->mu_->ReadLock(); }
   // No copying allowed
-  ReadLock(const ReadLock &) = delete;
-  void operator=(const ReadLock &) = delete;
+  ReadLock(const ReadLock&) = delete;
+  void operator=(const ReadLock&) = delete;
 
   ~ReadLock() { this->mu_->ReadUnlock(); }
 
  private:
-  port::RWMutex *const mu_;
+  port::RWMutex* const mu_;
 };
 
 //
@@ -68,15 +68,15 @@ class ReadLock {
 //
 class ReadUnlock {
  public:
-  explicit ReadUnlock(port::RWMutex *mu) : mu_(mu) { mu->AssertHeld(); }
+  explicit ReadUnlock(port::RWMutex* mu) : mu_(mu) { mu->AssertHeld(); }
   // No copying allowed
-  ReadUnlock(const ReadUnlock &) = delete;
-  ReadUnlock &operator=(const ReadUnlock &) = delete;
+  ReadUnlock(const ReadUnlock&) = delete;
+  ReadUnlock& operator=(const ReadUnlock&) = delete;
 
   ~ReadUnlock() { mu_->ReadUnlock(); }
 
  private:
-  port::RWMutex *const mu_;
+  port::RWMutex* const mu_;
 };
 
 //
@@ -86,15 +86,15 @@ class ReadUnlock {
 //
 class WriteLock {
  public:
-  explicit WriteLock(port::RWMutex *mu) : mu_(mu) { this->mu_->WriteLock(); }
+  explicit WriteLock(port::RWMutex* mu) : mu_(mu) { this->mu_->WriteLock(); }
   // No copying allowed
-  WriteLock(const WriteLock &) = delete;
-  void operator=(const WriteLock &) = delete;
+  WriteLock(const WriteLock&) = delete;
+  void operator=(const WriteLock&) = delete;
 
   ~WriteLock() { this->mu_->WriteUnlock(); }
 
  private:
-  port::RWMutex *const mu_;
+  port::RWMutex* const mu_;
 };
 
 //
@@ -145,12 +145,12 @@ struct ALIGN_AS(CACHE_LINE_SIZE) CacheAlignedWrapper {
 template <class T>
 struct Unwrap {
   using type = T;
-  static type &Go(T &t) { return t; }
+  static type& Go(T& t) { return t; }
 };
 template <class T>
 struct Unwrap<CacheAlignedWrapper<T>> {
   using type = T;
-  static type &Go(CacheAlignedWrapper<T> &t) { return t.obj_; }
+  static type& Go(CacheAlignedWrapper<T>& t) { return t.obj_; }
 };
 
 //
@@ -169,7 +169,7 @@ class Striped {
       : stripe_count_(stripe_count), data_(new T[stripe_count]) {}
 
   using Unwrapped = typename Unwrap<T>::type;
-  Unwrapped &Get(const Key &key, uint64_t seed = 0) {
+  Unwrapped& Get(const Key& key, uint64_t seed = 0) {
     size_t index = FastRangeGeneric(hash_(key, seed), stripe_count_);
     return Unwrap<T>::Go(data_[index]);
   }
diff --git a/util/ribbon_alg.h b/util/ribbon_alg.h
index f9afefc2377b..52016e266c9d 100644
--- a/util/ribbon_alg.h
+++ b/util/ribbon_alg.h
@@ -545,10 +545,10 @@ namespace ribbon {
 // solution satisfying all the cr@start -> rr entries added.
 template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
           typename BacktrackStorage>
-bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
+bool BandingAdd(BandingStorage* bs, typename BandingStorage::Index start,
                 typename BandingStorage::ResultRow rr,
-                typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
-                typename BandingStorage::Index *backtrack_pos) {
+                typename BandingStorage::CoeffRow cr, BacktrackStorage* bts,
+                typename BandingStorage::Index* backtrack_pos) {
   using CoeffRow = typename BandingStorage::CoeffRow;
   using ResultRow = typename BandingStorage::ResultRow;
   using Index = typename BandingStorage::Index;
@@ -608,8 +608,8 @@ bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
 //
 template <typename BandingStorage, typename BacktrackStorage,
           typename BandingHasher, typename InputIterator>
-bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
-                     const BandingHasher &bh, InputIterator begin,
+bool BandingAddRange(BandingStorage* bs, BacktrackStorage* bts,
+                     const BandingHasher& bh, InputIterator begin,
                      InputIterator end) {
   using CoeffRow = typename BandingStorage::CoeffRow;
   using Index = typename BandingStorage::Index;
@@ -703,7 +703,7 @@ bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
 //
 template <typename BandingStorage, typename BandingHasher,
           typename InputIterator>
-bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
+bool BandingAddRange(BandingStorage* bs, const BandingHasher& bh,
                      InputIterator begin, InputIterator end) {
   using Index = typename BandingStorage::Index;
   struct NoopBacktrackStorage {
@@ -754,7 +754,7 @@ bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
 // Back-substitution for generating a solution from BandingStorage to
 // SimpleSolutionStorage.
 template <typename SimpleSolutionStorage, typename BandingStorage>
-void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) {
+void SimpleBackSubst(SimpleSolutionStorage* sss, const BandingStorage& bs) {
   using CoeffRow = typename BandingStorage::CoeffRow;
   using Index = typename BandingStorage::Index;
   using ResultRow = typename BandingStorage::ResultRow;
@@ -815,7 +815,7 @@ template <typename SimpleSolutionStorage>
 typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
     typename SimpleSolutionStorage::Index start_slot,
     typename SimpleSolutionStorage::CoeffRow cr,
-    const SimpleSolutionStorage &sss) {
+    const SimpleSolutionStorage& sss) {
   using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
   using ResultRow = typename SimpleSolutionStorage::ResultRow;
 
@@ -833,8 +833,8 @@ typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
 // General PHSF query a key from SimpleSolutionStorage.
 template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
 typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
-    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
-    const SimpleSolutionStorage &sss) {
+    const typename PhsfQueryHasher::Key& key, const PhsfQueryHasher& hasher,
+    const SimpleSolutionStorage& sss) {
   const typename PhsfQueryHasher::Hash hash = hasher.GetHash(key);
 
   static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
@@ -850,9 +850,9 @@ typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
 
 // Filter query a key from SimpleSolutionStorage.
 template <typename SimpleSolutionStorage, typename FilterQueryHasher>
-bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
-                       const FilterQueryHasher &hasher,
-                       const SimpleSolutionStorage &sss) {
+bool SimpleFilterQuery(const typename FilterQueryHasher::Key& key,
+                       const FilterQueryHasher& hasher,
+                       const SimpleSolutionStorage& sss) {
   const typename FilterQueryHasher::Hash hash = hasher.GetHash(key);
   const typename SimpleSolutionStorage::ResultRow expected =
       hasher.GetResultRowFromHash(hash);
@@ -968,9 +968,9 @@ bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
 
 // A helper for InterleavedBackSubst.
 template <typename BandingStorage>
-inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
+inline void BackSubstBlock(typename BandingStorage::CoeffRow* state,
                            typename BandingStorage::Index num_columns,
-                           const BandingStorage &bs,
+                           const BandingStorage& bs,
                            typename BandingStorage::Index start_slot) {
   using CoeffRow = typename BandingStorage::CoeffRow;
   using Index = typename BandingStorage::Index;
@@ -1004,8 +1004,8 @@ inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
 // Back-substitution for generating a solution from BandingStorage to
 // InterleavedSolutionStorage.
 template <typename InterleavedSolutionStorage, typename BandingStorage>
-void InterleavedBackSubst(InterleavedSolutionStorage *iss,
-                          const BandingStorage &bs) {
+void InterleavedBackSubst(InterleavedSolutionStorage* iss,
+                          const BandingStorage& bs) {
   using CoeffRow = typename BandingStorage::CoeffRow;
   using Index = typename BandingStorage::Index;
 
@@ -1084,12 +1084,12 @@ void InterleavedBackSubst(InterleavedSolutionStorage *iss,
 // Prefetch memory for a key in InterleavedSolutionStorage.
 template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
 inline void InterleavedPrepareQuery(
-    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
-    const InterleavedSolutionStorage &iss,
-    typename PhsfQueryHasher::Hash *saved_hash,
-    typename InterleavedSolutionStorage::Index *saved_segment_num,
-    typename InterleavedSolutionStorage::Index *saved_num_columns,
-    typename InterleavedSolutionStorage::Index *saved_start_bit) {
+    const typename PhsfQueryHasher::Key& key, const PhsfQueryHasher& hasher,
+    const InterleavedSolutionStorage& iss,
+    typename PhsfQueryHasher::Hash* saved_hash,
+    typename InterleavedSolutionStorage::Index* saved_segment_num,
+    typename InterleavedSolutionStorage::Index* saved_num_columns,
+    typename InterleavedSolutionStorage::Index* saved_start_bit) {
   using Hash = typename PhsfQueryHasher::Hash;
   using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
   using Index = typename InterleavedSolutionStorage::Index;
@@ -1131,7 +1131,7 @@ inline typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
     typename InterleavedSolutionStorage::Index segment_num,
     typename InterleavedSolutionStorage::Index num_columns,
     typename InterleavedSolutionStorage::Index start_bit,
-    const PhsfQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
+    const PhsfQueryHasher& hasher, const InterleavedSolutionStorage& iss) {
   using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
   using Index = typename InterleavedSolutionStorage::Index;
   using ResultRow = typename InterleavedSolutionStorage::ResultRow;
@@ -1170,7 +1170,7 @@ inline bool InterleavedFilterQuery(
     typename InterleavedSolutionStorage::Index segment_num,
     typename InterleavedSolutionStorage::Index num_columns,
     typename InterleavedSolutionStorage::Index start_bit,
-    const FilterQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
+    const FilterQueryHasher& hasher, const InterleavedSolutionStorage& iss) {
   using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
   using Index = typename InterleavedSolutionStorage::Index;
   using ResultRow = typename InterleavedSolutionStorage::ResultRow;
diff --git a/util/semaphore.h b/util/semaphore.h
new file mode 100644
index 000000000000..59e767d6246d
--- /dev/null
+++ b/util/semaphore.h
@@ -0,0 +1,164 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <condition_variable>
+#include <mutex>
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+#include <semaphore>
+#endif
+
+#include "port/port.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Wrapper providing a chosen counting semaphore implementation. The default
+// implementation based on a mutex and condvar unfortunately can result in
+// Release() temporarily waiting on another thread to make progress (if that
+// other thread is preempted while holding the mutex), but that should be rare.
+// However, alternative implementations may have correctness issues or even
+// worse performance. See std::counting_semaphore for general contract.
+//
+// NOTE1: std::counting_semaphore is known to be buggy on many std library
+// implementations, so be cautious about enabling it. Reportedly, an acquire()
+// can falsely block indefinitely. And we can't easily work around that with
+// try_acquire_for because another common bug has that function consistently
+// sleeping for the entire timeout duration even if a release() happens earlier.
+// Therefore, using std::counting_semaphore/binary_semaphore is strictly opt-in
+// for now.
+//
+// NOTE2: Also tried wrapping folly::fibers::Semaphore here but it was not as
+// efficient (for parallel compression) as even the mutex+condvar version.
+class ALIGN_AS(CACHE_LINE_SIZE) CountingSemaphore {
+ public:
+  explicit CountingSemaphore(std::ptrdiff_t starting_count)
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+      : sem_(starting_count)
+#else
+      : count_(static_cast<int32_t>(starting_count))
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  {
+    assert(starting_count >= 0);
+    assert(starting_count <= INT32_MAX);
+  }
+  void Acquire() {
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    sem_.acquire();
+#else
+    std::unique_lock<std::mutex> lock(mutex_);
+    assert(count_ >= 0);
+    cv_.wait(lock, [this] { return count_ > 0; });
+    --count_;
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+  bool TryAcquire() {
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    return sem_.try_acquire();
+#else
+    std::unique_lock<std::mutex> lock(mutex_);
+    assert(count_ >= 0);
+    if (count_ == 0) {
+      return false;
+    } else {
+      --count_;
+      return true;
+    }
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+  void Release(std::ptrdiff_t n = 1) {
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    sem_.release(n);
+#else
+    assert(n >= 0);
+    assert(n <= INT32_MAX);
+    if (n > 0) {
+      std::unique_lock<std::mutex> lock(mutex_);
+      assert(count_ >= 0);
+      count_ += static_cast<int32_t>(n);
+      assert(count_ >= 0);  // no overflow
+      if (n == 1) {
+        cv_.notify_one();
+      } else {
+        cv_.notify_all();
+      }
+    }
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+
+ private:
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+  std::counting_semaphore<INT32_MAX> sem_;
+#else
+  int32_t count_;
+  std::mutex mutex_;
+  std::condition_variable cv_;
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+};  // namespace ROCKSDB_NAMESPACE
+
+// Wrapper providing a chosen binary semaphore implementation. See notes on
+// CountingSemaphore above, and on Release() below.
+class BinarySemaphore {
+ public:
+  explicit BinarySemaphore(std::ptrdiff_t starting_count)
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+      : sem_(starting_count)
+#else
+      : state_(starting_count > 0)
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  {
+    assert(starting_count >= 0);
+  }
+  void Acquire() {
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    sem_.acquire();
+#else
+    std::unique_lock<std::mutex> lock(mutex_);
+    cv_.wait(lock, [this] { return state_; });
+    state_ = false;
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+  bool TryAcquire() {
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    return sem_.try_acquire();
+#else
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (state_) {
+      state_ = false;
+      return true;
+    } else {
+      return false;
+    }
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+  void Release() {
+    // NOTE: implementations of std::binary_semaphore::release() tend to behave
+    // like counting semaphores in the case of multiple Release() calls without
+    // Acquire() in between, though it is undefined behavior. It is also OK to
+    // cap the count at 1.
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    sem_.release();
+#else
+    std::unique_lock<std::mutex> lock(mutex_);
+    // check precondition to avoid UB in std implementation
+    assert(state_ == false);
+    state_ = true;
+    cv_.notify_one();
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+
+ private:
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+  std::binary_semaphore sem_;
+#else
+  bool state_;
+  std::mutex mutex_;
+  std::condition_variable cv_;
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/simple_mixed_compressor.cc b/util/simple_mixed_compressor.cc
new file mode 100644
index 000000000000..46b2e74c3091
--- /dev/null
+++ b/util/simple_mixed_compressor.cc
@@ -0,0 +1,119 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Creates mixed compressor wrapper which uses multiple compression algorithm
+// within same SST file.
+
+#include "simple_mixed_compressor.h"
+
+#include <options/options_helper.h>
+
+#include "random.h"
+#include "rocksdb/advanced_compression.h"
+namespace ROCKSDB_NAMESPACE {
+
+// MultiCompressorWrapper implementation
+MultiCompressorWrapper::MultiCompressorWrapper(const CompressionOptions& opts)
+    : opts_(opts) {
+  // TODO: make the compression manager a field
+  auto builtInManager = GetBuiltinV2CompressionManager();
+  const auto& compressions = GetSupportedCompressions();
+  for (auto type : compressions) {
+    if (type == kNoCompression) {
+      continue;
+    }
+    compressors_.push_back(builtInManager->GetCompressor(opts, type));
+  }
+}
+
+Compressor::DictConfig MultiCompressorWrapper::GetDictGuidance(
+    CacheEntryRole block_type) const {
+  return compressors_.back()->GetDictGuidance(block_type);
+}
+
+Slice MultiCompressorWrapper::GetSerializedDict() const {
+  return compressors_.back()->GetSerializedDict();
+}
+
+CompressionType MultiCompressorWrapper::GetPreferredCompressionType() const {
+  return compressors_.back()->GetPreferredCompressionType();
+}
+
+Compressor::ManagedWorkingArea MultiCompressorWrapper::ObtainWorkingArea() {
+  return compressors_.back()->ObtainWorkingArea();
+}
+
+std::unique_ptr<Compressor> MultiCompressorWrapper::MaybeCloneSpecialized(
+    CacheEntryRole block_type, DictConfigArgs&& dict_config) const {
+  // TODO: full dictionary compression support. Currently this just falls
+  // back on a non-multi compressor when asked to use a dictionary.
+  return compressors_.back()->MaybeCloneSpecialized(block_type,
+                                                    std::move(dict_config));
+}
+
+// RandomMixedCompressor implementation
+const char* RandomMixedCompressor::Name() const {
+  return "RandomMixedCompressor";
+}
+
+std::unique_ptr<Compressor> RandomMixedCompressor::Clone() const {
+  return std::make_unique<RandomMixedCompressor>(opts_);
+}
+
+Status RandomMixedCompressor::CompressBlock(
+    Slice uncompressed_data, char* compressed_output,
+    size_t* compressed_output_size, CompressionType* out_compression_type,
+    ManagedWorkingArea* wa) {
+  auto selected =
+      Random::GetTLSInstance()->Uniform(static_cast<int>(compressors_.size()));
+  auto& compressor = compressors_[selected];
+  return compressor->CompressBlock(uncompressed_data, compressed_output,
+                                   compressed_output_size, out_compression_type,
+                                   wa);
+}
+
+const char* RandomMixedCompressionManager::Name() const {
+  return "RandomMixedCompressionManager";
+}
+
+std::unique_ptr<Compressor> RandomMixedCompressionManager::GetCompressorForSST(
+    const FilterBuildingContext& /*context*/, const CompressionOptions& opts,
+    CompressionType /*preferred*/) {
+  return std::make_unique<RandomMixedCompressor>(opts);
+}
+
+// RoundRobinCompressor implementation
+const char* RoundRobinCompressor::Name() const {
+  return "RoundRobinCompressor";
+}
+
+std::unique_ptr<Compressor> RoundRobinCompressor::Clone() const {
+  return std::make_unique<RoundRobinCompressor>(opts_);
+}
+
+Status RoundRobinCompressor::CompressBlock(
+    Slice uncompressed_data, char* compressed_output,
+    size_t* compressed_output_size, CompressionType* out_compression_type,
+    ManagedWorkingArea* wa) {
+  auto counter = block_counter.FetchAddRelaxed(1);
+  auto sel_idx = counter % (compressors_.size());
+  auto& compressor = compressors_[sel_idx];
+  return compressor->CompressBlock(uncompressed_data, compressed_output,
+                                   compressed_output_size, out_compression_type,
+                                   wa);
+}
+
+RelaxedAtomic<uint64_t> RoundRobinCompressor::block_counter{0};
+
+// RoundRobinManager implementation
+const char* RoundRobinManager::Name() const { return "RoundRobinManager"; }
+
+std::unique_ptr<Compressor> RoundRobinManager::GetCompressorForSST(
+    const FilterBuildingContext& /*context*/, const CompressionOptions& opts,
+    CompressionType /*preferred*/) {
+  return std::make_unique<RoundRobinCompressor>(opts);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/simple_mixed_compressor.h b/util/simple_mixed_compressor.h
new file mode 100644
index 000000000000..f2499a8f4e99
--- /dev/null
+++ b/util/simple_mixed_compressor.h
@@ -0,0 +1,71 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Creates mixed compressor wrapper which uses multiple compression algorithm
+// within same SST file.
+
+#pragma once
+#include <memory>
+#include <vector>
+
+#include "rocksdb/advanced_compression.h"
+#include "util/atomic.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MultiCompressorWrapper : public Compressor {
+ public:
+  explicit MultiCompressorWrapper(const CompressionOptions& opts);
+
+  DictConfig GetDictGuidance(CacheEntryRole block_type) const override;
+  Slice GetSerializedDict() const override;
+  CompressionType GetPreferredCompressionType() const override;
+  ManagedWorkingArea ObtainWorkingArea() override;
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const override;
+
+ protected:
+  const CompressionOptions opts_;
+  std::vector<std::unique_ptr<Compressor>> compressors_;
+};
+
+struct RandomMixedCompressor : public MultiCompressorWrapper {
+  using MultiCompressorWrapper::MultiCompressorWrapper;
+  const char* Name() const override;
+  std::unique_ptr<Compressor> Clone() const override;
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override;
+};
+
+class RandomMixedCompressionManager : public CompressionManagerWrapper {
+  using CompressionManagerWrapper::CompressionManagerWrapper;
+  const char* Name() const override;
+  std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext& context, const CompressionOptions& opts,
+      CompressionType preferred) override;
+};
+
+struct RoundRobinCompressor : public MultiCompressorWrapper {
+  using MultiCompressorWrapper::MultiCompressorWrapper;
+  const char* Name() const override;
+  std::unique_ptr<Compressor> Clone() const override;
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override;
+  static RelaxedAtomic<uint64_t> block_counter;
+};
+
+class RoundRobinManager : public CompressionManagerWrapper {
+  using CompressionManagerWrapper::CompressionManagerWrapper;
+  const char* Name() const override;
+  std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext& context, const CompressionOptions& opts,
+      CompressionType preferred) override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/slice.cc b/util/slice.cc
index 9ec0af132c27..cd3be5d33761 100644
--- a/util/slice.cc
+++ b/util/slice.cc
@@ -61,10 +61,6 @@ class FixedPrefixTransform : public SliceTransform {
     return (src.size() >= prefix_len_);
   }
 
-  bool InRange(const Slice& dst) const override {
-    return (dst.size() == prefix_len_);
-  }
-
   bool FullLengthEnabled(size_t* len) const override {
     *len = prefix_len_;
     return true;
@@ -111,10 +107,6 @@ class CappedPrefixTransform : public SliceTransform {
 
   bool InDomain(const Slice& /*src*/) const override { return true; }
 
-  bool InRange(const Slice& dst) const override {
-    return (dst.size() <= cap_len_);
-  }
-
   bool FullLengthEnabled(size_t* len) const override {
     *len = cap_len_;
     return true;
@@ -136,8 +128,6 @@ class NoopTransform : public SliceTransform {
 
   bool InDomain(const Slice& /*src*/) const override { return true; }
 
-  bool InRange(const Slice& /*dst*/) const override { return true; }
-
   bool SameResultWhenAppended(const Slice& /*prefix*/) const override {
     return false;
   }
diff --git a/util/slice_test.cc b/util/slice_test.cc
index 0028cce85965..9106ec3c6e58 100644
--- a/util/slice_test.cc
+++ b/util/slice_test.cc
@@ -3,17 +3,26 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 
+// Because there are a small set of tests for Slice and there's a cost in having
+// extra test binaries for each component, this test file has evolved into a
+// "grab bag" of small tests for various reusable components, mostly in  util/.
+
 #include "rocksdb/slice.h"
 
 #include <gtest/gtest.h>
 
+#include <semaphore>
+
 #include "port/port.h"
 #include "port/stack_trace.h"
 #include "rocksdb/data_structure.h"
 #include "rocksdb/types.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
+#include "util/bit_fields.h"
 #include "util/cast_util.h"
+#include "util/semaphore.h"
+#include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -177,18 +186,23 @@ class SmallEnumSetTest : public testing::Test {
 TEST_F(SmallEnumSetTest, SmallEnumSetTest1) {
   FileTypeSet fs;  // based on a legacy enum type
   ASSERT_TRUE(fs.empty());
+  ASSERT_EQ(fs.count(), 0U);
   ASSERT_TRUE(fs.Add(FileType::kIdentityFile));
   ASSERT_FALSE(fs.empty());
+  ASSERT_EQ(fs.count(), 1U);
   ASSERT_FALSE(fs.Add(FileType::kIdentityFile));
   ASSERT_TRUE(fs.Add(FileType::kInfoLogFile));
   ASSERT_TRUE(fs.Contains(FileType::kIdentityFile));
   ASSERT_FALSE(fs.Contains(FileType::kDBLockFile));
   ASSERT_FALSE(fs.empty());
+  ASSERT_EQ(fs.count(), 2U);
   ASSERT_FALSE(fs.Remove(FileType::kDBLockFile));
   ASSERT_TRUE(fs.Remove(FileType::kIdentityFile));
   ASSERT_FALSE(fs.empty());
+  ASSERT_EQ(fs.count(), 1U);
   ASSERT_TRUE(fs.Remove(FileType::kInfoLogFile));
   ASSERT_TRUE(fs.empty());
+  ASSERT_EQ(fs.count(), 0U);
 }
 
 namespace {
@@ -224,12 +238,16 @@ TEST_F(SmallEnumSetTest, SmallEnumSetTest2) {
   ASSERT_NE(cs, MyEnumClassSet{MyEnumClass::B});
   ASSERT_NE(cs, MyEnumClassSet::All());
 
+  ASSERT_EQ(MyEnumClassSet{}.count(), 0U);
+  ASSERT_EQ(MyEnumClassSet::All().count(), 3U);
+
   int count = 0;
   for (MyEnumClass e : cs) {
     ASSERT_EQ(e, MyEnumClass::A);
     ++count;
   }
   ASSERT_EQ(count, 1);
+  ASSERT_EQ(cs.count(), 1U);
 
   count = 0;
   for (MyEnumClass e : MyEnumClassSet::All().Without(MyEnumClass::B)) {
@@ -244,6 +262,68 @@ TEST_F(SmallEnumSetTest, SmallEnumSetTest2) {
   }
 }
 
+template <typename ENUM_TYPE, ENUM_TYPE MAX_ENUMERATOR>
+void TestBiggerEnumSet() {
+  using MySet = SmallEnumSet<ENUM_TYPE, MAX_ENUMERATOR>;
+  constexpr int kMaxValue = static_cast<int>(MAX_ENUMERATOR);
+  SCOPED_TRACE("kMaxValue = " + std::to_string(kMaxValue));
+
+  ASSERT_EQ(sizeof(MySet), (kMaxValue + 1 + 63) / 64 * 8);
+
+  MySet s;
+  ASSERT_TRUE(s.empty());
+  ASSERT_EQ(s.count(), 0U);
+  ASSERT_TRUE(s.Add(ENUM_TYPE(0)));
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.count(), 1U);
+  ASSERT_TRUE(s.Add(ENUM_TYPE(kMaxValue - 1)));
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.count(), 2U);
+  ASSERT_TRUE(s.Add(ENUM_TYPE(kMaxValue)));
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.count(), 3U);
+
+  int count = 0;
+  for (ENUM_TYPE e : s) {
+    ASSERT_TRUE(e == ENUM_TYPE(0) || e == ENUM_TYPE(kMaxValue - 1) ||
+                e == ENUM_TYPE(kMaxValue));
+    ++count;
+  }
+  ASSERT_EQ(count, 3);
+
+  ASSERT_TRUE(s.Remove(ENUM_TYPE(0)));
+  ASSERT_TRUE(s.Remove(ENUM_TYPE(kMaxValue)));
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.count(), 1U);
+
+  count = 0;
+  for (ENUM_TYPE e : s) {
+    ASSERT_EQ(e, ENUM_TYPE(kMaxValue - 1));
+    ++count;
+  }
+  ASSERT_EQ(count, 1);
+}
+
+TEST_F(SmallEnumSetTest, BiggerEnumClasses) {
+  enum class BiggerEnumClass63 { A, B, C = 63 };
+  enum class BiggerEnumClass64 { A, B, C = 64 };
+  enum class BiggerEnumClass65 { A, B, C = 65 };
+  enum class BiggerEnumClass127 { A, B, C = 127 };
+  enum class BiggerEnumClass128 { A, B, C = 128 };
+  enum class BiggerEnumClass129 { A, B, C = 129 };
+  enum class BiggerEnumClass150 { A, B, C = 150 };
+  enum class BiggerEnumClass255 { A, B, C = 255 };
+
+  TestBiggerEnumSet<BiggerEnumClass63, BiggerEnumClass63::C>();
+  TestBiggerEnumSet<BiggerEnumClass64, BiggerEnumClass64::C>();
+  TestBiggerEnumSet<BiggerEnumClass65, BiggerEnumClass65::C>();
+  TestBiggerEnumSet<BiggerEnumClass127, BiggerEnumClass127::C>();
+  TestBiggerEnumSet<BiggerEnumClass128, BiggerEnumClass128::C>();
+  TestBiggerEnumSet<BiggerEnumClass129, BiggerEnumClass129::C>();
+  TestBiggerEnumSet<BiggerEnumClass150, BiggerEnumClass150::C>();
+  TestBiggerEnumSet<BiggerEnumClass255, BiggerEnumClass255::C>();
+}
+
 // ***************************************************************** //
 // Unit test for Status
 TEST(StatusTest, Update) {
@@ -339,6 +419,271 @@ TEST(UnownedPtrTest, Tests) {
   }
 }
 
+TEST(ToBaseCharsStringTest, Tests) {
+  using ROCKSDB_NAMESPACE::ToBaseCharsString;
+  // Base 16
+  ASSERT_EQ(ToBaseCharsString<16>(5, 0, true), "00000");
+  ASSERT_EQ(ToBaseCharsString<16>(5, 42, true), "0002A");
+  ASSERT_EQ(ToBaseCharsString<16>(5, 42, false), "0002a");
+  ASSERT_EQ(ToBaseCharsString<16>(2, 255, false), "ff");
+  // Base 32
+  ASSERT_EQ(ToBaseCharsString<32>(2, 255, false), "7v");
+}
+
+TEST(SemaphoreTest, CountingSemaphore) {
+  CountingSemaphore sem{0};
+  int kCount = 5;
+  std::vector<std::thread> threads;
+  for (int i = 0; i < kCount; ++i) {
+    threads.emplace_back([&sem] { sem.Release(); });
+  }
+  for (int i = 0; i < kCount; ++i) {
+    threads.emplace_back([&sem] { sem.Acquire(); });
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+  // Nothing left on the semaphore
+  ASSERT_FALSE(sem.TryAcquire());
+  // Keep testing
+  sem.Release(2);
+  ASSERT_TRUE(sem.TryAcquire());
+  sem.Acquire();
+  ASSERT_FALSE(sem.TryAcquire());
+}
+
+TEST(SemaphoreTest, BinarySemaphore) {
+  BinarySemaphore sem{0};
+  int kCount = 5;
+  std::vector<std::thread> threads;
+  for (int i = 0; i < kCount; ++i) {
+    threads.emplace_back([&sem] {
+      sem.Acquire();
+      sem.Release();
+    });
+  }
+  threads.emplace_back([&sem] { sem.Release(); });
+  for (auto& t : threads) {
+    t.join();
+  }
+  // Only able to acquire one excess release
+  ASSERT_TRUE(sem.TryAcquire());
+  ASSERT_FALSE(sem.TryAcquire());
+}
+
+TEST(BitFieldsTest, BitFields) {
+  // Start by verifying example from BitFields comment
+  struct MyState : public BitFields<uint32_t, MyState> {
+    // Extra helper declarations and/or field type declarations
+  };
+
+  using Field1 = UnsignedBitField<MyState, 16, NoPrevBitField>;
+  using Field2 = BoolBitField<MyState, Field1>;
+  using Field3 = BoolBitField<MyState, Field2>;
+  using Field4 = UnsignedBitField<MyState, 5, Field3>;
+
+  // MyState{} is zero-initialized
+  auto state = MyState{}.With<Field1>(42U).With<Field2>(true);
+  state.Set<Field4>(3U);
+  state.Ref<Field1>() += state.Get<Field4>();
+
+  ASSERT_EQ(state.Get<Field1>(), 45U);
+  ASSERT_EQ(state.Get<Field2>(), true);
+  ASSERT_EQ(state.Get<Field3>(), false);
+  ASSERT_EQ(state.Get<Field4>(), 3U);
+
+  // Misc operators
+  auto ref = state.Ref<Field3>();
+  auto ref2 = std::move(ref);
+  ref2 = true;
+  ASSERT_EQ(state.Get<Field3>(), true);
+
+  MyState state2;
+  // Basic non-concurrent tests for atomic wrappers
+  {
+    RelaxedBitFieldsAtomic<MyState> relaxed{state};
+    ASSERT_EQ(state, relaxed.LoadRelaxed());
+    relaxed.StoreRelaxed(state2);
+    ASSERT_EQ(state2, relaxed.LoadRelaxed());
+    MyState state3 = relaxed.ExchangeRelaxed(state);
+    ASSERT_EQ(state2, state3);
+    ASSERT_TRUE(relaxed.CasStrongRelaxed(state, state2));
+    while (!relaxed.CasWeakRelaxed(state2, state)) {
+    }
+    ASSERT_EQ(state2, state3);
+    ASSERT_EQ(state, relaxed.LoadRelaxed());
+
+    auto transform1 = Field2::ClearTransform() + Field3::ClearTransform();
+    MyState before, after;
+    relaxed.ApplyRelaxed(transform1, &before, &after);
+    ASSERT_EQ(before, state);
+    ASSERT_NE(after, state);
+    ASSERT_EQ(after.Get<Field2>(), false);
+    ASSERT_EQ(after.Get<Field3>(), false);
+
+    auto transform2 = Field2::SetTransform() + Field3::SetTransform();
+    relaxed.ApplyRelaxed(transform2, &before, &after);
+    ASSERT_NE(before, state);
+    ASSERT_EQ(before.Get<Field2>(), false);
+    ASSERT_EQ(before.Get<Field3>(), false);
+    ASSERT_EQ(after, state);
+
+    ASSERT_EQ(state.Get<Field1>(), 45U);
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), true);
+    ASSERT_EQ(state.Get<Field4>(), 3U);
+
+    auto transform3 = Field1::PlusTransformPromiseNoOverflow(10000U) +
+                      Field4::MinusTransformPromiseNoUnderflow(3U);
+    relaxed.ApplyRelaxed(transform3, &before, &after);
+    ASSERT_EQ(before, state);
+    ASSERT_NE(after, state);
+    ASSERT_EQ(after.Get<Field1>(), 10045U);
+    ASSERT_EQ(after.Get<Field4>(), 0U);
+
+    auto transform4 = Field1::MinusTransformPromiseNoUnderflow(999U) +
+                      Field4::PlusTransformPromiseNoOverflow(31U);
+    relaxed.ApplyRelaxed(transform4, &before, &after);
+    ASSERT_EQ(after.Get<Field1>(), 9046U);
+    ASSERT_EQ(after.Get<Field4>(), 31U);
+
+    // Unmodified
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), true);
+
+    // Test overflow/underflow detection
+    relaxed.StoreRelaxed(MyState{}.With<Field1>(65535U));  // Field1 max value
+    ASSERT_TESTABLE_FAILURE(
+        relaxed.ApplyRelaxed(Field1::PlusTransformPromiseNoOverflow(1U)));
+    relaxed.StoreRelaxed(MyState{}.With<Field4>(31U));  // Field4 max value
+    ASSERT_TESTABLE_FAILURE(
+        relaxed.ApplyRelaxed(Field4::PlusTransformPromiseNoOverflow(1U)));
+    relaxed.StoreRelaxed(MyState{}.With<Field1>(0U));
+    ASSERT_TESTABLE_FAILURE(
+        relaxed.ApplyRelaxed(Field1::MinusTransformPromiseNoUnderflow(1U)));
+    relaxed.StoreRelaxed(MyState{}.With<Field4>(0U));
+    ASSERT_TESTABLE_FAILURE(
+        relaxed.ApplyRelaxed(Field4::MinusTransformPromiseNoUnderflow(1U)));
+    ASSERT_TESTABLE_FAILURE(relaxed.ApplyRelaxed(
+        Field4::MinusTransformPromiseNoUnderflow(64U)));  // Too big
+    ASSERT_TESTABLE_FAILURE(relaxed.ApplyRelaxed(
+        Field4::PlusTransformPromiseNoOverflow(64U)));  // Too big
+
+    // Including combinations
+    relaxed.StoreRelaxed(MyState{}.With<Field4>(31U));  // Field4 max value
+    relaxed.StoreRelaxed(MyState{}.With<Field1>(0U));
+    ASSERT_TESTABLE_FAILURE(
+        relaxed.ApplyRelaxed(Field4::PlusTransformPromiseNoOverflow(1U) +
+                             Field1::MinusTransformPromiseNoUnderflow(1U)));
+
+    // But a field at the limit of upper bits is allowed to over/underflow
+    using Field5 = UnsignedBitField<MyState, 9, Field4>;
+    relaxed.StoreRelaxed(MyState{}.With<Field5>(0));  // Field5 max value
+    relaxed.ApplyRelaxed(Field5::MinusTransformIgnoreUnderflow(1U), &before,
+                         &after);  // "Safe" underflow
+    ASSERT_EQ(after.Get<Field5>(), 511U);
+    relaxed.ApplyRelaxed(Field5::PlusTransformIgnoreOverflow(1U), &before,
+                         &after);  // "Safe" overflow
+    ASSERT_EQ(after.Get<Field5>(), 0U);
+    relaxed.ApplyRelaxed(Field5::PlusTransformIgnoreOverflow(2048U), &before,
+                         &after);  // "Safe" overflow
+    ASSERT_EQ(after.Get<Field5>(), 0U);
+  }
+  {
+    BitFieldsAtomic<MyState> acqrel{state};
+    ASSERT_EQ(state, acqrel.Load());
+    acqrel.Store(state2);
+    ASSERT_EQ(state2, acqrel.Load());
+    MyState state3 = acqrel.Exchange(state);
+    ASSERT_EQ(state2, state3);
+    ASSERT_TRUE(acqrel.CasStrong(state, state2));
+    while (!acqrel.CasWeak(state2, state)) {
+    }
+    ASSERT_EQ(state2, state3);
+    ASSERT_EQ(state, acqrel.Load());
+
+    auto transform1 = Field2::ClearTransform() + Field3::ClearTransform();
+    MyState before, after;
+    acqrel.Apply(transform1, &before, &after);
+    ASSERT_EQ(before, state);
+    ASSERT_NE(after, state);
+    ASSERT_EQ(after.Get<Field2>(), false);
+    ASSERT_EQ(after.Get<Field3>(), false);
+
+    auto transform2 = Field2::SetTransform() + Field3::SetTransform();
+    acqrel.Apply(transform2, &before, &after);
+    ASSERT_NE(before, state);
+    ASSERT_EQ(before.Get<Field2>(), false);
+    ASSERT_EQ(before.Get<Field3>(), false);
+    ASSERT_EQ(after, state);
+
+    ASSERT_EQ(state.Get<Field1>(), 45U);
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), true);
+    ASSERT_EQ(state.Get<Field4>(), 3U);
+
+    auto transform2a = Field2::And(true) + Field3::And(false);
+    acqrel.Apply(transform2a, &before, &after);
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), false);
+
+    auto transform2b = Field2::And(false) + Field3::And(true);
+    acqrel.Apply(transform2b, &before, &after);
+    ASSERT_EQ(after.Get<Field2>(), false);
+    ASSERT_EQ(after.Get<Field3>(), false);
+
+    auto transform2c = Field2::Or(true) + Field3::Or(false);
+    acqrel.Apply(transform2c, &before, &after);
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), false);
+
+    auto transform2d = Field2::Or(false) + Field3::Or(true);
+    acqrel.Apply(transform2d, &before, &after);
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), true);
+
+    ASSERT_EQ(state.Get<Field1>(), 45U);
+    ASSERT_EQ(state.Get<Field4>(), 3U);
+
+    auto transform3 = Field1::PlusTransformPromiseNoOverflow(10000U) +
+                      Field4::MinusTransformPromiseNoUnderflow(3U);
+    acqrel.Apply(transform3, &before, &after);
+    ASSERT_EQ(before, state);
+    ASSERT_NE(after, state);
+    ASSERT_EQ(after.Get<Field1>(), 10045U);
+    ASSERT_EQ(after.Get<Field4>(), 0U);
+
+    auto transform4 = Field1::MinusTransformPromiseNoUnderflow(999U) +
+                      Field4::PlusTransformPromiseNoOverflow(31U);
+    acqrel.Apply(transform4, &before, &after);
+    ASSERT_EQ(after.Get<Field1>(), 9046U);
+    ASSERT_EQ(after.Get<Field4>(), 31U);
+
+    auto transform4a =
+        Field1::AndTransform(8192U + 4096U) + Field4::AndTransform(15U);
+    acqrel.Apply(transform4a, &before, &after);
+    ASSERT_EQ(after.Get<Field1>(), 8192U);
+    ASSERT_EQ(after.Get<Field4>(), 15U);
+
+    auto transform4b = Field1::OrTransform(127U) + Field4::OrTransform(16U);
+    acqrel.Apply(transform4b, &before, &after);
+    ASSERT_EQ(after.Get<Field1>(), 8192U + 127U);
+    ASSERT_EQ(after.Get<Field4>(), 31U);
+
+    // Unmodified
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), true);
+
+    // Test overflow/underflow detection
+    acqrel.Store(MyState{}.With<Field1>(65535U));
+    ASSERT_TESTABLE_FAILURE(
+        acqrel.Apply(Field1::PlusTransformPromiseNoOverflow(1U)));
+    acqrel.Store(MyState{}.With<Field4>(0U));
+    ASSERT_TESTABLE_FAILURE(
+        acqrel.Apply(Field4::MinusTransformPromiseNoUnderflow(1U)));
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/util/slice_transform_test.cc b/util/slice_transform_test.cc
index 18b0ea51f327..9761699f41af 100644
--- a/util/slice_transform_test.cc
+++ b/util/slice_transform_test.cc
@@ -49,7 +49,7 @@ class SliceTransformDBTest : public testing::Test {
  private:
   std::string dbname_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 
  public:
   SliceTransformDBTest() : env_(Env::Default()), db_(nullptr) {
@@ -58,11 +58,11 @@ class SliceTransformDBTest : public testing::Test {
   }
 
   ~SliceTransformDBTest() override {
-    delete db_;
+    db_.reset();
     EXPECT_OK(DestroyDB(dbname_, last_options_));
   }
 
-  DB* db() { return db_; }
+  DB* db() { return db_.get(); }
 
   // Return the current option configuration.
   Options* GetOptions() { return &last_options_; }
@@ -74,14 +74,12 @@ class SliceTransformDBTest : public testing::Test {
   }
 
   void Destroy() {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, last_options_));
   }
 
   Status TryReopen() {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     last_options_.create_if_missing = true;
 
     return DB::Open(last_options_, dbname_, &db_);
diff --git a/util/status.cc b/util/status.cc
index 8f49077406bc..cf9e59e96757 100644
--- a/util/status.cc
+++ b/util/status.cc
@@ -46,6 +46,9 @@ static const char* msgs[static_cast<int>(Status::kMaxSubCode)] = {
     "IO fenced off",          // kIOFenced
     "Merge operator failed",  // kMergeOperatorFailed
     "Number of operands merged exceeded threshold",  // kMergeOperandThresholdExceeded
+    "MultiScan reached file prefetch limit",         // kPrefetchLimitReached
+    "Not expected code path",                        // kNotExpectedCodePath
+    "All compactions aborted",                       // kCompactionAborted
 };
 
 Status::Status(Code _code, SubCode _subcode, const Slice& msg,
diff --git a/util/stop_watch.h b/util/stop_watch.h
index 28781304577d..36ae9bea802b 100644
--- a/util/stop_watch.h
+++ b/util/stop_watch.h
@@ -102,6 +102,7 @@ class StopWatch {
 };
 
 // a nano second precision stopwatch
+template <bool use_cpu_time = false>
 class StopWatchNano {
  public:
   explicit StopWatchNano(SystemClock* clock, bool auto_start = false)
@@ -110,27 +111,36 @@ class StopWatchNano {
       Start();
     }
   }
-
-  void Start() { start_ = clock_->NowNanos(); }
-
+  void Start() {
+    if constexpr (use_cpu_time) {
+      start_ = clock_->CPUNanos();
+    } else {
+      start_ = clock_->NowNanos();
+    }
+  }
   uint64_t ElapsedNanos(bool reset = false) {
-    auto now = clock_->NowNanos();
+    uint64_t now = 0;
+    if constexpr (use_cpu_time) {
+      now = clock_->CPUNanos();
+    } else {
+      now = clock_->NowNanos();
+    }
     auto elapsed = now - start_;
     if (reset) {
       start_ = now;
     }
     return elapsed;
   }
-
   uint64_t ElapsedNanosSafe(bool reset = false) {
     return (clock_ != nullptr) ? ElapsedNanos(reset) : 0U;
   }
-
   bool IsStarted() { return start_ != 0; }
+  uint64_t ElapsedMicros(bool reset = false) {
+    return ElapsedNanos(reset) / 1000;
+  }
 
  private:
   SystemClock* clock_;
   uint64_t start_;
 };
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/string_util.cc b/util/string_util.cc
index 2a45c3a0ee8f..0dc3e7158e9f 100644
--- a/util/string_util.cc
+++ b/util/string_util.cc
@@ -20,20 +20,6 @@
 #include "port/sys_time.h"
 #include "rocksdb/slice.h"
 
-#ifndef __has_cpp_attribute
-#define ROCKSDB_HAS_CPP_ATTRIBUTE(x) 0
-#else
-#define ROCKSDB_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
-#endif
-
-#if ROCKSDB_HAS_CPP_ATTRIBUTE(maybe_unused) && __cplusplus >= 201703L
-#define ROCKSDB_MAYBE_UNUSED [[maybe_unused]]
-#elif ROCKSDB_HAS_CPP_ATTRIBUTE(gnu::unused) || __GNUC__
-#define ROCKSDB_MAYBE_UNUSED [[gnu::unused]]
-#else
-#define ROCKSDB_MAYBE_UNUSED
-#endif
-
 namespace ROCKSDB_NAMESPACE {
 
 const std::string kNullptrString = "nullptr";
@@ -501,7 +487,7 @@ bool TryParseTimeRangeString(const std::string& value, int& start_time,
 // selects proper function.
 
 #if !(defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER)))
-ROCKSDB_MAYBE_UNUSED
+[[maybe_unused]]
 static std::string invoke_strerror_r(int (*strerror_r)(int, char*, size_t),
                                      int err, char* buf, size_t buflen) {
   // Using XSI-compatible strerror_r
@@ -515,7 +501,7 @@ static std::string invoke_strerror_r(int (*strerror_r)(int, char*, size_t),
   return buf;
 }
 
-ROCKSDB_MAYBE_UNUSED
+[[maybe_unused]]
 static std::string invoke_strerror_r(char* (*strerror_r)(int, char*, size_t),
                                      int err, char* buf, size_t buflen) {
   // Using GNU strerror_r
diff --git a/util/string_util.h b/util/string_util.h
index 1374642a6cd7..818349870883 100644
--- a/util/string_util.h
+++ b/util/string_util.h
@@ -40,6 +40,16 @@ inline void PutBaseChars(char** buf, size_t n, uint64_t v, bool uppercase) {
   *buf += n;
 }
 
+// Construct a string of n digits from v in base kBase
+template <size_t kBase>
+inline std::string ToBaseCharsString(size_t n, uint64_t v, bool uppercase) {
+  std::string result;
+  result.resize(n);
+  char* buf = &result[0];
+  PutBaseChars<kBase>(&buf, n, v, uppercase);
+  return result;
+}
+
 // Parse n digits from *buf in base kBase to *v and advance *buf to the
 // position after what was read. On success, true is returned. On failure,
 // false is returned, *buf is placed at the first bad character, and *v
diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc
index 4899b98ac4d9..76170768a146 100644
--- a/util/thread_list_test.cc
+++ b/util/thread_list_test.cc
@@ -10,7 +10,7 @@
 #include "rocksdb/db.h"
 #include "test_util/testharness.h"
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -359,4 +359,4 @@ int main(int argc, char** argv) {
   return 0;
 }
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
diff --git a/util/thread_operation.h b/util/thread_operation.h
index 7d906572615d..91c26f99079b 100644
--- a/util/thread_operation.h
+++ b/util/thread_operation.h
@@ -19,7 +19,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 
 // The structure that describes a major thread operation.
 struct OperationInfo {
@@ -47,7 +47,8 @@ static OperationInfo global_operation_table[] = {
     {ThreadStatus::OP_VERIFY_FILE_CHECKSUMS, "VerifyFileChecksums"},
     {ThreadStatus::OP_GETENTITY, "GetEntity"},
     {ThreadStatus::OP_MULTIGETENTITY, "MultiGetEntity"},
-    {ThreadStatus::OP_READ_MANIFEST, "ReadManifest"},
+    {ThreadStatus::OP_GET_FILE_CHECKSUMS_FROM_CURRENT_MANIFEST,
+     "GetFileChecksumsFromCurrentManifest"},
 
 };
 
@@ -119,5 +120,5 @@ struct OperationInfo {};
 
 struct StateInfo {};
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/threadpool_imp.cc b/util/threadpool_imp.cc
index 8397c4b39072..901de90555f4 100644
--- a/util/threadpool_imp.cc
+++ b/util/threadpool_imp.cc
@@ -324,7 +324,7 @@ void ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) {
   BGThreadMetadata* meta = static_cast<BGThreadMetadata*>(arg);
   size_t thread_id = meta->thread_id_;
   ThreadPoolImpl::Impl* tp = meta->thread_pool_;
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   // initialize it because compiler isn't good enough to see we don't use it
   // uninitialized
   ThreadStatus::ThreadType thread_type = ThreadStatus::NUM_THREAD_TYPES;
@@ -350,7 +350,7 @@ void ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) {
 #endif
   delete meta;
   tp->BGThread(thread_id);
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   ThreadStatusUtil::UnregisterThread();
 #endif
   return;
diff --git a/util/timer_queue_test.cc b/util/timer_queue_test.cc
index b3c3768ec797..3afae866290d 100644
--- a/util/timer_queue_test.cc
+++ b/util/timer_queue_test.cc
@@ -28,6 +28,10 @@
 
 #include <future>
 
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
 namespace Timing {
 
 using Clock = std::chrono::high_resolution_clock;
@@ -39,7 +43,9 @@ double now() {
 
 }  // namespace Timing
 
-int main() {
+class TimerQueueTest : public testing::Test {};
+
+TEST_F(TimerQueueTest, BasicFunctionality) {
   TimerQueue q;
 
   double tnow = Timing::now();
@@ -68,6 +74,14 @@ int main() {
   // assert(ret == 1);
   // q.cancelAll();
 
-  return 0;
+  // Test passes if we can create and add timers without crashing
+  ASSERT_TRUE(true);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-//////////////////////////////////////////
diff --git a/util/udt_util.cc b/util/udt_util.cc
index 7a0eeb2e3d38..555dcf5d1645 100644
--- a/util/udt_util.cc
+++ b/util/udt_util.cc
@@ -429,11 +429,24 @@ void GetFullHistoryTsLowFromU64CutoffTs(Slice* cutoff_ts,
   PutFixed64(full_history_ts_low, cutoff_udt_ts + 1);
 }
 
-std::tuple<std::optional<Slice>, std::optional<Slice>>
-MaybeAddTimestampsToRange(const Slice* start, const Slice* end, size_t ts_sz,
-                          std::string* start_with_ts, std::string* end_with_ts,
-                          bool exclusive_end) {
-  std::optional<Slice> ret_start, ret_end;
+void GetU64CutoffTsFromFullHistoryTsLow(Slice* full_history_ts_low,
+                                        std::string* cutoff_ts) {
+  uint64_t full_history_ts_low_int = 0;
+  [[maybe_unused]] bool format_res =
+      GetFixed64(full_history_ts_low, &full_history_ts_low_int);
+  assert(format_res);
+  assert(full_history_ts_low_int > 0);
+  if (full_history_ts_low_int > 0) {
+    PutFixed64(cutoff_ts, full_history_ts_low_int - 1);
+  } else {
+    PutFixed64(cutoff_ts, 0);
+  }
+}
+
+std::tuple<OptSlice, OptSlice> MaybeAddTimestampsToRange(
+    const OptSlice& start, const OptSlice& end, size_t ts_sz,
+    std::string* start_with_ts, std::string* end_with_ts, bool exclusive_end) {
+  OptSlice ret_start, ret_end;
   if (start) {
     if (ts_sz == 0) {
       ret_start = *start;
diff --git a/util/udt_util.h b/util/udt_util.h
index 51ea76e8544e..a9736e433d6c 100644
--- a/util/udt_util.h
+++ b/util/udt_util.h
@@ -275,14 +275,17 @@ Status ValidateUserDefinedTimestampsOptions(
 void GetFullHistoryTsLowFromU64CutoffTs(Slice* cutoff_ts,
                                         std::string* full_history_ts_low);
 
+// The reverse of `GetFullHistoryTsLowFromU64CutoffTs`.
+void GetU64CutoffTsFromFullHistoryTsLow(Slice* full_history_ts_low,
+                                        std::string* cutoff_ts);
+
 // `start` is the inclusive lower user key bound without user-defined timestamp.
 // `end` is the upper user key bound without user-defined timestamp.
 // By default, `end` is treated as being exclusive. If `exclusive_end` is set to
-// false, it's treated as an inclusive upper bound.
-// If any of these two bounds is nullptr, an empty std::optional<Slice> is
-// returned for that bound.
-std::tuple<std::optional<Slice>, std::optional<Slice>>
-MaybeAddTimestampsToRange(const Slice* start, const Slice* end, size_t ts_sz,
-                          std::string* start_with_ts, std::string* end_with_ts,
-                          bool exclusive_end = true);
+// false, it's treated as an inclusive upper bound. For either bound that has no
+// value, a "no value" OptSlice is returned for that bound.
+std::tuple<OptSlice, OptSlice> MaybeAddTimestampsToRange(
+    const OptSlice& start, const OptSlice& end, size_t ts_sz,
+    std::string* start_with_ts, std::string* end_with_ts,
+    bool exclusive_end = true);
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/backup/backup_engine.cc b/utilities/backup/backup_engine.cc
index 76ce993b20ff..3eedfa13c6c6 100644
--- a/utilities/backup/backup_engine.cc
+++ b/utilities/backup/backup_engine.cc
@@ -615,6 +615,10 @@ class BackupEngineImpl {
                                       std::string* checksum_hex,
                                       const Temperature src_temperature) const;
 
+  // Helper method to check if backup should be stopped. Can be overridden
+  // via sync points for testing.
+  bool ShouldStopBackup() const;
+
   // Obtain db_id and db_session_id from the table properties of file_path
   Status GetFileDbIdentities(Env* src_env, const EnvOptions& src_env_options,
                              const std::string& file_path,
@@ -2353,6 +2357,10 @@ IOStatus BackupEngineImpl::CopyOrCreateFile(
     Temperature dst_temperature, uint64_t* bytes_toward_next_callback,
     uint64_t* size, std::string* checksum_hex) {
   assert(src.empty() != contents.empty());
+  if (ShouldStopBackup()) {
+    return status_to_io_status(Status::Incomplete("Backup stopped"));
+  }
+
   IOStatus io_s;
   std::unique_ptr<FSWritableFile> dst_file;
   std::unique_ptr<FSSequentialFile> src_file;
@@ -2372,7 +2380,11 @@ IOStatus BackupEngineImpl::CopyOrCreateFile(
 
   io_s = dst_env->GetFileSystem()->NewWritableFile(dst, dst_file_options,
                                                    &dst_file, nullptr);
-  if (io_s.ok() && !src.empty()) {
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  if (!src.empty()) {
     auto src_file_options = FileOptions(src_env_options);
     src_file_options.temperature = *src_temperature;
     io_s = src_env->GetFileSystem()->NewSequentialFile(src, src_file_options,
@@ -2409,7 +2421,7 @@ IOStatus BackupEngineImpl::CopyOrCreateFile(
   Slice data;
   const IOOptions opts;
   do {
-    if (stop_backup_.load(std::memory_order_acquire)) {
+    if (ShouldStopBackup()) {
       return status_to_io_status(Status::Incomplete("Backup stopped"));
     }
     if (!src.empty()) {
@@ -2745,6 +2757,12 @@ IOStatus BackupEngineImpl::AddBackupFileWorkItem(
   return IOStatus::OK();
 }
 
+bool BackupEngineImpl::ShouldStopBackup() const {
+  bool should_stop = stop_backup_.load(std::memory_order_acquire);
+  TEST_SYNC_POINT_CALLBACK("BackupEngineImpl::ShouldStopBackup", &should_stop);
+  return should_stop;
+}
+
 IOStatus BackupEngineImpl::ReadFileAndComputeChecksum(
     const std::string& src, const std::shared_ptr<FileSystem>& src_fs,
     const EnvOptions& src_env_options, uint64_t size_limit,
@@ -2752,6 +2770,9 @@ IOStatus BackupEngineImpl::ReadFileAndComputeChecksum(
   if (checksum_hex == nullptr) {
     return status_to_io_status(Status::Aborted("Checksum pointer is null"));
   }
+  if (ShouldStopBackup()) {
+    return status_to_io_status(Status::Incomplete("Backup stopped"));
+  }
   uint32_t checksum_value = 0;
   if (size_limit == 0) {
     size_limit = std::numeric_limits<uint64_t>::max();
@@ -2779,7 +2800,7 @@ IOStatus BackupEngineImpl::ReadFileAndComputeChecksum(
   Slice data;
 
   do {
-    if (stop_backup_.load(std::memory_order_acquire)) {
+    if (ShouldStopBackup()) {
       return status_to_io_status(Status::Incomplete("Backup stopped"));
     }
     size_t buffer_to_read =
@@ -2825,7 +2846,7 @@ Status BackupEngineImpl::GetFileDbIdentities(Env* src_env,
     // Try to get table properties from the table reader of sst_reader
     if (!sst_reader.ReadTableProperties(&tp).ok()) {
       // FIXME (peterd): this logic is untested and seems obsolete.
-      // Try to use table properites from the initialization of sst_reader
+      // Try to use table properties from the initialization of sst_reader
       table_properties = sst_reader.GetInitTableProperties();
     } else {
       table_properties = tp.get();
diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc
index e01911be29c9..ff5c7378e7e3 100644
--- a/utilities/backup/backup_engine_test.cc
+++ b/utilities/backup/backup_engine_test.cc
@@ -43,6 +43,7 @@
 #include "test_util/sync_point.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
+#include "util/atomic.h"
 #include "util/cast_util.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
@@ -135,6 +136,7 @@ class DummyDB : public StackableDB {
   }
 
   // To avoid FlushWAL called on stacked db which is nullptr
+  using DB::FlushWAL;
   Status FlushWAL(bool /*sync*/) override { return Status::OK(); }
 
   std::vector<std::string> live_files_;
@@ -756,8 +758,8 @@ class BackupEngineTest : public testing::Test {
     ASSERT_OK(CreateLoggerFromOptions(dbname_, logger_options, &logger_));
   }
 
-  DB* OpenDB() {
-    DB* db;
+  std::unique_ptr<DB> OpenDB() {
+    std::unique_ptr<DB> db;
     EXPECT_OK(DB::Open(options_, dbname_, &db));
     return db;
   }
@@ -768,13 +770,11 @@ class BackupEngineTest : public testing::Test {
 
     // Open DB
     test_db_fs_->SetLimitWrittenFiles(1000000);
-    DB* db;
     if (read_only) {
-      ASSERT_OK(DB::OpenForReadOnly(options_, dbname_, &db));
+      ASSERT_OK(DB::OpenForReadOnly(options_, dbname_, &db_));
     } else {
-      ASSERT_OK(DB::Open(options_, dbname_, &db));
+      ASSERT_OK(DB::Open(options_, dbname_, &db_));
     }
-    db_.reset(db);
   }
 
   void InitializeDBAndBackupEngine(bool dummy = false) {
@@ -782,14 +782,12 @@ class BackupEngineTest : public testing::Test {
     test_db_fs_->SetLimitWrittenFiles(1000000);
     test_db_fs_->SetDummySequentialFile(dummy);
 
-    DB* db;
     if (dummy) {
       dummy_db_ = new DummyDB(options_, dbname_);
-      db = dummy_db_;
+      db_.reset(dummy_db_);
     } else {
-      ASSERT_OK(DB::Open(options_, dbname_, &db));
+      ASSERT_OK(DB::Open(options_, dbname_, &db_));
     }
-    db_.reset(db);
   }
 
   virtual void OpenDBAndBackupEngine(
@@ -912,13 +910,13 @@ class BackupEngineTest : public testing::Test {
       ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_,
                                                           restore_options));
     }
-    DB* db = OpenDB();
+    auto db = OpenDB();
     // Check DB contents
-    AssertExists(db, start_exist, end_exist);
+    AssertExists(db.get(), start_exist, end_exist);
     if (end != 0) {
-      AssertEmpty(db, end_exist, end);
+      AssertEmpty(db.get(), end_exist, end);
     }
-    delete db;
+    db.reset();
     if (opened_backup_engine) {
       CloseBackupEngine();
     }
@@ -1061,6 +1059,7 @@ class BackupEngineTest : public testing::Test {
   // all the dbs!
   DummyDB* dummy_db_;  // owned as db_ when present
   std::unique_ptr<DB> db_;
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
   std::unique_ptr<BackupEngine> backup_engine_;
 
   // options
@@ -1201,7 +1200,7 @@ TEST_F(BackupEngineTest, IncrementalRestore) {
     // Since we started with a blank db, restore copied all the files.
     test_db_fs_->AssertWrittenFiles(all_files);
 
-    db_.reset(OpenDB());
+    db_ = OpenDB();
 
     // Check DB contents.
     AssertExists(db_.get(), 0, keys_iteration * 2);
@@ -1253,7 +1252,7 @@ TEST_F(BackupEngineTest, IncrementalRestore) {
     test_db_fs_->AssertWrittenFiles(should_have_written);
 
     // Check DB contents.
-    db_.reset(OpenDB());
+    db_ = OpenDB();
     AssertExists(db_.get(), 0, keys_iteration * 2);
 
     db_.reset();  // Close DB.
@@ -1305,7 +1304,7 @@ TEST_F(BackupEngineTest, IncrementalRestore) {
     // 'Hole' has been patched, 'in-policy' db files were retained.
     test_db_fs_->AssertWrittenFiles(should_have_written);
 
-    db_.reset(OpenDB());
+    db_ = OpenDB();
     Status s = db_->VerifyChecksum();
 
     // Check DB contents.
@@ -1422,9 +1421,9 @@ TEST_P(BackupEngineTestWithParam, OfflineIntegrationTest) {
       DestroyDBWithoutCheck(dbname_, options_);
 
       // ---- make sure it's empty ----
-      DB* db = OpenDB();
-      AssertEmpty(db, 0, fill_up_to);
-      delete db;
+      auto db = OpenDB();
+      AssertEmpty(db.get(), 0, fill_up_to);
+      db.reset();
 
       // ---- restore the DB ----
       OpenBackupEngine();
@@ -1476,9 +1475,9 @@ TEST_P(BackupEngineTestWithParam, OnlineIntegrationTest) {
   DestroyDBWithoutCheck(dbname_, options_);
 
   // ---- make sure it's empty ----
-  DB* db = OpenDB();
-  AssertEmpty(db, 0, max_key);
-  delete db;
+  auto db = OpenDB();
+  AssertEmpty(db.get(), 0, max_key);
+  db.reset();
 
   // ---- restore every backup and verify all the data is there ----
   OpenBackupEngine();
@@ -2089,10 +2088,9 @@ TEST_F(BackupEngineTest, FlushCompactDuringBackupCheckpoint) {
           "BackupEngineTest::FlushCompactDuringBackupCheckpoint:Before");
       FillDB(db_.get(), keys_iteration, 2 * keys_iteration);
       ASSERT_OK(db_->Flush(FlushOptions()));
-      DBImpl* dbi = static_cast<DBImpl*>(db_.get());
-      ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
       ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-      ASSERT_OK(dbi->TEST_WaitForCompact());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
       TEST_SYNC_POINT(
           "BackupEngineTest::FlushCompactDuringBackupCheckpoint:After");
     }};
@@ -2139,7 +2137,7 @@ TEST_F(BackupEngineTest, BackupOptions) {
     // Must reset() before reset(OpenDB()) again.
     // Calling OpenDB() while *db_ is existing will cause LOCK issue
     db_.reset();
-    db_.reset(OpenDB());
+    db_ = OpenDB();
     ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
     ASSERT_OK(ROCKSDB_NAMESPACE::GetLatestOptionsFileName(db_->GetName(),
                                                           options_.env, &name));
@@ -2167,13 +2165,12 @@ TEST_F(BackupEngineTest, SetOptionsBackupRaceCondition) {
   ROCKSDB_NAMESPACE::port::Thread setoptions_thread{[this]() {
     TEST_SYNC_POINT(
         "BackupEngineTest::SetOptionsBackupRaceCondition:BeforeSetOptions");
-    DBImpl* dbi = static_cast<DBImpl*>(db_.get());
     // Change arbitrary option to trigger OPTIONS file deletion
-    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+    ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                               {{"paranoid_file_checks", "false"}}));
-    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+    ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                               {{"paranoid_file_checks", "true"}}));
-    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+    ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                               {{"paranoid_file_checks", "false"}}));
     TEST_SYNC_POINT(
         "BackupEngineTest::SetOptionsBackupRaceCondition:AfterSetOptions");
@@ -2431,14 +2428,13 @@ TEST_F(BackupEngineTest, TableFileCorruptionBeforeIncremental) {
         engine_options_->share_files_with_checksum_naming = option;
       }
       OpenDBAndBackupEngine(true, false, share);
-      DBImpl* dbi = static_cast<DBImpl*>(db_.get());
       // A small SST file
-      ASSERT_OK(dbi->Put(WriteOptions(), "x", "y"));
-      ASSERT_OK(dbi->Flush(FlushOptions()));
+      ASSERT_OK(db_->Put(WriteOptions(), "x", "y"));
+      ASSERT_OK(db_->Flush(FlushOptions()));
       // And a bigger one
-      ASSERT_OK(dbi->Put(WriteOptions(), "y", Random(42).RandomString(500)));
-      ASSERT_OK(dbi->Flush(FlushOptions()));
-      ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+      ASSERT_OK(db_->Put(WriteOptions(), "y", Random(42).RandomString(500)));
+      ASSERT_OK(db_->Flush(FlushOptions()));
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
       CloseAndReopenDB(/*read_only*/ true);
 
       std::vector<FileAttributes> table_files;
@@ -2483,9 +2479,8 @@ TEST_F(BackupEngineTest, TableFileCorruptionBeforeIncremental) {
       db_.reset();
       ASSERT_OK(backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_));
       {
-        DB* db = OpenDB();
+        auto db = OpenDB();
         s = db->VerifyChecksum();
-        delete db;
       }
       if (option != kLegacyCrc32cAndFileSize && !corrupt_before_first_backup) {
         // Second backup is OK because it used (uncorrupt) file from first
@@ -2527,11 +2522,10 @@ TEST_F(BackupEngineTest, TableFileCorruptionBeforeIncremental) {
 
 TEST_F(BackupEngineTest, PropertiesBlockCorruptionIncremental) {
   OpenDBAndBackupEngine(true, false, kShareWithChecksum);
-  DBImpl* dbi = static_cast<DBImpl*>(db_.get());
   // A small SST file
-  ASSERT_OK(dbi->Put(WriteOptions(), "x", "y"));
-  ASSERT_OK(dbi->Flush(FlushOptions()));
-  ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+  ASSERT_OK(db_->Put(WriteOptions(), "x", "y"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
 
   ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
 
@@ -3348,9 +3342,9 @@ TEST_F(BackupEngineTest, ReadOnlyBackupEngine) {
   std::vector<std::string> should_have_written;
   test_backup_fs_->AssertWrittenFiles(should_have_written);
 
-  DB* db = OpenDB();
-  AssertExists(db, 0, 200);
-  delete db;
+  auto db = OpenDB();
+  AssertExists(db.get(), 0, 200);
+  db.reset();
 }
 
 TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) {
@@ -3383,7 +3377,7 @@ TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) {
   // Caution: DBOptions only holds a raw pointer to Env, so something else
   // must keep it alive.
   // Case 1: Keeping BackupEngine open suffices to keep Env alive
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   Options opts = options_;
   // Ensure some key defaults are set
   opts.wal_dir = "";
@@ -3395,11 +3389,10 @@ TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) {
   backup_info = BackupInfo();
   ASSERT_OK(DB::OpenForReadOnly(opts, name, &db));
 
-  AssertExists(db, 0, 100);
-  AssertEmpty(db, 100, 200);
+  AssertExists(db.get(), 0, 100);
+  AssertEmpty(db.get(), 100, 200);
 
-  delete db;
-  db = nullptr;
+  db.reset();
 
   // Case 2: Keeping BackupInfo alive rather than BackupEngine also suffices
   ASSERT_OK(backup_engine_->GetBackupInfo(/*id*/ 2U, &backup_info,
@@ -3411,12 +3404,14 @@ TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) {
   // Note: keeping backup_info alive
   ASSERT_OK(DB::OpenForReadOnly(opts, name, &db));
 
-  AssertExists(db, 0, 200);
-  delete db;
-  db = nullptr;
+  AssertExists(db.get(), 0, 200);
+  db.reset();
 
   // Now try opening read-write and make sure it fails, for safety.
-  ASSERT_TRUE(DB::Open(opts, name, &db).IsIOError());
+  {
+    std::unique_ptr<DB> dbptr;
+    ASSERT_TRUE(DB::Open(opts, name, &dbptr).IsIOError());
+  }
 }
 
 TEST_F(BackupEngineTest, ProgressCallbackDuringBackup) {
@@ -3540,6 +3535,7 @@ TEST_F(BackupEngineTest, EnvFailures) {
 TEST_F(BackupEngineTest, ChangeManifestDuringBackupCreation) {
   DestroyDBWithoutCheck(dbname_, options_);
   options_.max_manifest_file_size = 0;  // always rollover manifest for file add
+  options_.max_manifest_space_amp_pct = 0;
   OpenDBAndBackupEngine(true);
   FillDB(db_.get(), 0, 100, kAutoFlushOnly);
 
@@ -3562,16 +3558,15 @@ TEST_F(BackupEngineTest, ChangeManifestDuringBackupCreation) {
   // The last manifest roll would've already been cleaned up by the full scan
   // that happens when CreateNewBackup invokes EnableFileDeletions. We need to
   // trigger another roll to verify non-full scan purges stale manifests.
-  DBImpl* db_impl = static_cast_with_check<DBImpl>(db_.get());
   std::string prev_manifest_path =
-      DescriptorFileName(dbname_, db_impl->TEST_Current_Manifest_FileNo());
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
   FillDB(db_.get(), 0, 100, kAutoFlushOnly);
   ASSERT_OK(db_chroot_env_->FileExists(prev_manifest_path));
   ASSERT_OK(db_->Flush(FlushOptions()));
   // Even though manual flush completed above, the background thread may not
   // have finished its cleanup work. `TEST_WaitForBackgroundWork()` will wait
   // until all the background thread's work has completed, including cleanup.
-  ASSERT_OK(db_impl->TEST_WaitForBackgroundWork());
+  ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
   ASSERT_TRUE(db_chroot_env_->FileExists(prev_manifest_path).IsNotFound());
 
   CloseDBAndBackupEngine();
@@ -3937,7 +3932,7 @@ TEST_F(BackupEngineTest, Concurrency) {
       // by doing it async and ensuring we either get OK or InvalidArgument
       restore_verify_threads[i] =
           std::thread([this, &db_opts, restore_db_dir, to_restore] {
-            DB* restored;
+            std::unique_ptr<DB> restored;
             Status s;
             for (;;) {
               s = DB::Open(db_opts, restore_db_dir, &restored);
@@ -3953,10 +3948,9 @@ TEST_F(BackupEngineTest, Concurrency) {
               }
             }
             int factor = std::min(static_cast<int>(to_restore), max_factor);
-            AssertExists(restored, 0, factor * keys_iteration);
-            AssertEmpty(restored, factor * keys_iteration,
+            AssertExists(restored.get(), 0, factor * keys_iteration);
+            AssertEmpty(restored.get(), factor * keys_iteration,
                         (factor + 1) * keys_iteration);
-            delete restored;
           });
 
       // (Ok now) Restore one of the backups, or "latest"
@@ -4415,14 +4409,13 @@ TEST_F(BackupEngineTest, FileTemperatures) {
                         kShareWithChecksum);
 
   // generate a bottommost file (combined from 2) and a non-bottommost file
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_.get());
   ASSERT_OK(db_->Put(WriteOptions(), "a", "val"));
   ASSERT_OK(db_->Put(WriteOptions(), "c", "val"));
   ASSERT_OK(db_->Flush(FlushOptions()));
   ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
   ASSERT_OK(db_->Put(WriteOptions(), "d", "val"));
   ASSERT_OK(db_->Flush(FlushOptions()));
-  ASSERT_OK(dbi->TEST_WaitForCompact());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_OK(db_->Put(WriteOptions(), "e", "val"));
   ASSERT_OK(db_->Flush(FlushOptions()));
 
@@ -4580,7 +4573,7 @@ TEST_F(BackupEngineTest, ExcludeFiles) {
 
     // Ensure each backup is same set of files
     db_.reset();
-    DB* db = nullptr;
+    std::unique_ptr<DB> db;
     ASSERT_OK(DB::OpenForReadOnly(options_, dbname_, &db));
 
     // A callback that throws should cleanly fail the backup creation.
@@ -4590,12 +4583,12 @@ TEST_F(BackupEngineTest, ExcludeFiles) {
                                     MaybeExcludeBackupFile* /*files_end*/) {
       throw 42;
     };
-    ASSERT_TRUE(backup_engine_->CreateNewBackup(cbo, db).IsAborted());
+    ASSERT_TRUE(backup_engine_->CreateNewBackup(cbo, db.get()).IsAborted());
     cbo.exclude_files_callback = [](MaybeExcludeBackupFile* /*files_begin*/,
                                     MaybeExcludeBackupFile* /*files_end*/) {
       throw std::out_of_range("blah");
     };
-    ASSERT_TRUE(backup_engine_->CreateNewBackup(cbo, db).IsAborted());
+    ASSERT_TRUE(backup_engine_->CreateNewBackup(cbo, db.get()).IsAborted());
 
     // Include files only in given bucket, based on modulus and remainder
     constexpr int modulus = 4;
@@ -4616,22 +4609,21 @@ TEST_F(BackupEngineTest, ExcludeFiles) {
     BackupID first_id{};
     BackupID last_alt_id{};
     remainder = 0;
-    ASSERT_OK(backup_engine_->CreateNewBackup(cbo, db, &first_id));
+    ASSERT_OK(backup_engine_->CreateNewBackup(cbo, db.get(), &first_id));
     AssertBackupInfoConsistency(/*allow excluded*/ true);
     remainder = 1;
-    ASSERT_OK(alt_backup_engine->CreateNewBackup(cbo, db));
+    ASSERT_OK(alt_backup_engine->CreateNewBackup(cbo, db.get()));
     AssertBackupInfoConsistency(/*allow excluded*/ true);
     remainder = 2;
-    ASSERT_OK(backup_engine_->CreateNewBackup(cbo, db));
+    ASSERT_OK(backup_engine_->CreateNewBackup(cbo, db.get()));
     AssertBackupInfoConsistency(/*allow excluded*/ true);
     remainder = 3;
-    ASSERT_OK(alt_backup_engine->CreateNewBackup(cbo, db, &last_alt_id));
+    ASSERT_OK(alt_backup_engine->CreateNewBackup(cbo, db.get(), &last_alt_id));
     AssertBackupInfoConsistency(/*allow excluded*/ true);
 
     // Close DB
     ASSERT_OK(db->Close());
-    delete db;
-    db = nullptr;
+    db.reset();
 
     auto backup_engine = backup_engine_.get();
     for (auto be_pair : {std::make_pair(backup_engine, alt_backup_engine),
@@ -4649,8 +4641,8 @@ TEST_F(BackupEngineTest, ExcludeFiles) {
 
       // Check DB contents
       db = OpenDB();
-      AssertExists(db, 0, keys_iteration);
-      delete db;
+      AssertExists(db.get(), 0, keys_iteration);
+      db.reset();
     }
 
     // Should still work after close and re-open
@@ -4788,6 +4780,79 @@ TEST_F(BackupEngineTest, IOBufferSize) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 }
 
+// Test stopping backup at different points in the backup lifecycle
+// Uses randomized stop points with geometric distribution to better catch
+// edge cases across multiple iterations.
+TEST_F(BackupEngineTest, StopBackupAtDifferentStages) {
+  const int keys_iteration = 5000;
+  const int num_iterations = 10;
+
+  // Enable multi-threaded backup
+  engine_options_->max_background_operations = 7;
+
+  // Generate DB once and reuse across iterations
+  OpenDBAndBackupEngine(true);
+  FillDB(db_.get(), 0, keys_iteration);
+
+  Random rnd(301);
+
+  for (int iteration = 0; iteration < num_iterations; iteration++) {
+    // Generate stop threshold using skewed distribution
+    // Smaller numbers are more likely, which is more interesting for testing
+    // Range: [0, 2^7-1] = [0, 127] with exponential bias towards 0
+    int stop_after_calls = rnd.Skewed(7);
+
+    RelaxedAtomic<int> call_count{0};
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "BackupEngineImpl::ShouldStopBackup", [&](void* arg) {
+          call_count.FetchAddRelaxed(1);
+          if (call_count.LoadRelaxed() > stop_after_calls) {
+            bool* should_stop = static_cast<bool*>(arg);
+            *should_stop = true;
+          }
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    // Create backup - it may complete successfully or be stopped
+    IOStatus s = backup_engine_->CreateNewBackup(db_.get());
+
+    // Verify that ShouldStopBackup was called
+    ASSERT_GT(call_count.LoadRelaxed(), 0);
+
+    if (s.IsIncomplete()) {
+      // Backup was stopped - verify it's the expected error
+      ASSERT_TRUE(s.ToString().find("Backup stopped") != std::string::npos)
+          << "Unexpected incomplete status for threshold " << stop_after_calls
+          << ": " << s.ToString();
+      ASSERT_GT(call_count.LoadRelaxed(), stop_after_calls)
+          << "Expected call_count > stop_after_calls";
+
+      // Verify that no valid backup was created
+      std::vector<BackupInfo> backup_info;
+      backup_engine_->GetBackupInfo(&backup_info);
+      ASSERT_EQ(0, backup_info.size());
+    } else {
+      // Backup completed successfully before reaching the stop threshold
+      ASSERT_OK(s) << "Unexpected error for threshold " << stop_after_calls;
+      ASSERT_LE(call_count.LoadRelaxed(), stop_after_calls)
+          << "Backup completed but call_count exceeded threshold";
+
+      // Verify a backup was created
+      std::vector<BackupInfo> backup_info;
+      backup_engine_->GetBackupInfo(&backup_info);
+      ASSERT_EQ(1, backup_info.size());
+
+      // Clean up the successful backup for next iteration
+      ASSERT_OK(backup_engine_->DeleteBackup(backup_info[0].backup_id));
+    }
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+
+  CloseDBAndBackupEngine();
+}
+
 }  // namespace
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/blob_db/blob_compaction_filter.cc b/utilities/blob_db/blob_compaction_filter.cc
index 1ab292f2534a..069daa0a63d3 100644
--- a/utilities/blob_db/blob_compaction_filter.cc
+++ b/utilities/blob_db/blob_compaction_filter.cc
@@ -32,7 +32,7 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
     if (ucf == nullptr) {
       return Decision::kKeep;
     }
-    // Apply user compaction filter for inlined data.
+    // Apply user compaction filter for non-blob data.
     CompactionFilter::Decision decision =
         ucf->FilterV2(level, key, value_type, value, new_value, skip_until);
     if (decision == Decision::kChangeValue) {
@@ -52,34 +52,12 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
     expired_size_ += key.size() + value.size();
     return Decision::kRemove;
   }
-  if (!blob_index.IsInlined() &&
-      blob_index.file_number() < context_.next_file_number &&
+  if (blob_index.file_number() < context_.next_file_number &&
       context_.current_blob_files.count(blob_index.file_number()) == 0) {
-    // Corresponding blob file gone (most likely, evicted by FIFO eviction).
     evicted_count_++;
     evicted_size_ += key.size() + value.size();
     return Decision::kRemove;
   }
-  if (context_.fifo_eviction_seq > 0 && blob_index.HasTTL() &&
-      blob_index.expiration() < context_.evict_expiration_up_to) {
-    // Hack: Internal key is passed to BlobIndexCompactionFilter for it to
-    // get sequence number.
-    ParsedInternalKey ikey;
-    if (!ParseInternalKey(
-             key, &ikey,
-             context_.blob_db_impl->db_options_.allow_data_in_errors)
-             .ok()) {
-      assert(false);
-      return Decision::kKeep;
-    }
-    // Remove keys that could have been remove by last FIFO eviction.
-    // If get error while parsing key, ignore and continue.
-    if (ikey.sequence < context_.fifo_eviction_seq) {
-      evicted_count_++;
-      evicted_size_ += key.size() + value.size();
-      return Decision::kRemove;
-    }
-  }
   // Apply user compaction filter for all non-TTL blob data.
   if (ucf != nullptr && !blob_index.HasTTL()) {
     // Hack: Internal key is passed to BlobIndexCompactionFilter for it to
@@ -94,10 +72,7 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
     }
     // Read value from blob file.
     PinnableSlice blob;
-    CompressionType compression_type = kNoCompression;
-    constexpr bool need_decompress = true;
-    if (!ReadBlobFromOldFile(ikey.user_key, blob_index, &blob, need_decompress,
-                             &compression_type)) {
+    if (!ReadBlobFromOldFile(ikey.user_key, blob_index, &blob)) {
       return Decision::kIOError;
     }
     CompactionFilter::Decision decision = ucf->FilterV2(
@@ -112,22 +87,10 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
 
 CompactionFilter::Decision BlobIndexCompactionFilterBase::HandleValueChange(
     const Slice& key, std::string* new_value) const {
-  BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
-  assert(blob_db_impl);
-
-  if (new_value->size() < blob_db_impl->bdb_options_.min_blob_size) {
-    // Keep new_value inlined.
-    return Decision::kChangeValue;
-  }
   if (!OpenNewBlobFileIfNeeded()) {
     return Decision::kIOError;
   }
   Slice new_blob_value(*new_value);
-  std::string compression_output;
-  if (blob_db_impl->bdb_options_.compression != kNoCompression) {
-    new_blob_value =
-        blob_db_impl->GetCompressedSlice(new_blob_value, &compression_output);
-  }
   uint64_t new_blob_file_number = 0;
   uint64_t new_blob_offset = 0;
   if (!WriteBlobToNewFile(key, new_blob_value, &new_blob_file_number,
@@ -138,8 +101,7 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::HandleValueChange(
     return Decision::kIOError;
   }
   BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
-                        new_blob_value.size(),
-                        blob_db_impl->bdb_options_.compression);
+                        new_blob_value.size(), kNoCompression);
   return Decision::kChangeBlobIndex;
 }
 
@@ -201,14 +163,13 @@ bool BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded() const {
 }
 
 bool BlobIndexCompactionFilterBase::ReadBlobFromOldFile(
-    const Slice& key, const BlobIndex& blob_index, PinnableSlice* blob,
-    bool need_decompress, CompressionType* compression_type) const {
+    const Slice& key, const BlobIndex& blob_index, PinnableSlice* blob) const {
   BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
   assert(blob_db_impl);
 
-  Status s = blob_db_impl->GetRawBlobFromFile(
-      key, blob_index.file_number(), blob_index.offset(), blob_index.size(),
-      blob, compression_type);
+  Status s = blob_db_impl->GetRawBlobFromFile(key, blob_index.file_number(),
+                                              blob_index.offset(),
+                                              blob_index.size(), blob);
 
   if (!s.ok()) {
     ROCKS_LOG_ERROR(
@@ -221,21 +182,6 @@ bool BlobIndexCompactionFilterBase::ReadBlobFromOldFile(
     return false;
   }
 
-  if (need_decompress && *compression_type != kNoCompression) {
-    s = blob_db_impl->DecompressSlice(*blob, *compression_type, blob);
-    if (!s.ok()) {
-      ROCKS_LOG_ERROR(
-          blob_db_impl->db_options_.info_log,
-          "Uncompression error during blob read from file: %" PRIu64
-          " blob_offset: %" PRIu64 " blob_size: %" PRIu64
-          " key: %s status: '%s'",
-          blob_index.file_number(), blob_index.offset(), blob_index.size(),
-          key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
-
-      return false;
-    }
-  }
-
   return true;
 }
 
@@ -306,8 +252,7 @@ bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFile() const {
     // TODO: plumb Env::IOActivity, Env::IOPriority
     s = blob_db_impl->CloseBlobFile(WriteOptions(), blob_file_);
 
-    // Note: we delay registering the new blob file until it's closed to
-    // prevent FIFO eviction from processing it during compaction/GC.
+    // Note: we delay registering the new blob file until it's closed.
     blob_db_impl->RegisterBlobFile(blob_file_);
   }
 
@@ -336,18 +281,12 @@ CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
   assert(blob_db_impl->bdb_options_.enable_garbage_collection);
 
   BlobIndex blob_index;
-  const Status s = blob_index.DecodeFrom(existing_value);
+  Status s = blob_index.DecodeFrom(existing_value);
   if (!s.ok()) {
     gc_stats_.SetError();
     return BlobDecision::kCorruption;
   }
 
-  if (blob_index.IsInlined()) {
-    gc_stats_.AddBlob(blob_index.value().size());
-
-    return BlobDecision::kKeep;
-  }
-
   gc_stats_.AddBlob(blob_index.size());
 
   if (blob_index.HasTTL()) {
@@ -368,31 +307,11 @@ CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
   }
 
   PinnableSlice blob;
-  CompressionType compression_type = kNoCompression;
-  std::string compression_output;
-  if (!ReadBlobFromOldFile(key, blob_index, &blob, false, &compression_type)) {
+  if (!ReadBlobFromOldFile(key, blob_index, &blob)) {
     gc_stats_.SetError();
     return BlobDecision::kIOError;
   }
 
-  // If the compression_type is changed, re-compress it with the new compression
-  // type.
-  if (compression_type != blob_db_impl->bdb_options_.compression) {
-    if (compression_type != kNoCompression) {
-      const Status status =
-          blob_db_impl->DecompressSlice(blob, compression_type, &blob);
-      if (!status.ok()) {
-        gc_stats_.SetError();
-        return BlobDecision::kCorruption;
-      }
-    }
-    if (blob_db_impl->bdb_options_.compression != kNoCompression) {
-      blob_db_impl->GetCompressedSlice(blob, &compression_output);
-      blob = PinnableSlice(&compression_output);
-      blob.PinSelf();
-    }
-  }
-
   uint64_t new_blob_file_number = 0;
   uint64_t new_blob_offset = 0;
   if (!WriteBlobToNewFile(key, blob, &new_blob_file_number, &new_blob_offset)) {
@@ -406,7 +325,7 @@ CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
   }
 
   BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
-                        blob.size(), compression_type);
+                        blob.size(), kNoCompression);
 
   gc_stats_.AddRelocatedBlob(blob_index.size());
 
diff --git a/utilities/blob_db/blob_compaction_filter.h b/utilities/blob_db/blob_compaction_filter.h
index cb83d0d034f5..1c55a53c2460 100644
--- a/utilities/blob_db/blob_compaction_filter.h
+++ b/utilities/blob_db/blob_compaction_filter.h
@@ -21,8 +21,6 @@ struct BlobCompactionContext {
   BlobDBImpl* blob_db_impl = nullptr;
   uint64_t next_file_number = 0;
   std::unordered_set<uint64_t> current_blob_files;
-  SequenceNumber fifo_eviction_seq = 0;
-  uint64_t evict_expiration_up_to = 0;
 };
 
 struct BlobCompactionContextGC {
@@ -59,8 +57,7 @@ class BlobIndexCompactionFilterBase : public LayeredCompactionFilterBase {
   bool IsBlobFileOpened() const;
   virtual bool OpenNewBlobFileIfNeeded() const;
   bool ReadBlobFromOldFile(const Slice& key, const BlobIndex& blob_index,
-                           PinnableSlice* blob, bool need_decompress,
-                           CompressionType* compression_type) const;
+                           PinnableSlice* blob) const;
   bool WriteBlobToNewFile(const Slice& key, const Slice& blob,
                           uint64_t* new_blob_file_number,
                           uint64_t* new_blob_offset) const;
diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc
index 25960bdd6c84..16e75417d510 100644
--- a/utilities/blob_db/blob_db.cc
+++ b/utilities/blob_db/blob_db.cc
@@ -68,39 +68,18 @@ Status BlobDB::Open(const DBOptions& db_options,
 BlobDB::BlobDB() : StackableDB(nullptr) {}
 
 void BlobDBOptions::Dump(Logger* log) const {
-  ROCKS_LOG_HEADER(
-      log, "                                  BlobDBOptions.blob_dir: %s",
-      blob_dir.c_str());
-  ROCKS_LOG_HEADER(
-      log, "                             BlobDBOptions.path_relative: %d",
-      path_relative);
-  ROCKS_LOG_HEADER(
-      log, "                                   BlobDBOptions.is_fifo: %d",
-      is_fifo);
   ROCKS_LOG_HEADER(
       log, "                               BlobDBOptions.max_db_size: %" PRIu64,
       max_db_size);
   ROCKS_LOG_HEADER(
       log, "                            BlobDBOptions.ttl_range_secs: %" PRIu64,
       ttl_range_secs);
-  ROCKS_LOG_HEADER(
-      log, "                             BlobDBOptions.min_blob_size: %" PRIu64,
-      min_blob_size);
-  ROCKS_LOG_HEADER(
-      log, "                            BlobDBOptions.bytes_per_sync: %" PRIu64,
-      bytes_per_sync);
   ROCKS_LOG_HEADER(
       log, "                            BlobDBOptions.blob_file_size: %" PRIu64,
       blob_file_size);
-  ROCKS_LOG_HEADER(
-      log, "                               BlobDBOptions.compression: %d",
-      static_cast<int>(compression));
   ROCKS_LOG_HEADER(
       log, "                 BlobDBOptions.enable_garbage_collection: %d",
       enable_garbage_collection);
-  ROCKS_LOG_HEADER(
-      log, "                 BlobDBOptions.garbage_collection_cutoff: %f",
-      garbage_collection_cutoff);
   ROCKS_LOG_HEADER(
       log, "                  BlobDBOptions.disable_background_tasks: %d",
       disable_background_tasks);
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index 503d476fa51d..fc8b6e3099ab 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -25,21 +25,14 @@ namespace blob_db {
 // users to use blob DB.
 
 constexpr uint64_t kNoExpiration = std::numeric_limits<uint64_t>::max();
+// Name of the directory under the base DB where blobs will be stored.
+constexpr const char* kBlobDirName = "blob_dir";
 
-struct BlobDBOptions {
-  // Name of the directory under the base DB where blobs will be stored. Using
-  // a directory where the base DB stores its SST files is not supported.
-  // Default is "blob_dir"
-  std::string blob_dir = "blob_dir";
-
-  // whether the blob_dir path is relative or absolute.
-  bool path_relative = true;
-
-  // When max_db_size is reached, evict blob files to free up space
-  // instead of returnning NoSpace error on write. Blob files will be
-  // evicted from oldest to newest, based on file creation time.
-  bool is_fifo = false;
+// Allows OS to incrementally sync blob files to disk for every
+// kBytesPerSync bytes written.
+constexpr uint64_t kBytesPerSync = 512 * 1024;
 
+struct BlobDBOptions {
   // Maximum size of the database (including SST files and blob files).
   //
   // Default: 0 (no limits)
@@ -53,31 +46,14 @@ struct BlobDBOptions {
   // and so on
   uint64_t ttl_range_secs = 3600;
 
-  // The smallest value to store in blob log. Values smaller than this threshold
-  // will be inlined in base DB together with the key.
-  uint64_t min_blob_size = 0;
-
-  // Allows OS to incrementally sync blob files to disk for every
-  // bytes_per_sync bytes written. Users shouldn't rely on it for
-  // persistency guarantee.
-  uint64_t bytes_per_sync = 512 * 1024;
-
   // the target size of each blob file. File will become immutable
   // after it exceeds that size
   uint64_t blob_file_size = 256 * 1024 * 1024;
 
-  // what compression to use for Blob's
-  CompressionType compression = kNoCompression;
-
   // If enabled, BlobDB cleans up stale blobs in non-TTL files during compaction
   // by rewriting the remaining live blobs to new files.
   bool enable_garbage_collection = false;
 
-  // The cutoff in terms of blob file age for garbage collection. Blobs in
-  // the oldest N non-TTL blob files will be rewritten when encountered during
-  // compaction, where N = garbage_collection_cutoff * number_of_non_TTL_files.
-  double garbage_collection_cutoff = 0.25;
-
   // Disable all background job. Used for test only.
   bool disable_background_tasks = false;
 
@@ -121,20 +97,6 @@ class BlobDB : public StackableDB {
     return PutWithTTL(options, key, value, ttl);
   }
 
-  // Put with expiration. Key with expiration time equal to
-  // std::numeric_limits<uint64_t>::max() means the key don't expire.
-  virtual Status PutUntil(const WriteOptions& options, const Slice& key,
-                          const Slice& value, uint64_t expiration) = 0;
-  virtual Status PutUntil(const WriteOptions& options,
-                          ColumnFamilyHandle* column_family, const Slice& key,
-                          const Slice& value, uint64_t expiration) {
-    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
-      return Status::NotSupported(
-          "Blob DB doesn't support non-default column family.");
-    }
-    return PutUntil(options, key, value, expiration);
-  }
-
   using ROCKSDB_NAMESPACE::StackableDB::Get;
   Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
              const Slice& key, PinnableSlice* value,
@@ -212,10 +174,6 @@ class BlobDB : public StackableDB {
                      std::vector<ColumnFamilyHandle*>* handles,
                      BlobDB** blob_db);
 
-  virtual BlobDBOptions GetBlobDBOptions() const = 0;
-
-  virtual Status SyncBlobFiles(const WriteOptions& write_options) = 0;
-
   ~BlobDB() override {}
 
  protected:
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 00d15e90ccf1..677fa60dfb95 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -28,28 +28,23 @@
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/utilities/stackable_db.h"
-#include "rocksdb/utilities/transaction.h"
-#include "table/block_based/block.h"
-#include "table/block_based/block_based_table_builder.h"
-#include "table/block_based/block_builder.h"
-#include "table/meta_blocks.h"
 #include "test_util/sync_point.h"
 #include "util/cast_util.h"
 #include "util/crc32c.h"
 #include "util/mutexlock.h"
-#include "util/random.h"
 #include "util/stop_watch.h"
 #include "util/timer_queue.h"
 #include "utilities/blob_db/blob_compaction_filter.h"
 #include "utilities/blob_db/blob_db_iterator.h"
 #include "utilities/blob_db/blob_db_listener.h"
 
-namespace {
-int kBlockBasedTableVersionFormat = 2;
-}  // end namespace
-
 namespace ROCKSDB_NAMESPACE::blob_db {
 
+// The cutoff in terms of blob file age for garbage collection. Blobs in the
+// oldest N non-TTL blob files will be rewritten when encountered during
+// compaction, where N = kGarbageCollectionCutoff * number_of_non_TTL_files.
+constexpr double kGarbageCollectionCutoff = 0.25;
+
 bool BlobFileComparator::operator()(
     const std::shared_ptr<BlobFile>& lhs,
     const std::shared_ptr<BlobFile>& rhs) const {
@@ -87,15 +82,10 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname,
       closed_(true),
       open_file_count_(0),
       total_blob_size_(0),
-      live_sst_size_(0),
-      fifo_eviction_seq_(0),
-      evict_expiration_up_to_(0),
-      debug_level_(0) {
+      live_sst_size_(0) {
   clock_ = env_->GetSystemClock().get();
-  blob_dir_ = (bdb_options_.path_relative)
-                  ? dbname + "/" + bdb_options_.blob_dir
-                  : bdb_options_.blob_dir;
-  file_options_.bytes_per_sync = blob_db_options.bytes_per_sync;
+  blob_dir_ = dbname + "/" + kBlobDirName;
+  file_options_.bytes_per_sync = kBytesPerSync;
 }
 
 BlobDBImpl::~BlobDBImpl() {
@@ -123,9 +113,8 @@ Status BlobDBImpl::CloseImpl() {
   // Close base DB before BlobDBImpl destructs to stop event listener and
   // compaction filter call.
   Status s = db_->Close();
-  // delete db_ anyway even if close failed.
-  delete db_;
-  // Reset pointers to avoid StackableDB delete the pointer again.
+  // Reset ownership to free the underlying DB.
+  shared_db_ptr_.reset();
   db_ = nullptr;
   db_impl_ = nullptr;
   if (!s.ok()) {
@@ -137,8 +126,6 @@ Status BlobDBImpl::CloseImpl() {
   return s;
 }
 
-BlobDBOptions BlobDBImpl::GetBlobDBOptions() const { return bdb_options_; }
-
 Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
   assert(handles != nullptr);
   assert(db_ == nullptr);
@@ -147,12 +134,6 @@ Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
     return Status::NotSupported("No blob directory in options");
   }
 
-  if (bdb_options_.garbage_collection_cutoff < 0.0 ||
-      bdb_options_.garbage_collection_cutoff > 1.0) {
-    return Status::InvalidArgument(
-        "Garbage collection cutoff must be in the interval [0.0, 1.0]");
-  }
-
   // Temporarily disable compactions in the base DB during open; save the user
   // defined value beforehand so we can restore it once BlobDB is initialized.
   // Note: this is only needed if garbage collection is enabled.
@@ -220,7 +201,12 @@ Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
 
   // Open base db.
   ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options_);
-  s = DB::Open(db_options_, dbname_, {cf_descriptor}, handles, &db_);
+  std::unique_ptr<DB> db;
+  s = DB::Open(db_options_, dbname_, {cf_descriptor}, handles, &db);
+  if (s.ok()) {
+    shared_db_ptr_ = std::move(db);
+    db_ = shared_db_ptr_.get();
+  }
   if (!s.ok()) {
     return s;
   }
@@ -284,7 +270,7 @@ Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
     return s;
   }
 
-  UpdateLiveSSTSize(WriteOptions(Env::IOActivity::kDBOpen));
+  UpdateLiveSSTSize();
 
   // Start background jobs.
   if (!bdb_options_.disable_background_tasks) {
@@ -598,7 +584,6 @@ bool BlobDBImpl::MarkBlobFileObsoleteIfNeeded(
   assert(blob_file->Immutable());
   assert(bdb_options_.enable_garbage_collection);
 
-  // Note: FIFO eviction could have marked this file obsolete already.
   if (blob_file->Obsolete()) {
     return true;
   }
@@ -712,7 +697,7 @@ std::shared_ptr<BlobFile> BlobDBImpl::NewBlobFile(
       static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
   auto blob_file = std::make_shared<BlobFile>(
       this, blob_dir_, file_num, db_options_.info_log.get(), column_family_id,
-      bdb_options_.compression, has_ttl, expiration_range);
+      has_ttl, expiration_range);
 
   ROCKS_LOG_DEBUG(db_options_.info_log, "New blob file created: %s reason='%s'",
                   blob_file->PathName().c_str(), reason.c_str());
@@ -755,11 +740,6 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile) {
       statistics_, Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS));
 
   uint64_t boffset = bfile->GetFileSize();
-  if (debug_level_ >= 2 && boffset) {
-    ROCKS_LOG_DEBUG(db_options_.info_log,
-                    "Open blob file: %s with offset: %" PRIu64, fpath.c_str(),
-                    boffset);
-  }
 
   BlobLogWriter::ElemType et = BlobLogWriter::kEtNone;
   if (bfile->file_size_ == BlobLogHeader::kSize) {
@@ -1039,18 +1019,27 @@ Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
 
 Status BlobDBImpl::Put(const WriteOptions& options, const Slice& key,
                        const Slice& value) {
-  return PutUntil(options, key, value, kNoExpiration);
+  StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS);
+  RecordTick(statistics_, BLOB_DB_NUM_PUT);
+  Status s;
+  WriteBatch batch;
+  {
+    // Release write_mutex_ before DB write to avoid race condition with
+    // flush begin listener, which also require write_mutex_ to sync
+    // blob files.
+    MutexLock l(&write_mutex_);
+    s = PutBlobValue(options, key, value, kNoExpiration, &batch);
+  }
+  if (s.ok()) {
+    s = db_->Write(options, &batch);
+  }
+  return s;
 }
 
 Status BlobDBImpl::PutWithTTL(const WriteOptions& options, const Slice& key,
                               const Slice& value, uint64_t ttl) {
   uint64_t now = EpochNow();
   uint64_t expiration = kNoExpiration - now > ttl ? now + ttl : kNoExpiration;
-  return PutUntil(options, key, value, expiration);
-}
-
-Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
-                            const Slice& value, uint64_t expiration) {
   StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS);
   RecordTick(statistics_, BLOB_DB_NUM_PUT);
   Status s;
@@ -1072,77 +1061,54 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options,
                                 const Slice& key, const Slice& value,
                                 uint64_t expiration, WriteBatch* batch) {
   write_mutex_.AssertHeld();
-  Status s;
-  std::string index_entry;
-  uint32_t column_family_id =
-      static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
-          ->GetID();
-  if (value.size() < bdb_options_.min_blob_size) {
-    if (expiration == kNoExpiration) {
-      // Put as normal value
-      s = batch->Put(key, value);
-      RecordTick(statistics_, BLOB_DB_WRITE_INLINED);
-    } else {
-      // Inlined with TTL
-      BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value);
-      s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
-                                           index_entry);
-      RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL);
-    }
-  } else {
-    std::string compression_output;
-    Slice value_compressed = GetCompressedSlice(value, &compression_output);
-
-    std::string headerbuf;
-    BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value_compressed,
-                                       expiration);
-
-    // Check DB size limit before selecting blob file to
-    // Since CheckSizeAndEvictBlobFiles() can close blob files, it needs to be
-    // done before calling SelectBlobFile().
-    s = CheckSizeAndEvictBlobFiles(
-        write_options, headerbuf.size() + key.size() + value_compressed.size());
-    if (!s.ok()) {
-      return s;
-    }
+  std::string headerbuf;
+  BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value, expiration);
 
-    std::shared_ptr<BlobFile> blob_file;
+  // Check DB size limit before selecting blob file.
+  Status s = CheckDbSizeLimit(headerbuf.size() + key.size() + value.size());
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::shared_ptr<BlobFile> blob_file;
+  if (expiration != kNoExpiration) {
+    s = SelectBlobFileTTL(write_options, expiration, &blob_file);
+  } else {
+    s = SelectBlobFile(write_options, &blob_file);
+  }
+  std::string index_entry;
+  if (s.ok()) {
+    assert(blob_file != nullptr);
+    s = AppendBlob(write_options, blob_file, headerbuf, key, value, expiration,
+                   &index_entry);
+  }
+  if (s.ok()) {
     if (expiration != kNoExpiration) {
-      s = SelectBlobFileTTL(write_options, expiration, &blob_file);
-    } else {
-      s = SelectBlobFile(write_options, &blob_file);
-    }
-    if (s.ok()) {
-      assert(blob_file != nullptr);
-      assert(blob_file->GetCompressionType() == bdb_options_.compression);
-      s = AppendBlob(write_options, blob_file, headerbuf, key, value_compressed,
-                     expiration, &index_entry);
-    }
-    if (s.ok()) {
-      if (expiration != kNoExpiration) {
-        WriteLock file_lock(&blob_file->mutex_);
-        blob_file->ExtendExpirationRange(expiration);
-      }
-      s = CloseBlobFileIfNeeded(write_options, blob_file);
-    }
-    if (s.ok()) {
-      s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
-                                           index_entry);
+      WriteLock file_lock(&blob_file->mutex_);
+      blob_file->ExtendExpirationRange(expiration);
     }
-    if (s.ok()) {
-      if (expiration == kNoExpiration) {
-        RecordTick(statistics_, BLOB_DB_WRITE_BLOB);
-      } else {
-        RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL);
-      }
+    s = CloseBlobFileIfNeeded(write_options, blob_file);
+  }
+  if (s.ok()) {
+    const uint32_t column_family_id =
+        static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+            ->GetID();
+    s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
+                                         index_entry);
+  }
+  if (s.ok()) {
+    if (expiration == kNoExpiration) {
+      RecordTick(statistics_, BLOB_DB_WRITE_BLOB);
     } else {
-      ROCKS_LOG_ERROR(
-          db_options_.info_log,
-          "Failed to append blob to FILE: %s: KEY: %s VALSZ: %" ROCKSDB_PRIszt
-          " status: '%s' blob_file: '%s'",
-          blob_file->PathName().c_str(), key.ToString().c_str(), value.size(),
-          s.ToString().c_str(), blob_file->DumpState().c_str());
+      RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL);
     }
+  } else {
+    ROCKS_LOG_ERROR(
+        db_options_.info_log,
+        "Failed to append blob to FILE: %s: KEY: %s VALSZ: %" ROCKSDB_PRIszt
+        " status: '%s' blob_file: '%s'",
+        blob_file->PathName().c_str(), key.ToString().c_str(), value.size(),
+        s.ToString().c_str(), blob_file->DumpState().c_str());
   }
 
   RecordTick(statistics_, BLOB_DB_NUM_KEYS_WRITTEN);
@@ -1153,49 +1119,6 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options,
   return s;
 }
 
-Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
-                                     std::string* compression_output) const {
-  if (bdb_options_.compression == kNoCompression) {
-    return raw;
-  }
-  StopWatch compression_sw(clock_, statistics_, BLOB_DB_COMPRESSION_MICROS);
-  CompressionType type = bdb_options_.compression;
-  CompressionOptions opts;
-  CompressionContext context(type, opts);
-  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type,
-                       0 /* sample_for_compression */);
-  CompressBlock(raw, info, &type, kBlockBasedTableVersionFormat, false,
-                compression_output, nullptr, nullptr);
-  return *compression_output;
-}
-
-Status BlobDBImpl::DecompressSlice(const Slice& compressed_value,
-                                   CompressionType compression_type,
-                                   PinnableSlice* value_output) const {
-  assert(compression_type != kNoCompression);
-
-  BlockContents contents;
-  auto cfh = static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
-
-  {
-    StopWatch decompression_sw(clock_, statistics_,
-                               BLOB_DB_DECOMPRESSION_MICROS);
-    UncompressionContext context(compression_type);
-    UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
-                           compression_type);
-    Status s = UncompressBlockData(
-        info, compressed_value.data(), compressed_value.size(), &contents,
-        kBlockBasedTableVersionFormat, cfh->cfd()->ioptions());
-    if (!s.ok()) {
-      return Status::Corruption("Unable to decompress blob.");
-    }
-  }
-
-  value_output->PinSelf(contents.data);
-
-  return Status::OK();
-}
-
 Status BlobDBImpl::CompactFiles(
     const CompactionOptions& compact_options,
     const std::vector<std::string>& input_file_names, const int output_level,
@@ -1233,8 +1156,6 @@ void BlobDBImpl::GetCompactionContextCommon(BlobCompactionContext* context) {
   for (auto& p : blob_files_) {
     context->current_blob_files.insert(p.first);
   }
-  context->fifo_eviction_seq = fifo_eviction_seq_;
-  context->evict_expiration_up_to = evict_expiration_up_to_;
 }
 
 void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context) {
@@ -1254,15 +1175,15 @@ void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context,
 
   if (!live_imm_non_ttl_blob_files_.empty()) {
     auto it = live_imm_non_ttl_blob_files_.begin();
-    std::advance(it, bdb_options_.garbage_collection_cutoff *
-                         live_imm_non_ttl_blob_files_.size());
+    std::advance(
+        it, kGarbageCollectionCutoff * live_imm_non_ttl_blob_files_.size());
     context_gc->cutoff_file_number = it != live_imm_non_ttl_blob_files_.end()
                                          ? it->first
                                          : std::numeric_limits<uint64_t>::max();
   }
 }
 
-void BlobDBImpl::UpdateLiveSSTSize(const WriteOptions& write_options) {
+void BlobDBImpl::UpdateLiveSSTSize() {
   uint64_t live_sst_size = 0;
   bool ok = GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
   if (ok) {
@@ -1275,90 +1196,21 @@ void BlobDBImpl::UpdateLiveSSTSize(const WriteOptions& write_options) {
         db_options_.info_log,
         "Failed to update total SST file size after flush or compaction.");
   }
-  {
-    // Trigger FIFO eviction if needed.
-    MutexLock l(&write_mutex_);
-    Status s = CheckSizeAndEvictBlobFiles(write_options, 0, true /*force*/);
-    if (s.IsNoSpace()) {
-      ROCKS_LOG_WARN(db_options_.info_log,
-                     "DB grow out-of-space after SST size updated. Current live"
-                     " SST size: %" PRIu64
-                     " , current blob files size: %" PRIu64 ".",
-                     live_sst_size_.load(), total_blob_size_.load());
-    }
-  }
 }
 
-Status BlobDBImpl::CheckSizeAndEvictBlobFiles(const WriteOptions& write_options,
-                                              uint64_t blob_size,
-                                              bool force_evict) {
-  write_mutex_.AssertHeld();
-
-  uint64_t live_sst_size = live_sst_size_.load();
-  if (bdb_options_.max_db_size == 0 ||
-      live_sst_size + total_blob_size_.load() + blob_size <=
-          bdb_options_.max_db_size) {
+Status BlobDBImpl::CheckDbSizeLimit(uint64_t blob_size) {
+  if (bdb_options_.max_db_size == 0) {
     return Status::OK();
   }
 
-  if (bdb_options_.is_fifo == false ||
-      (!force_evict && live_sst_size + blob_size > bdb_options_.max_db_size)) {
-    // FIFO eviction is disabled, or no space to insert new blob even we evict
-    // all blob files.
-    return Status::NoSpace(
-        "Write failed, as writing it would exceed max_db_size limit.");
+  uint64_t live_sst_size = live_sst_size_.load();
+  uint64_t total_blob_size = total_blob_size_.load();
+  if (live_sst_size + total_blob_size + blob_size <= bdb_options_.max_db_size) {
+    return Status::OK();
   }
 
-  std::vector<std::shared_ptr<BlobFile>> candidate_files;
-  CopyBlobFiles(&candidate_files);
-  std::sort(candidate_files.begin(), candidate_files.end(),
-            BlobFileComparator());
-  fifo_eviction_seq_ = GetLatestSequenceNumber();
-
-  WriteLock l(&mutex_);
-
-  while (!candidate_files.empty() &&
-         live_sst_size + total_blob_size_.load() + blob_size >
-             bdb_options_.max_db_size) {
-    std::shared_ptr<BlobFile> blob_file = candidate_files.back();
-    candidate_files.pop_back();
-    WriteLock file_lock(&blob_file->mutex_);
-    if (blob_file->Obsolete()) {
-      // File already obsoleted by someone else.
-      assert(blob_file->Immutable());
-      continue;
-    }
-    // FIFO eviction can evict open blob files.
-    if (!blob_file->Immutable()) {
-      Status s = CloseBlobFile(write_options, blob_file);
-      if (!s.ok()) {
-        return s;
-      }
-    }
-    assert(blob_file->Immutable());
-    auto expiration_range = blob_file->GetExpirationRange();
-    ROCKS_LOG_INFO(db_options_.info_log,
-                   "Evict oldest blob file since DB out of space. Current "
-                   "live SST file size: %" PRIu64 ", total blob size: %" PRIu64
-                   ", max db size: %" PRIu64 ", evicted blob file #%" PRIu64
-                   ".",
-                   live_sst_size, total_blob_size_.load(),
-                   bdb_options_.max_db_size, blob_file->BlobFileNumber());
-    ObsoleteBlobFile(blob_file, fifo_eviction_seq_, true /*update_size*/);
-    evict_expiration_up_to_ = expiration_range.first;
-    RecordTick(statistics_, BLOB_DB_FIFO_NUM_FILES_EVICTED);
-    RecordTick(statistics_, BLOB_DB_FIFO_NUM_KEYS_EVICTED,
-               blob_file->BlobCount());
-    RecordTick(statistics_, BLOB_DB_FIFO_BYTES_EVICTED,
-               blob_file->GetFileSize());
-    TEST_SYNC_POINT("BlobDBImpl::EvictOldestBlobFile:Evicted");
-  }
-  if (live_sst_size + total_blob_size_.load() + blob_size >
-      bdb_options_.max_db_size) {
-    return Status::NoSpace(
-        "Write failed, as writing it would exceed max_db_size limit.");
-  }
-  return Status::OK();
+  return Status::NoSpace(
+      "Write failed, as writing it would exceed max_db_size limit.");
 }
 
 Status BlobDBImpl::AppendBlob(const WriteOptions& write_options,
@@ -1395,11 +1247,10 @@ Status BlobDBImpl::AppendBlob(const WriteOptions& write_options,
 
   if (expiration == kNoExpiration) {
     BlobIndex::EncodeBlob(index_entry, bfile->BlobFileNumber(), blob_offset,
-                          value.size(), bdb_options_.compression);
+                          value.size(), kNoCompression);
   } else {
     BlobIndex::EncodeBlobTTL(index_entry, expiration, bfile->BlobFileNumber(),
-                             blob_offset, value.size(),
-                             bdb_options_.compression);
+                             blob_offset, value.size(), kNoCompression);
   }
 
   return s;
@@ -1490,46 +1341,14 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
     }
   }
 
-  if (blob_index.IsInlined()) {
-    // TODO(yiwu): If index_entry is a PinnableSlice, we can also pin the same
-    // memory buffer to avoid extra copy.
-    value->PinSelf(blob_index.value());
-    return Status::OK();
-  }
-
-  CompressionType compression_type = kNoCompression;
-  s = GetRawBlobFromFile(key, blob_index.file_number(), blob_index.offset(),
-                         blob_index.size(), value, &compression_type);
-  if (!s.ok()) {
-    return s;
-  }
-
-  if (compression_type != kNoCompression) {
-    s = DecompressSlice(*value, compression_type, value);
-    if (!s.ok()) {
-      if (debug_level_ >= 2) {
-        ROCKS_LOG_ERROR(
-            db_options_.info_log,
-            "Uncompression error during blob read from file: %" PRIu64
-            " blob_offset: %" PRIu64 " blob_size: %" PRIu64
-            " key: %s status: '%s'",
-            blob_index.file_number(), blob_index.offset(), blob_index.size(),
-            key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
-      }
-      return s;
-    }
-  }
-
-  return Status::OK();
+  return GetRawBlobFromFile(key, blob_index.file_number(), blob_index.offset(),
+                            blob_index.size(), value);
 }
 
 Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number,
                                       uint64_t offset, uint64_t size,
-                                      PinnableSlice* value,
-                                      CompressionType* compression_type) {
+                                      PinnableSlice* value) {
   assert(value);
-  assert(compression_type);
-  assert(*compression_type == kNoCompression);
 
   if (!size) {
     value->PinSelf("");
@@ -1541,15 +1360,6 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number,
   // valid offset.
   if (offset <
       (BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key.size())) {
-    if (debug_level_ >= 2) {
-      ROCKS_LOG_ERROR(db_options_.info_log,
-                      "Invalid blob index file_number: %" PRIu64
-                      " blob_offset: %" PRIu64 " blob_size: %" PRIu64
-                      " key: %s",
-                      file_number, offset, size,
-                      key.ToString(/* output_hex */ true).c_str());
-    }
-
     return Status::NotFound("Invalid blob offset");
   }
 
@@ -1567,8 +1377,6 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number,
     blob_file = it->second;
   }
 
-  *compression_type = blob_file->GetCompressionType();
-
   // takes locks when called
   std::shared_ptr<RandomAccessFileReader> reader;
   Status s = GetBlobFileReader(blob_file, &reader);
@@ -1641,15 +1449,6 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number,
                                blob_record.size() - sizeof(uint32_t));
   crc = crc32c::Mask(crc);  // Adjust for storage
   if (crc != crc_exp) {
-    if (debug_level_ >= 2) {
-      ROCKS_LOG_ERROR(
-          db_options_.info_log,
-          "Blob crc mismatch file: %" PRIu64 " blob_offset: %" PRIu64
-          " blob_size: %" PRIu64 " key: %s status: '%s'",
-          file_number, offset, size,
-          key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
-    }
-
     return Status::Corruption("Corruption. Blob CRC mismatch");
   }
 
@@ -2108,14 +1907,6 @@ std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
   return std::make_pair(!aborted, -1);
 }
 
-void BlobDBImpl::CopyBlobFiles(
-    std::vector<std::shared_ptr<BlobFile>>* bfiles_copy) {
-  ReadLock rl(&mutex_);
-  for (auto const& p : blob_files_) {
-    bfiles_copy->push_back(p.second);
-  }
-}
-
 Iterator* BlobDBImpl::NewIterator(const ReadOptions& _read_options) {
   if (_read_options.io_activity != Env::IOActivity::kUnknown &&
       _read_options.io_activity != Env::IOActivity::kDBIterator) {
@@ -2146,14 +1937,12 @@ Iterator* BlobDBImpl::NewIterator(const ReadOptions& _read_options) {
 }
 
 Status DestroyBlobDB(const std::string& dbname, const Options& options,
-                     const BlobDBOptions& bdb_options) {
+                     const BlobDBOptions& /*bdb_options*/) {
   const ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
   Env* env = soptions.env;
 
   Status status;
-  std::string blobdir;
-  blobdir = (bdb_options.path_relative) ? dbname + "/" + bdb_options.blob_dir
-                                        : bdb_options.blob_dir;
+  std::string blobdir = dbname + "/" + kBlobDirName;
 
   std::vector<std::string> filenames;
   if (env->GetChildren(blobdir, &filenames).ok()) {
diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h
index 75776e6a8a7f..227fc0726a8f 100644
--- a/utilities/blob_db/blob_db_impl.h
+++ b/utilities/blob_db/blob_db_impl.h
@@ -26,7 +26,6 @@
 #include "rocksdb/listener.h"
 #include "rocksdb/options.h"
 #include "rocksdb/statistics.h"
-#include "rocksdb/wal_filter.h"
 #include "util/mutexlock.h"
 #include "util/timer_queue.h"
 #include "utilities/blob_db/blob_db.h"
@@ -75,9 +74,6 @@ class BlobDBImpl : public BlobDB {
   friend class BlobIndexCompactionFilterGC;
 
  public:
-  // deletions check period
-  static constexpr uint32_t kDeleteCheckPeriodMillisecs = 2 * 1000;
-
   // sanity check task
   static constexpr uint32_t kSanityCheckPeriodMillisecs = 20 * 60 * 1000;
 
@@ -93,10 +89,6 @@ class BlobDBImpl : public BlobDB {
   // how often to schedule expired files eviction.
   static constexpr uint32_t kEvictExpiredFilesPeriodMillisecs = 10 * 1000;
 
-  // when should oldest file be evicted:
-  // on reaching 90% of blob_dir_size
-  static constexpr double kEvictOldestFileAtSize = 0.9;
-
   using BlobDB::Put;
   Status Put(const WriteOptions& options, const Slice& key,
              const Slice& value) override;
@@ -136,10 +128,6 @@ class BlobDBImpl : public BlobDB {
   Status PutWithTTL(const WriteOptions& options, const Slice& key,
                     const Slice& value, uint64_t ttl) override;
 
-  using BlobDB::PutUntil;
-  Status PutUntil(const WriteOptions& options, const Slice& key,
-                  const Slice& value, uint64_t expiration) override;
-
   using BlobDB::CompactFiles;
   Status CompactFiles(
       const CompactionOptions& compact_options,
@@ -148,8 +136,6 @@ class BlobDBImpl : public BlobDB {
       std::vector<std::string>* const output_file_names = nullptr,
       CompactionJobInfo* compaction_job_info = nullptr) override;
 
-  BlobDBOptions GetBlobDBOptions() const override;
-
   BlobDBImpl(const std::string& dbname, const BlobDBOptions& bdb_options,
              const DBOptions& db_options,
              const ColumnFamilyOptions& cf_options);
@@ -169,8 +155,6 @@ class BlobDBImpl : public BlobDB {
 
   Status Open(std::vector<ColumnFamilyHandle*>* handles);
 
-  Status SyncBlobFiles(const WriteOptions& write_options) override;
-
   // Common part of the two GetCompactionContext methods below.
   // REQUIRES: read lock on mutex_
   void GetCompactionContextCommon(BlobCompactionContext* context);
@@ -198,10 +182,10 @@ class BlobDBImpl : public BlobDB {
                              SequenceNumber obsolete_seq = 0,
                              bool update_size = true);
 
-  void TEST_EvictExpiredFiles();
-
   void TEST_DeleteObsoleteFiles();
 
+  void TEST_EvictExpiredFiles();
+
   uint64_t TEST_live_sst_size();
 
   const std::string& TEST_blob_dir() const { return blob_dir_; }
@@ -222,6 +206,8 @@ class BlobDBImpl : public BlobDB {
   // Return true if a snapshot is created.
   bool SetSnapshotIfNeeded(ReadOptions* read_options);
 
+  Status SyncBlobFiles(const WriteOptions& write_options);
+
   Status GetImpl(const ReadOptions& read_options,
                  ColumnFamilyHandle* column_family, const Slice& key,
                  PinnableSlice* value, uint64_t* expiration = nullptr);
@@ -231,15 +217,7 @@ class BlobDBImpl : public BlobDB {
 
   Status GetRawBlobFromFile(const Slice& key, uint64_t file_number,
                             uint64_t offset, uint64_t size,
-                            PinnableSlice* value,
-                            CompressionType* compression_type);
-
-  Slice GetCompressedSlice(const Slice& raw,
-                           std::string* compression_output) const;
-
-  Status DecompressSlice(const Slice& compressed_value,
-                         CompressionType compression_type,
-                         PinnableSlice* value_output) const;
+                            PinnableSlice* value);
 
   // Close a file by appending a footer, and removes file from open files list.
   // REQUIRES: lock held on write_mutex_, write lock held on both the db mutex_
@@ -299,16 +277,13 @@ class BlobDBImpl : public BlobDB {
   // or GC). Check whether any snapshots exist which refer to the same.
   std::pair<bool, int64_t> DeleteObsoleteFiles(bool aborted);
 
-  // periodically check if open blob files and their TTL's has expired
-  // if expired, close the sequential writer and make the file immutable
-  std::pair<bool, int64_t> EvictExpiredFiles(bool aborted);
-
   // if the number of open files, approaches ULIMIT's this
   // task will close random readers, which are kept around for
   // efficiency
   std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted);
 
-  std::pair<bool, int64_t> RemoveTimerQ(TimerQueue* tq, bool aborted);
+  // Evict expired blob files from the TTL queue.
+  std::pair<bool, int64_t> EvictExpiredFiles(bool aborted);
 
   // Adds the background tasks to the timer queue
   void StartBackgroundTasks();
@@ -371,7 +346,11 @@ class BlobDBImpl : public BlobDB {
   void MarkUnreferencedBlobFilesObsolete();
   void MarkUnreferencedBlobFilesObsoleteDuringOpen();
 
-  void UpdateLiveSSTSize(const WriteOptions& write_options);
+  void UpdateLiveSSTSize();
+
+  // Check if writing blob_size bytes would exceed max_db_size limit.
+  // Returns Status::NoSpace() if limit would be exceeded.
+  Status CheckDbSizeLimit(uint64_t blob_size);
 
   Status GetBlobFileReader(const std::shared_ptr<BlobFile>& blob_file,
                            std::shared_ptr<RandomAccessFileReader>* reader);
@@ -392,20 +371,9 @@ class BlobDBImpl : public BlobDB {
   // checks if there is no snapshot which is referencing the
   // blobs
   bool VisibleToActiveSnapshot(const std::shared_ptr<BlobFile>& file);
-  bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr<BlobFile>& bfile);
-
-  void CopyBlobFiles(std::vector<std::shared_ptr<BlobFile>>* bfiles_copy);
 
   uint64_t EpochNow() { return clock_->NowMicros() / 1000000; }
 
-  // Check if inserting a new blob will make DB grow out of space.
-  // If is_fifo = true, FIFO eviction will be triggered to make room for the
-  // new blob. If force_evict = true, FIFO eviction will evict blob files
-  // even eviction will not make enough room for the new blob.
-  Status CheckSizeAndEvictBlobFiles(const WriteOptions& write_options,
-                                    uint64_t blob_size,
-                                    bool force_evict = false);
-
   Status CloseImpl();
 
   // name of the database directory
@@ -474,16 +442,6 @@ class BlobDBImpl : public BlobDB {
   // total size of SST files.
   std::atomic<uint64_t> live_sst_size_;
 
-  // Latest FIFO eviction timestamp
-  //
-  // REQUIRES: access with metex_ lock held.
-  uint64_t fifo_eviction_seq_;
-
-  // The expiration up to which latest FIFO eviction evicts.
-  //
-  // REQUIRES: access with metex_ lock held.
-  uint64_t evict_expiration_up_to_;
-
   std::list<std::shared_ptr<BlobFile>> obsolete_files_;
 
   // DeleteObsoleteFiles, DiableFileDeletions and EnableFileDeletions block
@@ -505,8 +463,6 @@ class BlobDBImpl : public BlobDB {
   //
   // REQUIRES: access with delete_file_mutex_ held.
   int disable_file_deletions_ = 0;
-
-  uint32_t debug_level_;
 };
 
 }  // namespace blob_db
diff --git a/utilities/blob_db/blob_db_impl_filesnapshot.cc b/utilities/blob_db/blob_db_impl_filesnapshot.cc
index 250297404570..e46f3c8fbf38 100644
--- a/utilities/blob_db/blob_db_impl_filesnapshot.cc
+++ b/utilities/blob_db/blob_db_impl_filesnapshot.cc
@@ -59,10 +59,6 @@ Status BlobDBImpl::EnableFileDeletions() {
 Status BlobDBImpl::GetLiveFiles(std::vector<std::string>& ret,
                                 uint64_t* manifest_file_size,
                                 bool flush_memtable) {
-  if (!bdb_options_.path_relative) {
-    return Status::NotSupported(
-        "Not able to get relative blob file path from absolute blob_dir.");
-  }
   // Hold a lock in the beginning to avoid updates to base DB during the call
   ReadLock rl(&mutex_);
   Status s = db_->GetLiveFiles(ret, manifest_file_size, flush_memtable);
@@ -74,14 +70,12 @@ Status BlobDBImpl::GetLiveFiles(std::vector<std::string>& ret,
     auto blob_file = bfile_pair.second;
     // Path should be relative to db_name, but begin with slash.
     ret.emplace_back(
-        BlobFileName("", bdb_options_.blob_dir, blob_file->BlobFileNumber()));
+        BlobFileName("", kBlobDirName, blob_file->BlobFileNumber()));
   }
   return Status::OK();
 }
 
 void BlobDBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
-  // Path should be relative to db_name.
-  assert(bdb_options_.path_relative);
   // Hold a lock in the beginning to avoid updates to base DB during the call
   ReadLock rl(&mutex_);
   db_->GetLiveFilesMetaData(metadata);
@@ -91,7 +85,7 @@ void BlobDBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
     filemetadata.size = blob_file->GetFileSize();
     const uint64_t file_number = blob_file->BlobFileNumber();
     // Path should be relative to db_name, but begin with slash.
-    filemetadata.name = BlobFileName("", bdb_options_.blob_dir, file_number);
+    filemetadata.name = BlobFileName("", kBlobDirName, file_number);
     filemetadata.file_number = file_number;
     if (blob_file->HasTTL()) {
       filemetadata.oldest_ancester_time = blob_file->GetExpirationRange().first;
diff --git a/utilities/blob_db/blob_db_listener.h b/utilities/blob_db/blob_db_listener.h
index ce2ec182f5dc..822f71363391 100644
--- a/utilities/blob_db/blob_db_listener.h
+++ b/utilities/blob_db/blob_db_listener.h
@@ -27,14 +27,13 @@ class BlobDBListener : public EventListener {
 
   void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& /*info*/) override {
     assert(blob_db_impl_ != nullptr);
-    blob_db_impl_->UpdateLiveSSTSize(WriteOptions(Env::IOActivity::kFlush));
+    blob_db_impl_->UpdateLiveSSTSize();
   }
 
   void OnCompactionCompleted(DB* /*db*/,
                              const CompactionJobInfo& /*info*/) override {
     assert(blob_db_impl_ != nullptr);
-    blob_db_impl_->UpdateLiveSSTSize(
-        WriteOptions(Env::IOActivity::kCompaction));
+    blob_db_impl_->UpdateLiveSSTSize();
   }
 
   const char* Name() const override { return kClassName(); }
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index a0e5b9da0dec..5d3674f09634 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -84,6 +84,7 @@ class BlobDBTest : public testing::Test {
       options.stats_dump_period_sec = 0;
       options.stats_persist_period_sec = 0;
     }
+    bdb_options_ = bdb_options;
     return BlobDB::Open(options, bdb_options, dbname_, &blob_db_);
   }
 
@@ -109,17 +110,16 @@ class BlobDBTest : public testing::Test {
   void Destroy() {
     if (blob_db_) {
       Options options = blob_db_->GetOptions();
-      BlobDBOptions bdb_options = blob_db_->GetBlobDBOptions();
       delete blob_db_;
       blob_db_ = nullptr;
-      ASSERT_OK(DestroyBlobDB(dbname_, options, bdb_options));
+      ASSERT_OK(DestroyBlobDB(dbname_, options, bdb_options_));
     }
   }
 
-  BlobDBImpl *blob_db_impl() { return static_cast<BlobDBImpl *>(blob_db_); }
+  BlobDBImpl* blob_db_impl() { return static_cast<BlobDBImpl*>(blob_db_); }
 
-  Status Put(const Slice &key, const Slice &value,
-             std::map<std::string, std::string> *data = nullptr) {
+  Status Put(const Slice& key, const Slice& value,
+             std::map<std::string, std::string>* data = nullptr) {
     Status s = blob_db_->Put(WriteOptions(), key, value);
     if (data != nullptr) {
       (*data)[key.ToString()] = value.ToString();
@@ -127,16 +127,16 @@ class BlobDBTest : public testing::Test {
     return s;
   }
 
-  void Delete(const std::string &key,
-              std::map<std::string, std::string> *data = nullptr) {
+  void Delete(const std::string& key,
+              std::map<std::string, std::string>* data = nullptr) {
     ASSERT_OK(blob_db_->Delete(WriteOptions(), key));
     if (data != nullptr) {
       data->erase(key);
     }
   }
 
-  Status PutWithTTL(const Slice &key, const Slice &value, uint64_t ttl,
-                    std::map<std::string, std::string> *data = nullptr) {
+  Status PutWithTTL(const Slice& key, const Slice& value, uint64_t ttl,
+                    std::map<std::string, std::string>* data = nullptr) {
     Status s = blob_db_->PutWithTTL(WriteOptions(), key, value, ttl);
     if (data != nullptr) {
       (*data)[key.ToString()] = value.ToString();
@@ -144,12 +144,8 @@ class BlobDBTest : public testing::Test {
     return s;
   }
 
-  Status PutUntil(const Slice &key, const Slice &value, uint64_t expiration) {
-    return blob_db_->PutUntil(WriteOptions(), key, value, expiration);
-  }
-
-  void PutRandomWithTTL(const std::string &key, uint64_t ttl, Random *rnd,
-                        std::map<std::string, std::string> *data = nullptr) {
+  void PutRandomWithTTL(const std::string& key, uint64_t ttl, Random* rnd,
+                        std::map<std::string, std::string>* data = nullptr) {
     int len = rnd->Next() % kMaxBlobSize + 1;
     std::string value = rnd->HumanReadableString(len);
     ASSERT_OK(
@@ -159,24 +155,13 @@ class BlobDBTest : public testing::Test {
     }
   }
 
-  void PutRandomUntil(const std::string &key, uint64_t expiration, Random *rnd,
-                      std::map<std::string, std::string> *data = nullptr) {
-    int len = rnd->Next() % kMaxBlobSize + 1;
-    std::string value = rnd->HumanReadableString(len);
-    ASSERT_OK(blob_db_->PutUntil(WriteOptions(), Slice(key), Slice(value),
-                                 expiration));
-    if (data != nullptr) {
-      (*data)[key] = value;
-    }
-  }
-
-  void PutRandom(const std::string &key, Random *rnd,
-                 std::map<std::string, std::string> *data = nullptr) {
+  void PutRandom(const std::string& key, Random* rnd,
+                 std::map<std::string, std::string>* data = nullptr) {
     PutRandom(blob_db_, key, rnd, data);
   }
 
-  void PutRandom(DB *db, const std::string &key, Random *rnd,
-                 std::map<std::string, std::string> *data = nullptr) {
+  void PutRandom(DB* db, const std::string& key, Random* rnd,
+                 std::map<std::string, std::string>* data = nullptr) {
     int len = rnd->Next() % kMaxBlobSize + 1;
     std::string value = rnd->HumanReadableString(len);
     ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
@@ -186,8 +171,8 @@ class BlobDBTest : public testing::Test {
   }
 
   void PutRandomToWriteBatch(
-      const std::string &key, Random *rnd, WriteBatch *batch,
-      std::map<std::string, std::string> *data = nullptr) {
+      const std::string& key, Random* rnd, WriteBatch* batch,
+      std::map<std::string, std::string>* data = nullptr) {
     int len = rnd->Next() % kMaxBlobSize + 1;
     std::string value = rnd->HumanReadableString(len);
     ASSERT_OK(batch->Put(key, value));
@@ -197,14 +182,14 @@ class BlobDBTest : public testing::Test {
   }
 
   // Verify blob db contain expected data and nothing more.
-  void VerifyDB(const std::map<std::string, std::string> &data) {
+  void VerifyDB(const std::map<std::string, std::string>& data) {
     VerifyDB(blob_db_, data);
   }
 
-  void VerifyDB(DB *db, const std::map<std::string, std::string> &data) {
+  void VerifyDB(DB* db, const std::map<std::string, std::string>& data) {
     // Verify normal Get
-    auto *cfh = db->DefaultColumnFamily();
-    for (auto &p : data) {
+    auto* cfh = db->DefaultColumnFamily();
+    for (auto& p : data) {
       PinnableSlice value_slice;
       ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value_slice));
       ASSERT_EQ(p.second, value_slice.ToString());
@@ -214,9 +199,9 @@ class BlobDBTest : public testing::Test {
     }
 
     // Verify iterators
-    Iterator *iter = db->NewIterator(ReadOptions());
+    Iterator* iter = db->NewIterator(ReadOptions());
     iter->SeekToFirst();
-    for (auto &p : data) {
+    for (auto& p : data) {
       ASSERT_TRUE(iter->Valid());
       ASSERT_EQ(p.first, iter->key().ToString());
       ASSERT_EQ(p.second, iter->value().ToString());
@@ -228,16 +213,16 @@ class BlobDBTest : public testing::Test {
   }
 
   void VerifyBaseDB(
-      const std::map<std::string, KeyVersion> &expected_versions) {
-    auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
-    DB *db = blob_db_->GetRootDB();
+      const std::map<std::string, KeyVersion>& expected_versions) {
+    auto* bdb_impl = static_cast<BlobDBImpl*>(blob_db_);
+    DB* db = blob_db_->GetRootDB();
     const size_t kMaxKeys = 10000;
     std::vector<KeyVersion> versions;
-    ASSERT_OK(GetAllKeyVersions(db, "", "", kMaxKeys, &versions));
+    ASSERT_OK(GetAllKeyVersions(db, {}, {}, kMaxKeys, &versions));
     ASSERT_EQ(expected_versions.size(), versions.size());
     size_t i = 0;
-    for (auto &key_version : expected_versions) {
-      const KeyVersion &expected_version = key_version.second;
+    for (auto& key_version : expected_versions) {
+      const KeyVersion& expected_version = key_version.second;
       ASSERT_EQ(expected_version.user_key, versions[i].user_key);
       ASSERT_EQ(expected_version.sequence, versions[i].sequence);
       ASSERT_EQ(expected_version.type, versions[i].type);
@@ -255,16 +240,16 @@ class BlobDBTest : public testing::Test {
   }
 
   void VerifyBaseDBBlobIndex(
-      const std::map<std::string, BlobIndexVersion> &expected_versions) {
+      const std::map<std::string, BlobIndexVersion>& expected_versions) {
     const size_t kMaxKeys = 10000;
     std::vector<KeyVersion> versions;
     ASSERT_OK(
-        GetAllKeyVersions(blob_db_->GetRootDB(), "", "", kMaxKeys, &versions));
+        GetAllKeyVersions(blob_db_->GetRootDB(), {}, {}, kMaxKeys, &versions));
     ASSERT_EQ(versions.size(), expected_versions.size());
 
     size_t i = 0;
-    for (const auto &expected_pair : expected_versions) {
-      const BlobIndexVersion &expected_version = expected_pair.second;
+    for (const auto& expected_pair : expected_versions) {
+      const BlobIndexVersion& expected_version = expected_pair.second;
 
       ASSERT_EQ(versions[i].user_key, expected_version.user_key);
       ASSERT_EQ(versions[i].sequence, expected_version.sequence);
@@ -280,10 +265,7 @@ class BlobDBTest : public testing::Test {
       BlobIndex blob_index;
       ASSERT_OK(blob_index.DecodeFrom(versions[i].value));
 
-      const uint64_t file_number = !blob_index.IsInlined()
-                                       ? blob_index.file_number()
-                                       : kInvalidBlobFileNumber;
-      ASSERT_EQ(file_number, expected_version.file_number);
+      ASSERT_EQ(blob_index.file_number(), expected_version.file_number);
 
       const uint64_t expiration =
           blob_index.HasTTL() ? blob_index.expiration() : kNoExpiration;
@@ -312,13 +294,13 @@ class BlobDBTest : public testing::Test {
   std::shared_ptr<MockSystemClock> mock_clock_;
   std::unique_ptr<Env> mock_env_;
   std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
-  BlobDB *blob_db_;
+  BlobDB* blob_db_;
+  BlobDBOptions bdb_options_;
 };  // class BlobDBTest
 
 TEST_F(BlobDBTest, Put) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   std::map<std::string, std::string> data;
@@ -334,8 +316,6 @@ TEST_F(BlobDBTest, PutWithTTL) {
   options.env = mock_env_.get();
   BlobDBOptions bdb_options;
   bdb_options.ttl_range_secs = 1000;
-  bdb_options.min_blob_size = 0;
-  bdb_options.blob_file_size = 256 * 1000 * 1000;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options, options);
   std::map<std::string, std::string> data;
@@ -346,33 +326,7 @@ TEST_F(BlobDBTest, PutWithTTL) {
                      (ttl <= 50 ? nullptr : &data));
   }
   mock_clock_->SetCurrentTime(100);
-  auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
-  auto blob_files = bdb_impl->TEST_GetBlobFiles();
-  ASSERT_EQ(1, blob_files.size());
-  ASSERT_TRUE(blob_files[0]->HasTTL());
-  ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
-  VerifyDB(data);
-}
-
-TEST_F(BlobDBTest, PutUntil) {
-  Random rnd(301);
-  Options options;
-  options.env = mock_env_.get();
-  BlobDBOptions bdb_options;
-  bdb_options.ttl_range_secs = 1000;
-  bdb_options.min_blob_size = 0;
-  bdb_options.blob_file_size = 256 * 1000 * 1000;
-  bdb_options.disable_background_tasks = true;
-  Open(bdb_options, options);
-  std::map<std::string, std::string> data;
-  mock_clock_->SetCurrentTime(50);
-  for (size_t i = 0; i < 100; i++) {
-    uint64_t expiration = rnd.Next() % 100 + 50;
-    PutRandomUntil("key" + std::to_string(i), expiration, &rnd,
-                   (expiration <= 100 ? nullptr : &data));
-  }
-  mock_clock_->SetCurrentTime(100);
-  auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
+  auto* bdb_impl = static_cast<BlobDBImpl*>(blob_db_);
   auto blob_files = bdb_impl->TEST_GetBlobFiles();
   ASSERT_EQ(1, blob_files.size());
   ASSERT_TRUE(blob_files[0]->HasTTL());
@@ -383,7 +337,6 @@ TEST_F(BlobDBTest, PutUntil) {
 TEST_F(BlobDBTest, StackableDBGet) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   std::map<std::string, std::string> data;
@@ -391,8 +344,8 @@ TEST_F(BlobDBTest, StackableDBGet) {
     PutRandom("key" + std::to_string(i), &rnd, &data);
   }
   for (size_t i = 0; i < 100; i++) {
-    StackableDB *db = blob_db_;
-    ColumnFamilyHandle *column_family = db->DefaultColumnFamily();
+    StackableDB* db = blob_db_;
+    ColumnFamilyHandle* column_family = db->DefaultColumnFamily();
     std::string key = "key" + std::to_string(i);
     PinnableSlice pinnable_value;
     ASSERT_OK(db->Get(ReadOptions(), column_family, key, &pinnable_value));
@@ -426,10 +379,9 @@ TEST_F(BlobDBTest, GetIOError) {
   Options options;
   options.env = fault_injection_env_.get();
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;  // Make sure value write to blob file
   bdb_options.disable_background_tasks = true;
   Open(bdb_options, options);
-  ColumnFamilyHandle *column_family = blob_db_->DefaultColumnFamily();
+  ColumnFamilyHandle* column_family = blob_db_->DefaultColumnFamily();
   PinnableSlice value;
   ASSERT_OK(Put("foo", "bar"));
   fault_injection_env_->SetFilesystemActive(false, Status::IOError());
@@ -443,7 +395,6 @@ TEST_F(BlobDBTest, PutIOError) {
   Options options;
   options.env = fault_injection_env_.get();
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;  // Make sure value write to blob file
   bdb_options.disable_background_tasks = true;
   Open(bdb_options, options);
   fault_injection_env_->SetFilesystemActive(false, Status::IOError());
@@ -455,7 +406,6 @@ TEST_F(BlobDBTest, PutIOError) {
 TEST_F(BlobDBTest, WriteBatch) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   std::map<std::string, std::string> data;
@@ -474,7 +424,6 @@ TEST_F(BlobDBTest, WriteBatch) {
 TEST_F(BlobDBTest, Delete) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   std::map<std::string, std::string> data;
@@ -490,7 +439,6 @@ TEST_F(BlobDBTest, Delete) {
 TEST_F(BlobDBTest, DeleteBatch) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   for (size_t i = 0; i < 100; i++) {
@@ -508,7 +456,6 @@ TEST_F(BlobDBTest, DeleteBatch) {
 TEST_F(BlobDBTest, Override) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   std::map<std::string, std::string> data;
@@ -522,208 +469,6 @@ TEST_F(BlobDBTest, Override) {
   VerifyDB(data);
 }
 
-#ifdef SNAPPY
-TEST_F(BlobDBTest, Compression) {
-  Random rnd(301);
-  BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
-  bdb_options.disable_background_tasks = true;
-  bdb_options.compression = CompressionType::kSnappyCompression;
-  Open(bdb_options);
-  std::map<std::string, std::string> data;
-  for (size_t i = 0; i < 100; i++) {
-    PutRandom("put-key" + std::to_string(i), &rnd, &data);
-  }
-  for (int i = 0; i < 100; i++) {
-    WriteBatch batch;
-    for (size_t j = 0; j < 10; j++) {
-      PutRandomToWriteBatch("write-batch-key" + std::to_string(j * 100 + i),
-                            &rnd, &batch, &data);
-    }
-    ASSERT_OK(blob_db_->Write(WriteOptions(), &batch));
-  }
-  VerifyDB(data);
-}
-
-TEST_F(BlobDBTest, DecompressAfterReopen) {
-  Random rnd(301);
-  BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
-  bdb_options.disable_background_tasks = true;
-  bdb_options.compression = CompressionType::kSnappyCompression;
-  Open(bdb_options);
-  std::map<std::string, std::string> data;
-  for (size_t i = 0; i < 100; i++) {
-    PutRandom("put-key" + std::to_string(i), &rnd, &data);
-  }
-  VerifyDB(data);
-  bdb_options.compression = CompressionType::kNoCompression;
-  Reopen(bdb_options);
-  VerifyDB(data);
-}
-
-TEST_F(BlobDBTest, EnableDisableCompressionGC) {
-  Random rnd(301);
-  BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
-  bdb_options.garbage_collection_cutoff = 1.0;
-  bdb_options.disable_background_tasks = true;
-  bdb_options.compression = kSnappyCompression;
-  Open(bdb_options);
-  std::map<std::string, std::string> data;
-  size_t data_idx = 0;
-  for (; data_idx < 100; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(1, blob_files.size());
-  ASSERT_EQ(kSnappyCompression, blob_files[0]->GetCompressionType());
-
-  // disable compression
-  bdb_options.compression = kNoCompression;
-  Reopen(bdb_options);
-
-  // Add more data with new compression type
-  for (; data_idx < 200; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(2, blob_files.size());
-  ASSERT_EQ(kNoCompression, blob_files[1]->GetCompressionType());
-
-  // Enable GC. If we do it earlier the snapshot release triggered compaction
-  // may compact files and trigger GC before we can verify there are two files.
-  bdb_options.enable_garbage_collection = true;
-  Reopen(bdb_options);
-
-  // Trigger compaction
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  VerifyDB(data);
-
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto &bfile : blob_files) {
-    ASSERT_EQ(kNoCompression, bfile->GetCompressionType());
-  }
-
-  // enabling the compression again
-  bdb_options.compression = kSnappyCompression;
-  Reopen(bdb_options);
-
-  // Add more data with new compression type
-  for (; data_idx < 300; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  // Trigger compaction
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  VerifyDB(data);
-
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto &bfile : blob_files) {
-    ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType());
-  }
-}
-
-#ifdef LZ4
-// Test switch compression types and run GC, it needs both Snappy and LZ4
-// support.
-TEST_F(BlobDBTest, ChangeCompressionGC) {
-  Random rnd(301);
-  BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
-  bdb_options.garbage_collection_cutoff = 1.0;
-  bdb_options.disable_background_tasks = true;
-  bdb_options.compression = kLZ4Compression;
-  Open(bdb_options);
-  std::map<std::string, std::string> data;
-  size_t data_idx = 0;
-  for (; data_idx < 100; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(1, blob_files.size());
-  ASSERT_EQ(kLZ4Compression, blob_files[0]->GetCompressionType());
-
-  // Change compression type
-  bdb_options.compression = kSnappyCompression;
-  Reopen(bdb_options);
-
-  // Add more data with Snappy compression type
-  for (; data_idx < 200; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  // Verify blob file compression type
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(2, blob_files.size());
-  ASSERT_EQ(kSnappyCompression, blob_files[1]->GetCompressionType());
-
-  // Enable GC. If we do it earlier the snapshot release triggered compaction
-  // may compact files and trigger GC before we can verify there are two files.
-  bdb_options.enable_garbage_collection = true;
-  Reopen(bdb_options);
-
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  VerifyDB(data);
-
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto &bfile : blob_files) {
-    ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType());
-  }
-
-  // Disable compression
-  bdb_options.compression = kNoCompression;
-  Reopen(bdb_options);
-  for (; data_idx < 300; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  VerifyDB(data);
-
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto &bfile : blob_files) {
-    ASSERT_EQ(kNoCompression, bfile->GetCompressionType());
-  }
-
-  // switching different compression types to generate mixed compression types
-  bdb_options.compression = kSnappyCompression;
-  Reopen(bdb_options);
-  for (; data_idx < 400; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  bdb_options.compression = kLZ4Compression;
-  Reopen(bdb_options);
-  for (; data_idx < 500; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  VerifyDB(data);
-
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto &bfile : blob_files) {
-    ASSERT_EQ(kLZ4Compression, bfile->GetCompressionType());
-  }
-}
-#endif  // LZ4
-#endif  // SNAPPY
-
 TEST_F(BlobDBTest, MultipleWriters) {
   Open(BlobDBOptions());
 
@@ -760,21 +505,19 @@ TEST_F(BlobDBTest, SstFileManager) {
   std::shared_ptr<SstFileManager> sst_file_manager(
       NewSstFileManager(mock_env_.get()));
   sst_file_manager->SetDeleteRateBytesPerSecond(1024 * 1024);
-  SstFileManagerImpl *sfm =
-      static_cast<SstFileManagerImpl *>(sst_file_manager.get());
+  SstFileManagerImpl* sfm =
+      static_cast<SstFileManagerImpl*>(sst_file_manager.get());
 
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.enable_garbage_collection = true;
-  bdb_options.garbage_collection_cutoff = 1.0;
   Options db_options;
 
   int files_scheduled_to_delete = 0;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "SstFileManagerImpl::ScheduleFileDeletion", [&](void *arg) {
+      "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
         assert(arg);
-        const std::string *const file_path =
-            static_cast<const std::string *>(arg);
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
         if (file_path->find(".blob") != std::string::npos) {
           ++files_scheduled_to_delete;
         }
@@ -784,12 +527,22 @@ TEST_F(BlobDBTest, SstFileManager) {
 
   Open(bdb_options, db_options);
 
-  // Create one obselete file and clean it.
+  // Create 4 blob files. With GC cutoff of 0.25, the oldest file (file 1)
+  // will be in the GC zone: floor(0.25 * 4) = 1.
   ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "bar"));
   auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
   ASSERT_EQ(1, blob_files.size());
   std::shared_ptr<BlobFile> bfile = blob_files[0];
   ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
+
+  // Create 3 more blob files (files 2-4, outside GC zone).
+  for (int i = 1; i < 4; i++) {
+    ASSERT_OK(blob_db_->Put(WriteOptions(), "key" + std::to_string(i), "val"));
+    blob_files = blob_db_impl()->TEST_GetBlobFiles();
+    ASSERT_EQ(static_cast<size_t>(i + 1), blob_files.size());
+    ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[i]));
+  }
+
   ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   blob_db_impl()->TEST_DeleteObsoleteFiles();
 
@@ -797,7 +550,8 @@ TEST_F(BlobDBTest, SstFileManager) {
   ASSERT_EQ(1, files_scheduled_to_delete);
   Destroy();
   // Make sure that DestroyBlobDB() also goes through delete scheduler.
-  ASSERT_EQ(2, files_scheduled_to_delete);
+  // Remaining files: 3 original (files 2-4) + 1 GC output file = 4 files.
+  ASSERT_EQ(5, files_scheduled_to_delete);
   SyncPoint::GetInstance()->DisableProcessing();
   sfm->WaitForEmptyTrash();
 }
@@ -805,10 +559,10 @@ TEST_F(BlobDBTest, SstFileManager) {
 TEST_F(BlobDBTest, SstFileManagerRestart) {
   int files_scheduled_to_delete = 0;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "SstFileManagerImpl::ScheduleFileDeletion", [&](void *arg) {
+      "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
         assert(arg);
-        const std::string *const file_path =
-            static_cast<const std::string *>(arg);
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
         if (file_path->find(".blob") != std::string::npos) {
           ++files_scheduled_to_delete;
         }
@@ -818,11 +572,10 @@ TEST_F(BlobDBTest, SstFileManagerRestart) {
   std::shared_ptr<SstFileManager> sst_file_manager(
       NewSstFileManager(mock_env_.get()));
   sst_file_manager->SetDeleteRateBytesPerSecond(1024 * 1024);
-  SstFileManagerImpl *sfm =
-      static_cast<SstFileManagerImpl *>(sst_file_manager.get());
+  SstFileManagerImpl* sfm =
+      static_cast<SstFileManagerImpl*>(sst_file_manager.get());
 
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   Options db_options;
 
   SyncPoint::GetInstance()->EnableProcessing();
@@ -834,7 +587,7 @@ TEST_F(BlobDBTest, SstFileManagerRestart) {
   Close();
 
   // Create 3 dummy trash files under the blob_dir
-  const auto &fs = db_options.env->GetFileSystem();
+  const auto& fs = db_options.env->GetFileSystem();
   ASSERT_OK(CreateFile(fs, blob_dir + "/000666.blob.trash", "", false));
   ASSERT_OK(CreateFile(fs, blob_dir + "/000888.blob.trash", "", true));
   ASSERT_OK(CreateFile(fs, blob_dir + "/something_not_match.trash", "", false));
@@ -849,7 +602,7 @@ TEST_F(BlobDBTest, SstFileManagerRestart) {
   std::vector<std::string> all_files;
   ASSERT_OK(db_options.env->GetChildren(blob_dir, &all_files));
   int nfiles = 0;
-  for (const auto &f : all_files) {
+  for (const auto& f : all_files) {
     assert(!f.empty());
     if (f[0] == '.') {
       continue;
@@ -863,22 +616,28 @@ TEST_F(BlobDBTest, SstFileManagerRestart) {
 
 TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.enable_garbage_collection = true;
-  bdb_options.garbage_collection_cutoff = 1.0;
   bdb_options.disable_background_tasks = true;
 
   Options options;
   options.disable_auto_compactions = true;
 
-  // i = when to take snapshot
+  // This test verifies that snapshots protect blob files from deletion during
+  // garbage collection. With fixed GC cutoff of 0.25 and 8 immutable files,
+  // floor(0.25 * 8) = 2 files are in the GC zone (files 1 and 2).
+  //
+  // We run 4 iterations with different snapshot timing:
+  //   i=0: snapshot after key1 (before key2) - protects file 1
+  //   i=1: snapshot after key2 (before key3) - protects files 1 and 2
+  //   i=2: snapshot after key9 (after all keys) - no protection needed
+  //   i=3: snapshot after Delete(key2) - no protection needed
   for (int i = 0; i < 4; i++) {
     Destroy();
     Open(bdb_options, options);
 
-    const Snapshot *snapshot = nullptr;
+    const Snapshot* snapshot = nullptr;
 
-    // First file
+    // Create first blob file (will be in GC zone).
     ASSERT_OK(Put("key1", "value"));
     if (i == 0) {
       snapshot = blob_db_->GetSnapshot();
@@ -888,7 +647,8 @@ TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
     ASSERT_EQ(1, blob_files.size());
     ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
 
-    // Second file
+    // Create second blob file (will be in GC zone). We track this file
+    // to verify it becomes obsolete after GC relocates its blob.
     ASSERT_OK(Put("key2", "value"));
     if (i == 1) {
       snapshot = blob_db_->GetSnapshot();
@@ -896,39 +656,66 @@ TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
 
     blob_files = blob_db_impl()->TEST_GetBlobFiles();
     ASSERT_EQ(2, blob_files.size());
-    auto bfile = blob_files[1];
-    ASSERT_FALSE(bfile->Immutable());
-    ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
+    auto gc_target_file = blob_files[1];
+    ASSERT_FALSE(gc_target_file->Immutable());
+    ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(gc_target_file));
+
+    // Create files 3-8, all closed (these are outside GC zone).
+    for (int j = 3; j <= 8; j++) {
+      ASSERT_OK(Put("key" + std::to_string(j), "value"));
+      blob_files = blob_db_impl()->TEST_GetBlobFiles();
+      ASSERT_EQ(static_cast<size_t>(j), blob_files.size());
+      ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[j - 1]));
+    }
 
-    // Third file
-    ASSERT_OK(Put("key3", "value"));
+    // Create file 9 but leave it open (mutable). Only immutable files are
+    // counted for GC cutoff calculation.
+    ASSERT_OK(Put("key9", "value"));
     if (i == 2) {
       snapshot = blob_db_->GetSnapshot();
     }
 
+    // Verify we have 9 total files (8 immutable + 1 mutable).
+    blob_files = blob_db_impl()->TEST_GetBlobFiles();
+    ASSERT_EQ(9, blob_files.size());
+
+    // Trigger GC via compaction. Blobs in files 1 and 2 will be relocated
+    // to a new GC output file.
     ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-    ASSERT_TRUE(bfile->Obsolete());
+
+    // Verify gc_target_file (file 2) is now obsolete.
+    ASSERT_TRUE(gc_target_file->Obsolete());
+    // Verify the obsolete sequence matches the latest sequence number.
     ASSERT_EQ(blob_db_->GetLatestSequenceNumber(),
-              bfile->GetObsoleteSequence());
+              gc_target_file->GetObsoleteSequence());
 
     Delete("key2");
     if (i == 3) {
       snapshot = blob_db_->GetSnapshot();
     }
 
-    ASSERT_EQ(4, blob_db_impl()->TEST_GetBlobFiles().size());
+    // Verify we now have 10 files (9 original + 1 GC output file).
+    // Files 1 and 2 are obsolete but not yet deleted.
+    ASSERT_EQ(10, blob_db_impl()->TEST_GetBlobFiles().size());
     blob_db_impl()->TEST_DeleteObsoleteFiles();
 
     if (i >= 2) {
-      // The snapshot shouldn't see data in bfile
-      ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+      // Snapshot was taken after all keys were written, so it sees the
+      // post-compaction state where blob indexes point to the GC output file.
+      // Obsolete files 1 and 2 can be deleted immediately.
+      // Verify 8 files remain (10 - 2 obsolete files deleted).
+      ASSERT_EQ(8, blob_db_impl()->TEST_GetBlobFiles().size());
       blob_db_->ReleaseSnapshot(snapshot);
     } else {
-      // The snapshot will see data in bfile, so the file shouldn't be deleted
-      ASSERT_EQ(4, blob_db_impl()->TEST_GetBlobFiles().size());
+      // Snapshot was taken before compaction completed, so it may still
+      // reference blobs in the obsolete files. Files cannot be deleted.
+      // Verify all 10 files still exist.
+      ASSERT_EQ(10, blob_db_impl()->TEST_GetBlobFiles().size());
       blob_db_->ReleaseSnapshot(snapshot);
       blob_db_impl()->TEST_DeleteObsoleteFiles();
-      ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+      // After releasing the snapshot, obsolete files can be deleted.
+      // Verify 8 files remain.
+      ASSERT_EQ(8, blob_db_impl()->TEST_GetBlobFiles().size());
     }
   }
 }
@@ -938,8 +725,8 @@ TEST_F(BlobDBTest, ColumnFamilyNotSupported) {
   options.env = mock_env_.get();
   mock_clock_->SetCurrentTime(0);
   Open(BlobDBOptions(), options);
-  ColumnFamilyHandle *default_handle = blob_db_->DefaultColumnFamily();
-  ColumnFamilyHandle *handle = nullptr;
+  ColumnFamilyHandle* default_handle = blob_db_->DefaultColumnFamily();
+  ColumnFamilyHandle* handle = nullptr;
   std::string value;
   std::vector<std::string> values;
   // The call simply pass through to base db. It should succeed.
@@ -948,8 +735,6 @@ TEST_F(BlobDBTest, ColumnFamilyNotSupported) {
   ASSERT_TRUE(blob_db_->Put(WriteOptions(), handle, "k", "v").IsNotSupported());
   ASSERT_TRUE(blob_db_->PutWithTTL(WriteOptions(), handle, "k", "v", 60)
                   .IsNotSupported());
-  ASSERT_TRUE(blob_db_->PutUntil(WriteOptions(), handle, "k", "v", 100)
-                  .IsNotSupported());
   WriteBatch batch;
   ASSERT_OK(batch.Put("k1", "v1"));
   ASSERT_OK(batch.Put(handle, "k2", "v2"));
@@ -970,10 +755,7 @@ TEST_F(BlobDBTest, GetLiveFilesMetaData) {
   Random rnd(301);
 
   BlobDBOptions bdb_options;
-  bdb_options.blob_dir = "blob_dir";
-  bdb_options.path_relative = true;
   bdb_options.ttl_range_secs = 10;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
 
   Options options;
@@ -986,8 +768,9 @@ TEST_F(BlobDBTest, GetLiveFilesMetaData) {
     PutRandom("key" + std::to_string(i), &rnd, &data);
   }
 
-  constexpr uint64_t expiration = 1000ULL;
-  PutRandomUntil("key100", expiration, &rnd, &data);
+  // At time 0, the stored expiration equals TTL
+  constexpr uint64_t ttl = 1000ULL;
+  PutRandomWithTTL("key100", ttl, &rnd, &data);
 
   std::vector<LiveFileMetaData> metadata;
   blob_db_->GetLiveFilesMetaData(&metadata);
@@ -1003,7 +786,7 @@ TEST_F(BlobDBTest, GetLiveFilesMetaData) {
   const std::string filename2("/blob_dir/000002.blob");
   ASSERT_EQ(filename2, metadata[1].name);
   ASSERT_EQ(2, metadata[1].file_number);
-  ASSERT_EQ(expiration, metadata[1].oldest_ancester_time);
+  ASSERT_EQ(ttl, metadata[1].oldest_ancester_time);
   ASSERT_EQ(kDefaultColumnFamilyName, metadata[1].column_family_name);
 
   std::vector<std::string> livefile;
@@ -1046,16 +829,15 @@ TEST_F(BlobDBTest, MigrateFromPlainRocksDB) {
   // Write to plain rocksdb.
   Options options;
   options.create_if_missing = true;
-  DB *db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, dbname_, &db));
   for (size_t i = 0; i < kNumIteration; i++) {
     auto key_index = rnd.Next() % kNumKey;
     std::string key = "key" + std::to_string(key_index);
-    PutRandom(db, key, &rnd, &data);
+    PutRandom(db.get(), key, &rnd, &data);
   }
-  VerifyDB(db, data);
-  delete db;
-  db = nullptr;
+  VerifyDB(db.get(), data);
+  db.reset();
 
   // Open as blob db. Verify it can read existing data.
   Open();
@@ -1085,7 +867,6 @@ TEST_F(BlobDBTest, MigrateFromPlainRocksDB) {
       ASSERT_EQ(data[key], value);
     }
   }
-  delete db;
 }
 
 // Test to verify that a NoSpace IOError Status is returned on reaching
@@ -1096,7 +877,6 @@ TEST_F(BlobDBTest, OutOfSpace) {
   options.env = mock_env_.get();
   BlobDBOptions bdb_options;
   bdb_options.max_db_size = 200;
-  bdb_options.is_fifo = false;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
 
@@ -1112,256 +892,13 @@ TEST_F(BlobDBTest, OutOfSpace) {
   ASSERT_TRUE(s.IsNoSpace());
 }
 
-TEST_F(BlobDBTest, FIFOEviction) {
-  BlobDBOptions bdb_options;
-  bdb_options.max_db_size = 200;
-  bdb_options.blob_file_size = 100;
-  bdb_options.is_fifo = true;
-  bdb_options.disable_background_tasks = true;
-  Open(bdb_options);
-
-  std::atomic<int> evict_count{0};
-  SyncPoint::GetInstance()->SetCallBack(
-      "BlobDBImpl::EvictOldestBlobFile:Evicted",
-      [&](void *) { evict_count++; });
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  // Each stored blob has an overhead of 32 bytes currently.
-  // So a 100 byte blob should take up 132 bytes.
-  std::string value(100, 'v');
-  ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 10));
-  VerifyDB({{"key1", value}});
-
-  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
-
-  // Adding another 100 bytes blob would take the total size to 264 bytes
-  // (2*132). max_db_size will be exceeded
-  // than max_db_size and trigger FIFO eviction.
-  ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60));
-  ASSERT_EQ(1, evict_count);
-  // key1 will exist until corresponding file be deleted.
-  VerifyDB({{"key1", value}, {"key2", value}});
-
-  // Adding another 100 bytes blob without TTL.
-  ASSERT_OK(blob_db_->Put(WriteOptions(), "key3", value));
-  ASSERT_EQ(2, evict_count);
-  // key1 and key2 will exist until corresponding file be deleted.
-  VerifyDB({{"key1", value}, {"key2", value}, {"key3", value}});
-
-  // The fourth blob file, without TTL.
-  ASSERT_OK(blob_db_->Put(WriteOptions(), "key4", value));
-  ASSERT_EQ(3, evict_count);
-  VerifyDB(
-      {{"key1", value}, {"key2", value}, {"key3", value}, {"key4", value}});
-
-  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(4, blob_files.size());
-  ASSERT_TRUE(blob_files[0]->Obsolete());
-  ASSERT_TRUE(blob_files[1]->Obsolete());
-  ASSERT_TRUE(blob_files[2]->Obsolete());
-  ASSERT_FALSE(blob_files[3]->Obsolete());
-  auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
-  ASSERT_EQ(3, obsolete_files.size());
-  ASSERT_EQ(blob_files[0], obsolete_files[0]);
-  ASSERT_EQ(blob_files[1], obsolete_files[1]);
-  ASSERT_EQ(blob_files[2], obsolete_files[2]);
-
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
-  ASSERT_TRUE(obsolete_files.empty());
-  VerifyDB({{"key4", value}});
-}
-
-TEST_F(BlobDBTest, FIFOEviction_NoOldestFileToEvict) {
-  Options options;
-  BlobDBOptions bdb_options;
-  bdb_options.max_db_size = 1000;
-  bdb_options.blob_file_size = 5000;
-  bdb_options.is_fifo = true;
-  bdb_options.disable_background_tasks = true;
-  Open(bdb_options);
-
-  std::atomic<int> evict_count{0};
-  SyncPoint::GetInstance()->SetCallBack(
-      "BlobDBImpl::EvictOldestBlobFile:Evicted",
-      [&](void *) { evict_count++; });
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  std::string value(2000, 'v');
-  ASSERT_TRUE(Put("foo", std::string(2000, 'v')).IsNoSpace());
-  ASSERT_EQ(0, evict_count);
-}
-
-TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) {
-  BlobDBOptions bdb_options;
-  bdb_options.is_fifo = true;
-  bdb_options.min_blob_size = 100;
-  bdb_options.disable_background_tasks = true;
-  Options options;
-  // Use mock env to stop wall clock.
-  options.env = mock_env_.get();
-  options.disable_auto_compactions = true;
-  auto statistics = CreateDBStatistics();
-  options.statistics = statistics;
-  Open(bdb_options, options);
-
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted",
-        "BlobDBTest.FIFOEviction_NoEnoughBlobFilesToEvict:AfterFlush"}});
-
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  ASSERT_EQ(0, blob_db_impl()->TEST_live_sst_size());
-  std::string small_value(50, 'v');
-  std::map<std::string, std::string> data;
-  // Insert some data into LSM tree to make sure FIFO eviction take SST
-  // file size into account.
-  for (int i = 0; i < 1000; i++) {
-    ASSERT_OK(Put("key" + std::to_string(i), small_value, &data));
-  }
-  ASSERT_OK(blob_db_->Flush(FlushOptions()));
-
-  uint64_t live_sst_size = 0;
-  ASSERT_TRUE(blob_db_->GetIntProperty(DB::Properties::kTotalSstFilesSize,
-                                       &live_sst_size));
-  ASSERT_TRUE(live_sst_size > 0);
-
-  TEST_SYNC_POINT(
-      "BlobDBTest.FIFOEviction_NoEnoughBlobFilesToEvict:AfterFlush");
-
-  ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size());
-
-  bdb_options.max_db_size = live_sst_size + 2000;
-  Reopen(bdb_options, options);
-  ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size());
-
-  std::string value_1k(1000, 'v');
-  ASSERT_OK(PutWithTTL("large_key1", value_1k, 60, &data));
-  ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  VerifyDB(data);
-  // large_key2 evicts large_key1
-  ASSERT_OK(PutWithTTL("large_key2", value_1k, 60, &data));
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  data.erase("large_key1");
-  VerifyDB(data);
-  // large_key3 get no enough space even after evicting large_key2, so it
-  // instead return no space error.
-  std::string value_2k(2000, 'v');
-  ASSERT_TRUE(PutWithTTL("large_key3", value_2k, 60).IsNoSpace());
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  // Verify large_key2 still exists.
-  VerifyDB(data);
-
-  SyncPoint::GetInstance()->DisableProcessing();
-}
-
-// Test flush or compaction will trigger FIFO eviction since they update
-// total SST file size.
-TEST_F(BlobDBTest, FIFOEviction_TriggerOnSSTSizeChange) {
-  BlobDBOptions bdb_options;
-  bdb_options.max_db_size = 1000;
-  bdb_options.is_fifo = true;
-  bdb_options.min_blob_size = 100;
-  bdb_options.disable_background_tasks = true;
-  Options options;
-  // Use mock env to stop wall clock.
-  options.env = mock_env_.get();
-  auto statistics = CreateDBStatistics();
-  options.statistics = statistics;
-  options.compression = kNoCompression;
-  Open(bdb_options, options);
-
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted",
-        "BlobDBTest.FIFOEviction_TriggerOnSSTSizeChange:AfterFlush"}});
-
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  std::string value(800, 'v');
-  ASSERT_OK(PutWithTTL("large_key", value, 60));
-  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
-  ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  VerifyDB({{"large_key", value}});
-
-  // Insert some small keys and flush to bring DB out of space.
-  std::map<std::string, std::string> data;
-  for (int i = 0; i < 10; i++) {
-    ASSERT_OK(Put("key" + std::to_string(i), "v", &data));
-  }
-  ASSERT_OK(blob_db_->Flush(FlushOptions()));
-
-  TEST_SYNC_POINT("BlobDBTest.FIFOEviction_TriggerOnSSTSizeChange:AfterFlush");
-
-  // Verify large_key is deleted by FIFO eviction.
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size());
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  VerifyDB(data);
-
-  SyncPoint::GetInstance()->DisableProcessing();
-}
-
-TEST_F(BlobDBTest, InlineSmallValues) {
-  constexpr uint64_t kMaxExpiration = 1000;
-  Random rnd(301);
-  BlobDBOptions bdb_options;
-  bdb_options.ttl_range_secs = kMaxExpiration;
-  bdb_options.min_blob_size = 100;
-  bdb_options.blob_file_size = 256 * 1000 * 1000;
-  bdb_options.disable_background_tasks = true;
-  Options options;
-  options.env = mock_env_.get();
-  mock_clock_->SetCurrentTime(0);
-  Open(bdb_options, options);
-  std::map<std::string, std::string> data;
-  std::map<std::string, KeyVersion> versions;
-  for (size_t i = 0; i < 1000; i++) {
-    bool is_small_value = rnd.Next() % 2;
-    bool has_ttl = rnd.Next() % 2;
-    uint64_t expiration = rnd.Next() % kMaxExpiration;
-    int len = is_small_value ? 50 : 200;
-    std::string key = "key" + std::to_string(i);
-    std::string value = rnd.HumanReadableString(len);
-    std::string blob_index;
-    data[key] = value;
-    SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
-    if (!has_ttl) {
-      ASSERT_OK(blob_db_->Put(WriteOptions(), key, value));
-    } else {
-      ASSERT_OK(blob_db_->PutUntil(WriteOptions(), key, value, expiration));
-    }
-    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
-    versions[key] =
-        KeyVersion(key, value, sequence,
-                   (is_small_value && !has_ttl) ? kTypeValue : kTypeBlobIndex);
-  }
-  VerifyDB(data);
-  VerifyBaseDB(versions);
-  auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
-  auto blob_files = bdb_impl->TEST_GetBlobFiles();
-  ASSERT_EQ(2, blob_files.size());
-  std::shared_ptr<BlobFile> non_ttl_file;
-  std::shared_ptr<BlobFile> ttl_file;
-  if (blob_files[0]->HasTTL()) {
-    ttl_file = blob_files[0];
-    non_ttl_file = blob_files[1];
-  } else {
-    non_ttl_file = blob_files[0];
-    ttl_file = blob_files[1];
-  }
-  ASSERT_FALSE(non_ttl_file->HasTTL());
-  ASSERT_TRUE(ttl_file->HasTTL());
-}
-
 TEST_F(BlobDBTest, UserCompactionFilter) {
   class CustomerFilter : public CompactionFilter {
    public:
-    bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value,
-                std::string *new_value, bool *value_changed) const override {
+    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
+                std::string* new_value, bool* value_changed) const override {
       *value_changed = false;
-      // changing value size to test value transitions between inlined data
-      // and stored-in-blob data
+      // Test compaction filter modifying blob values
       if (value.size() % 4 == 1) {
         *new_value = value.ToString();
         // double size by duplicating value
@@ -1380,31 +917,23 @@ TEST_F(BlobDBTest, UserCompactionFilter) {
       return false;
     }
     bool IgnoreSnapshots() const override { return true; }
-    const char *Name() const override { return "CustomerFilter"; }
+    const char* Name() const override { return "CustomerFilter"; }
   };
   class CustomerFilterFactory : public CompactionFilterFactory {
-    const char *Name() const override { return "CustomerFilterFactory"; }
+    const char* Name() const override { return "CustomerFilterFactory"; }
     std::unique_ptr<CompactionFilter> CreateCompactionFilter(
-        const CompactionFilter::Context & /*context*/) override {
+        const CompactionFilter::Context& /*context*/) override {
       return std::unique_ptr<CompactionFilter>(new CustomerFilter());
     }
   };
 
   constexpr size_t kNumPuts = 1 << 10;
-  // Generate both inlined and blob value
   constexpr uint64_t kMinValueSize = 1 << 6;
   constexpr uint64_t kMaxValueSize = 1 << 8;
-  constexpr uint64_t kMinBlobSize = 1 << 7;
-  static_assert(kMinValueSize < kMinBlobSize);
-  static_assert(kMaxValueSize > kMinBlobSize);
 
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = kMinBlobSize;
   bdb_options.blob_file_size = kMaxValueSize * 10;
   bdb_options.disable_background_tasks = true;
-  if (Snappy_Supported()) {
-    bdb_options.compression = CompressionType::kSnappyCompression;
-  }
   // case_num == 0: Test user defined compaction filter
   // case_num == 1: Test user defined compaction filter factory
   for (int case_num = 0; case_num < 2; case_num++) {
@@ -1467,24 +996,22 @@ TEST_F(BlobDBTest, UserCompactionFilter) {
 TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) {
   class CustomerFilter : public CompactionFilter {
    public:
-    bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value,
-                std::string *new_value, bool *value_changed) const override {
+    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
+                std::string* new_value, bool* value_changed) const override {
       *new_value = value.ToString() + "_new";
       *value_changed = true;
       return false;
     }
     bool IgnoreSnapshots() const override { return true; }
-    const char *Name() const override { return "CustomerFilter"; }
+    const char* Name() const override { return "CustomerFilter"; }
   };
 
   constexpr size_t kNumPuts = 100;
   constexpr int kValueSize = 100;
 
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.blob_file_size = kValueSize * 10;
   bdb_options.disable_background_tasks = true;
-  bdb_options.compression = CompressionType::kNoCompression;
 
   std::vector<std::string> io_failure_cases = {
       "BlobDBImpl::CreateBlobFileAndWriter",
@@ -1518,7 +1045,7 @@ TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) {
     VerifyDB(data);
 
     SyncPoint::GetInstance()->SetCallBack(
-        io_failure_cases[case_num], [&](void * /*arg*/) {
+        io_failure_cases[case_num], [&](void* /*arg*/) {
           fault_injection_env_->SetFilesystemActive(false, Status::IOError());
         });
     SyncPoint::GetInstance()->EnableProcessing();
@@ -1542,13 +1069,11 @@ TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) {
 TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
   constexpr size_t kNumKeys = 100;
   constexpr size_t kNumPuts = 1000;
-  constexpr uint64_t kMaxExpiration = 1000;
+  constexpr uint64_t kMaxTTL = 1000;
   constexpr uint64_t kCompactTime = 500;
-  constexpr uint64_t kMinBlobSize = 100;
   Random rnd(301);
   mock_clock_->SetCurrentTime(0);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = kMinBlobSize;
   bdb_options.disable_background_tasks = true;
   Options options;
   options.env = mock_env_.get();
@@ -1557,25 +1082,18 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
   std::map<std::string, std::string> data;
   std::map<std::string, std::string> data_after_compact;
   for (size_t i = 0; i < kNumPuts; i++) {
-    bool is_small_value = rnd.Next() % 2;
     bool has_ttl = rnd.Next() % 2;
-    uint64_t expiration = rnd.Next() % kMaxExpiration;
-    int len = is_small_value ? 10 : 200;
+    // At time 0, stored expiration equals TTL
+    uint64_t ttl = rnd.Next() % kMaxTTL;
+    int len = rnd.Next() % 200 + 10;
     std::string key = "key" + std::to_string(rnd.Next() % kNumKeys);
     std::string value = rnd.HumanReadableString(len);
     if (!has_ttl) {
-      if (is_small_value) {
-        std::string blob_entry;
-        BlobIndex::EncodeInlinedTTL(&blob_entry, expiration, value);
-        // Fake blob index with TTL. See what it will do.
-        ASSERT_GT(kMinBlobSize, blob_entry.size());
-        value = blob_entry;
-      }
       ASSERT_OK(Put(key, value));
       data_after_compact[key] = value;
     } else {
-      ASSERT_OK(PutUntil(key, value, expiration));
-      if (expiration <= kCompactTime) {
+      ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), key, value, ttl));
+      if (ttl <= kCompactTime) {
         data_after_compact.erase(key);
       } else {
         data_after_compact[key] = value;
@@ -1588,16 +1106,16 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
   mock_clock_->SetCurrentTime(kCompactTime);
   // Take a snapshot before compaction. Make sure expired blob indexes is
   // filtered regardless of snapshot.
-  const Snapshot *snapshot = blob_db_->GetSnapshot();
+  const Snapshot* snapshot = blob_db_->GetSnapshot();
   // Issue manual compaction to trigger compaction filter.
   ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   blob_db_->ReleaseSnapshot(snapshot);
   // Verify expired blob index are filtered.
   std::vector<KeyVersion> versions;
   const size_t kMaxKeys = 10000;
-  ASSERT_OK(GetAllKeyVersions(blob_db_, "", "", kMaxKeys, &versions));
+  ASSERT_OK(GetAllKeyVersions(blob_db_, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(data_after_compact.size(), versions.size());
-  for (auto &version : versions) {
+  for (auto& version : versions) {
     ASSERT_TRUE(data_after_compact.count(version.user_key) > 0);
   }
   VerifyDB(data_after_compact);
@@ -1607,7 +1125,6 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
 // blob file has been removed.
 TEST_F(BlobDBTest, FilterFileNotAvailable) {
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Options options;
   options.disable_auto_compactions = true;
@@ -1627,16 +1144,16 @@ TEST_F(BlobDBTest, FilterFileNotAvailable) {
 
   const size_t kMaxKeys = 10000;
 
-  DB *base_db = blob_db_->GetRootDB();
+  DB* base_db = blob_db_->GetRootDB();
   std::vector<KeyVersion> versions;
-  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_OK(GetAllKeyVersions(base_db, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(2, versions.size());
   ASSERT_EQ("bar", versions[0].user_key);
   ASSERT_EQ("foo", versions[1].user_key);
   VerifyDB({{"bar", "v2"}, {"foo", "v1"}});
 
   ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_OK(GetAllKeyVersions(base_db, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(2, versions.size());
   ASSERT_EQ("bar", versions[0].user_key);
   ASSERT_EQ("foo", versions[1].user_key);
@@ -1646,7 +1163,7 @@ TEST_F(BlobDBTest, FilterFileNotAvailable) {
   blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[0]);
   blob_db_impl()->TEST_DeleteObsoleteFiles();
   ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_OK(GetAllKeyVersions(base_db, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(1, versions.size());
   ASSERT_EQ("bar", versions[0].user_key);
   VerifyDB({{"bar", "v2"}});
@@ -1655,132 +1172,30 @@ TEST_F(BlobDBTest, FilterFileNotAvailable) {
   blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[1]);
   blob_db_impl()->TEST_DeleteObsoleteFiles();
   ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_OK(GetAllKeyVersions(base_db, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(0, versions.size());
   VerifyDB({});
 }
 
-// Test compaction filter should filter any inlined TTL keys that would have
-// been dropped by last FIFO eviction if they are store out-of-line.
-TEST_F(BlobDBTest, FilterForFIFOEviction) {
-  Random rnd(215);
-  BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 100;
-  bdb_options.ttl_range_secs = 60;
-  bdb_options.max_db_size = 0;
-  bdb_options.disable_background_tasks = true;
-  Options options;
-  // Use mock env to stop wall clock.
-  mock_clock_->SetCurrentTime(0);
-  options.env = mock_env_.get();
-  auto statistics = CreateDBStatistics();
-  options.statistics = statistics;
-  options.disable_auto_compactions = true;
-  Open(bdb_options, options);
-
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted",
-        "BlobDBTest.FilterForFIFOEviction:AfterFlush"}});
-
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  std::map<std::string, std::string> data;
-  std::map<std::string, std::string> data_after_compact;
-  // Insert some small values that will be inlined.
-  for (int i = 0; i < 1000; i++) {
-    std::string key = "key" + std::to_string(i);
-    std::string value = rnd.HumanReadableString(50);
-    uint64_t ttl = rnd.Next() % 120 + 1;
-    ASSERT_OK(PutWithTTL(key, value, ttl, &data));
-    if (ttl >= 60) {
-      data_after_compact[key] = value;
-    }
-  }
-  uint64_t num_keys_to_evict = data.size() - data_after_compact.size();
-  ASSERT_OK(blob_db_->Flush(FlushOptions()));
-
-  TEST_SYNC_POINT("BlobDBTest.FilterForFIFOEviction:AfterFlush");
-
-  uint64_t live_sst_size = blob_db_impl()->TEST_live_sst_size();
-  ASSERT_GT(live_sst_size, 0);
-  VerifyDB(data);
-
-  bdb_options.max_db_size = live_sst_size + 30000;
-  bdb_options.is_fifo = true;
-  Reopen(bdb_options, options);
-  VerifyDB(data);
-
-  // Put two large values, each on a different blob file.
-  std::string large_value(10000, 'v');
-  ASSERT_OK(PutWithTTL("large_key1", large_value, 90));
-  ASSERT_OK(PutWithTTL("large_key2", large_value, 150));
-  ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
-  ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  data["large_key1"] = large_value;
-  data["large_key2"] = large_value;
-  VerifyDB(data);
-
-  // Put a third large value which will bring the DB out of space.
-  // FIFO eviction will evict the file of large_key1.
-  ASSERT_OK(PutWithTTL("large_key3", large_value, 150));
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
-  data.erase("large_key1");
-  data["large_key3"] = large_value;
-  VerifyDB(data);
-
-  // Putting some more small values. These values shouldn't be evicted by
-  // compaction filter since they are inserted after FIFO eviction.
-  ASSERT_OK(PutWithTTL("foo", "v", 30, &data_after_compact));
-  ASSERT_OK(PutWithTTL("bar", "v", 30, &data_after_compact));
-
-  // FIFO eviction doesn't trigger again since there enough room for the flush.
-  ASSERT_OK(blob_db_->Flush(FlushOptions()));
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-
-  // Manual compact and check if compaction filter evict those keys with
-  // expiration < 60.
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  // All keys with expiration < 60, plus large_key1 is filtered by
-  // compaction filter.
-  ASSERT_EQ(num_keys_to_evict + 1,
-            statistics->getTickerCount(BLOB_DB_BLOB_INDEX_EVICTED_COUNT));
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
-  data_after_compact["large_key2"] = large_value;
-  data_after_compact["large_key3"] = large_value;
-  VerifyDB(data_after_compact);
-
-  SyncPoint::GetInstance()->DisableProcessing();
-}
-
 TEST_F(BlobDBTest, GarbageCollection) {
   constexpr size_t kNumPuts = 1 << 10;
 
-  constexpr uint64_t kExpiration = 1000;
+  // At time 0, stored expiration equals TTL
+  constexpr uint64_t kTTL = 1000;
   constexpr uint64_t kCompactTime = 500;
 
   constexpr uint64_t kKeySize = 7;  // "key" + 4 digits
-
-  constexpr uint64_t kSmallValueSize = 1 << 6;
-  constexpr uint64_t kLargeValueSize = 1 << 8;
-  constexpr uint64_t kMinBlobSize = 1 << 7;
-  static_assert(kSmallValueSize < kMinBlobSize);
-  static_assert(kLargeValueSize > kMinBlobSize);
+  constexpr uint64_t kValueSize = 1 << 8;
 
   constexpr size_t kBlobsPerFile = 8;
   constexpr size_t kNumBlobFiles = kNumPuts / kBlobsPerFile;
   constexpr uint64_t kBlobFileSize =
       BlobLogHeader::kSize +
-      (BlobLogRecord::kHeaderSize + kKeySize + kLargeValueSize) * kBlobsPerFile;
+      (BlobLogRecord::kHeaderSize + kKeySize + kValueSize) * kBlobsPerFile;
 
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = kMinBlobSize;
   bdb_options.blob_file_size = kBlobFileSize;
   bdb_options.enable_garbage_collection = true;
-  bdb_options.garbage_collection_cutoff = 0.25;
   bdb_options.disable_background_tasks = true;
 
   Options options;
@@ -1795,14 +1210,14 @@ TEST_F(BlobDBTest, GarbageCollection) {
 
   Random rnd(301);
 
-  // Add a bunch of large non-TTL values. These will be written to non-TTL
+  // Add a bunch of non-TTL values. These will be written to non-TTL
   // blob files and will be subject to GC.
   for (size_t i = 0; i < kNumPuts; ++i) {
     std::ostringstream oss;
     oss << "key" << std::setw(4) << std::setfill('0') << i;
 
     const std::string key(oss.str());
-    const std::string value = rnd.HumanReadableString(kLargeValueSize);
+    const std::string value = rnd.HumanReadableString(kValueSize);
     const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
 
     ASSERT_OK(Put(key, value));
@@ -1815,54 +1230,23 @@ TEST_F(BlobDBTest, GarbageCollection) {
                          sequence, kTypeBlobIndex);
   }
 
-  // Add some small and/or TTL values that will be ignored during GC.
-  // First, add a large TTL value will be written to its own TTL blob file.
+  // Add a TTL value that will be written to its own TTL blob file (ignored
+  // during GC).
   {
     const std::string key("key2000");
-    const std::string value = rnd.HumanReadableString(kLargeValueSize);
+    const std::string value = rnd.HumanReadableString(kValueSize);
     const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
 
-    ASSERT_OK(PutUntil(key, value, kExpiration));
+    ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), key, value, kTTL));
     ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
 
     data[key] = value;
     blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex);
     blob_index_versions[key] =
-        BlobIndexVersion(key, /* file_number */ kNumBlobFiles + 1, kExpiration,
+        BlobIndexVersion(key, /* file_number */ kNumBlobFiles + 1, kTTL,
                          sequence, kTypeBlobIndex);
   }
 
-  // Now add a small TTL value (which will be inlined).
-  {
-    const std::string key("key3000");
-    const std::string value = rnd.HumanReadableString(kSmallValueSize);
-    const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
-
-    ASSERT_OK(PutUntil(key, value, kExpiration));
-    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
-
-    data[key] = value;
-    blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex);
-    blob_index_versions[key] = BlobIndexVersion(
-        key, kInvalidBlobFileNumber, kExpiration, sequence, kTypeBlobIndex);
-  }
-
-  // Finally, add a small non-TTL value (which will be stored as a regular
-  // value).
-  {
-    const std::string key("key4000");
-    const std::string value = rnd.HumanReadableString(kSmallValueSize);
-    const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
-
-    ASSERT_OK(Put(key, value));
-    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
-
-    data[key] = value;
-    blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeValue);
-    blob_index_versions[key] = BlobIndexVersion(
-        key, kInvalidBlobFileNumber, kNoExpiration, sequence, kTypeValue);
-  }
-
   VerifyDB(data);
   VerifyBaseDB(blob_value_versions);
   VerifyBaseDBBlobIndex(blob_index_versions);
@@ -1888,17 +1272,18 @@ TEST_F(BlobDBTest, GarbageCollection) {
   // compaction.
   VerifyDB(data);
 
-  for (auto &pair : blob_value_versions) {
-    KeyVersion &version = pair.second;
+  for (auto& pair : blob_value_versions) {
+    KeyVersion& version = pair.second;
     version.sequence = 0;
   }
 
   VerifyBaseDB(blob_value_versions);
 
-  const uint64_t cutoff = static_cast<uint64_t>(
-      bdb_options.garbage_collection_cutoff * kNumBlobFiles);
-  for (auto &pair : blob_index_versions) {
-    BlobIndexVersion &version = pair.second;
+  // GC cutoff is fixed at 0.25
+  constexpr double kGCCutoff = 0.25;
+  const uint64_t cutoff = static_cast<uint64_t>(kGCCutoff * kNumBlobFiles);
+  for (auto& pair : blob_index_versions) {
+    BlobIndexVersion& version = pair.second;
 
     version.sequence = 0;
 
@@ -1915,7 +1300,7 @@ TEST_F(BlobDBTest, GarbageCollection) {
 
   VerifyBaseDBBlobIndex(blob_index_versions);
 
-  const Statistics *const statistics = options.statistics.get();
+  const Statistics* const statistics = options.statistics.get();
   assert(statistics);
 
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_FILES), cutoff);
@@ -1924,7 +1309,7 @@ TEST_F(BlobDBTest, GarbageCollection) {
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_KEYS_RELOCATED),
             cutoff * kBlobsPerFile);
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_BYTES_RELOCATED),
-            cutoff * kBlobsPerFile * kLargeValueSize);
+            cutoff * kBlobsPerFile * kValueSize);
 
   // At this point, we should have 128 immutable non-TTL files with file numbers
   // 33..128 and 130..161. (129 was taken by the TTL blob file.)
@@ -1946,9 +1331,7 @@ TEST_F(BlobDBTest, GarbageCollection) {
 
 TEST_F(BlobDBTest, GarbageCollectionFailure) {
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.enable_garbage_collection = true;
-  bdb_options.garbage_collection_cutoff = 1.0;
   bdb_options.disable_background_tasks = true;
 
   Options db_options;
@@ -1956,14 +1339,31 @@ TEST_F(BlobDBTest, GarbageCollectionFailure) {
 
   Open(bdb_options, db_options);
 
-  // Write a couple of valid blobs.
+  // Create 4 blob files. With fixed GC cutoff of 0.25, the oldest file
+  // (floor(0.25 * 4) = 1) will be in the GC zone.
+  // The first file contains valid blobs for "foo" and "dead".
   ASSERT_OK(Put("foo", "bar"));
   ASSERT_OK(Put("dead", "beef"));
 
-  // Write a fake blob reference into the base DB that points to a non-existing
-  // blob file.
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(blob_files.size(), 1);
+  auto first_file = blob_files[0];
+  uint64_t first_file_number = first_file->BlobFileNumber();
+  ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(first_file));
+
+  // Create 3 more blob files (files 2-4, outside GC zone).
+  for (int i = 1; i < 4; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), "value"));
+    blob_files = blob_db_impl()->TEST_GetBlobFiles();
+    ASSERT_EQ(static_cast<size_t>(i + 1), blob_files.size());
+    ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[i]));
+  }
+
+  // Write a fake blob index that points to the first file (in GC zone)
+  // but with an invalid offset beyond the file size. This will cause
+  // GC to fail when it tries to read this blob.
   std::string blob_index;
-  BlobIndex::EncodeBlob(&blob_index, /* file_number */ 1000, /* offset */ 1234,
+  BlobIndex::EncodeBlob(&blob_index, first_file_number, /* offset */ 999999,
                         /* size */ 5678, kNoCompression);
 
   WriteBatch batch;
@@ -1971,17 +1371,17 @@ TEST_F(BlobDBTest, GarbageCollectionFailure) {
       &batch, blob_db_->DefaultColumnFamily()->GetID(), "key", blob_index));
   ASSERT_OK(blob_db_->GetRootDB()->Write(WriteOptions(), &batch));
 
-  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(blob_files.size(), 1);
-  auto blob_file = blob_files[0];
-  ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_file));
-
+  // Verify compaction fails with IO error due to invalid blob offset.
   ASSERT_TRUE(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
                   .IsIOError());
 
-  const Statistics *const statistics = db_options.statistics.get();
+  const Statistics* const statistics = db_options.statistics.get();
   assert(statistics);
 
+  // Verify GC statistics:
+  // - Relocated 2 keys ("foo" and "dead") with 7 bytes ("bar" + "beef")
+  // - Failed on "key" which has invalid blob offset
+  // - Created 1 new GC output file before failing
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_FILES), 0);
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_NEW_FILES), 1);
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_FAILURES), 1);
@@ -1993,7 +1393,6 @@ TEST_F(BlobDBTest, GarbageCollectionFailure) {
 TEST_F(BlobDBTest, EvictExpiredFile) {
   BlobDBOptions bdb_options;
   bdb_options.ttl_range_secs = 100;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Options options;
   options.env = mock_env_.get();
@@ -2116,7 +1515,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
     const std::vector<bool> expected_obsolete{false, false, false, false,
                                               false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2144,7 +1543,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
     const std::vector<bool> expected_obsolete{false, false, false, false,
                                               false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2173,7 +1572,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
     const std::vector<bool> expected_obsolete{false, false, false, false,
                                               false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2211,7 +1610,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
         {}, {7}, {3, 8, 23}, {4, 9}, {5, 10, 22}};
     const std::vector<bool> expected_obsolete{true, false, false, false, false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2241,7 +1640,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
         {}, {7}, {3, 8, 23}, {4, 9}, {5, 10, 22}};
     const std::vector<bool> expected_obsolete{true, false, false, false, false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2272,7 +1671,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
         {}, {}, {3, 8, 23, 25}, {4, 9}, {5, 10}};
     const std::vector<bool> expected_obsolete{true, false, false, false, false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2302,7 +1701,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
         {}, {}, {3, 8, 23, 25}, {4, 9}, {5, 10}};
     const std::vector<bool> expected_obsolete{true, true, false, false, false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2323,7 +1722,6 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
 TEST_F(BlobDBTest, ShutdownWait) {
   BlobDBOptions bdb_options;
   bdb_options.ttl_range_secs = 100;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = false;
   Options options;
   options.env = mock_env_.get();
@@ -2336,15 +1734,15 @@ TEST_F(BlobDBTest, ShutdownWait) {
   });
   // Force all tasks to be scheduled immediately.
   SyncPoint::GetInstance()->SetCallBack(
-      "TimeQueue::Add:item.end", [&](void *arg) {
-        std::chrono::steady_clock::time_point *tp =
-            static_cast<std::chrono::steady_clock::time_point *>(arg);
+      "TimeQueue::Add:item.end", [&](void* arg) {
+        std::chrono::steady_clock::time_point* tp =
+            static_cast<std::chrono::steady_clock::time_point*>(arg);
         *tp =
             std::chrono::steady_clock::now() - std::chrono::milliseconds(10000);
       });
 
   SyncPoint::GetInstance()->SetCallBack(
-      "BlobDBImpl::EvictExpiredFiles:cb", [&](void * /*arg*/) {
+      "BlobDBImpl::EvictExpiredFiles:cb", [&](void* /*arg*/) {
         // Sleep 3 ms to increase the chance of data race.
         // We've synced up the code so that EvictExpiredFiles()
         // is called concurrently with ~BlobDBImpl().
@@ -2387,8 +1785,6 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeClose) {
   options.statistics = CreateDBStatistics();
 
   BlobDBOptions blob_options;
-  blob_options.min_blob_size = 0;
-  blob_options.bytes_per_sync = 1 << 20;
   blob_options.disable_background_tasks = true;
 
   Open(blob_options, options);
@@ -2407,8 +1803,6 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) {
   options.env = fault_injection_env_.get();
 
   BlobDBOptions blob_options;
-  blob_options.min_blob_size = 0;
-  blob_options.bytes_per_sync = 1 << 20;
   blob_options.disable_background_tasks = true;
 
   Open(blob_options, options);
@@ -2419,7 +1813,7 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) {
   ASSERT_EQ(blob_files.size(), 1);
 
   SyncPoint::GetInstance()->SetCallBack(
-      "BlobLogWriter::Sync", [this](void * /* arg */) {
+      "BlobLogWriter::Sync", [this](void* /* arg */) {
         fault_injection_env_->SetFilesystemActive(false, Status::IOError());
       });
   SyncPoint::GetInstance()->EnableProcessing();
@@ -2436,7 +1830,7 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) {
 }  // namespace ROCKSDB_NAMESPACE::blob_db
 
 // A black-box test for the ttl wrapper around rocksdb
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/utilities/blob_db/blob_dump_tool.cc b/utilities/blob_db/blob_dump_tool.cc
index 933803f8f30d..535b36fdfa11 100644
--- a/utilities/blob_db/blob_dump_tool.cc
+++ b/utilities/blob_db/blob_dump_tool.cc
@@ -16,7 +16,6 @@
 #include "port/port.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/file_system.h"
-#include "table/format.h"
 #include "util/coding.h"
 #include "util/string_util.h"
 
@@ -26,9 +25,7 @@ BlobDumpTool::BlobDumpTool()
     : reader_(nullptr), buffer_(nullptr), buffer_size_(0) {}
 
 Status BlobDumpTool::Run(const std::string& filename, DisplayType show_key,
-                         DisplayType show_blob,
-                         DisplayType show_uncompressed_blob,
-                         bool show_summary) {
+                         DisplayType show_blob, bool show_summary) {
   constexpr size_t kReadaheadSize = 2 * 1024 * 1024;
   Status s;
   const auto fs = FileSystem::Default();
@@ -54,8 +51,7 @@ Status BlobDumpTool::Run(const std::string& filename, DisplayType show_key,
   reader_.reset(new RandomAccessFileReader(std::move(file), filename));
   uint64_t offset = 0;
   uint64_t footer_offset = 0;
-  CompressionType compression = kNoCompression;
-  s = DumpBlobLogHeader(&offset, &compression);
+  s = DumpBlobLogHeader(&offset);
   if (!s.ok()) {
     return s;
   }
@@ -66,12 +62,10 @@ Status BlobDumpTool::Run(const std::string& filename, DisplayType show_key,
   uint64_t total_records = 0;
   uint64_t total_key_size = 0;
   uint64_t total_blob_size = 0;
-  uint64_t total_uncompressed_blob_size = 0;
-  if (show_key != DisplayType::kNone || show_summary) {
+  if (show_key != DisplayType::kNone) {
     while (offset < footer_offset) {
-      s = DumpRecord(show_key, show_blob, show_uncompressed_blob, show_summary,
-                     compression, &offset, &total_records, &total_key_size,
-                     &total_blob_size, &total_uncompressed_blob_size);
+      s = DumpRecord(show_key, show_blob, &offset, &total_records,
+                     &total_key_size, &total_blob_size);
       if (!s.ok()) {
         break;
       }
@@ -82,10 +76,6 @@ Status BlobDumpTool::Run(const std::string& filename, DisplayType show_key,
     fprintf(stdout, "  total records: %" PRIu64 "\n", total_records);
     fprintf(stdout, "  total key size: %" PRIu64 "\n", total_key_size);
     fprintf(stdout, "  total blob size: %" PRIu64 "\n", total_blob_size);
-    if (compression != kNoCompression) {
-      fprintf(stdout, "  total raw blob size: %" PRIu64 "\n",
-              total_uncompressed_blob_size);
-    }
   }
   return s;
 }
@@ -111,8 +101,7 @@ Status BlobDumpTool::Read(uint64_t offset, size_t size, Slice* result) {
   return s;
 }
 
-Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset,
-                                       CompressionType* compression) {
+Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset) {
   Slice slice;
   Status s = Read(0, BlobLogHeader::kSize, &slice);
   if (!s.ok()) {
@@ -127,17 +116,10 @@ Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset,
   fprintf(stdout, "  Version          : %" PRIu32 "\n", header.version);
   fprintf(stdout, "  Column Family ID : %" PRIu32 "\n",
           header.column_family_id);
-  std::string compression_str;
-  if (!GetStringFromCompressionType(&compression_str, header.compression)
-           .ok()) {
-    compression_str = "Unrecongnized compression type (" +
-                      std::to_string((int)header.compression) + ")";
-  }
-  fprintf(stdout, "  Compression      : %s\n", compression_str.c_str());
+  fprintf(stdout, "  Compression      : kNoCompression\n");
   fprintf(stdout, "  Expiration range : %s\n",
           GetString(header.expiration_range).c_str());
   *offset = BlobLogHeader::kSize;
-  *compression = header.compression;
   return s;
 }
 
@@ -170,12 +152,9 @@ Status BlobDumpTool::DumpBlobLogFooter(uint64_t file_size,
 }
 
 Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob,
-                                DisplayType show_uncompressed_blob,
-                                bool show_summary, CompressionType compression,
                                 uint64_t* offset, uint64_t* total_records,
                                 uint64_t* total_key_size,
-                                uint64_t* total_blob_size,
-                                uint64_t* total_uncompressed_blob_size) {
+                                uint64_t* total_blob_size) {
   if (show_key != DisplayType::kNone) {
     fprintf(stdout, "Read record with offset 0x%" PRIx64 " (%" PRIu64 "):\n",
             *offset, *offset);
@@ -202,22 +181,6 @@ Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob,
   if (!s.ok()) {
     return s;
   }
-  // Decompress value
-  std::string uncompressed_value;
-  if (compression != kNoCompression &&
-      (show_uncompressed_blob != DisplayType::kNone || show_summary)) {
-    BlockContents contents;
-    UncompressionContext context(compression);
-    UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
-                           compression);
-    s = UncompressBlockData(
-        info, slice.data() + key_size, static_cast<size_t>(value_size),
-        &contents, 2 /*compress_format_version*/, ImmutableOptions(Options()));
-    if (!s.ok()) {
-      return s;
-    }
-    uncompressed_value = contents.data.ToString();
-  }
   if (show_key != DisplayType::kNone) {
     fprintf(stdout, "  key        : ");
     DumpSlice(Slice(slice.data(), static_cast<size_t>(key_size)), show_key);
@@ -227,16 +190,11 @@ Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob,
                       static_cast<size_t>(value_size)),
                 show_blob);
     }
-    if (show_uncompressed_blob != DisplayType::kNone) {
-      fprintf(stdout, "  raw blob   : ");
-      DumpSlice(Slice(uncompressed_value), show_uncompressed_blob);
-    }
   }
   *offset += key_size + value_size;
   *total_records += 1;
   *total_key_size += key_size;
   *total_blob_size += value_size;
-  *total_uncompressed_blob_size += uncompressed_value.size();
   return s;
 }
 
diff --git a/utilities/blob_db/blob_dump_tool.h b/utilities/blob_db/blob_dump_tool.h
index 9876245883ef..a538a38996d4 100644
--- a/utilities/blob_db/blob_dump_tool.h
+++ b/utilities/blob_db/blob_dump_tool.h
@@ -28,8 +28,7 @@ class BlobDumpTool {
   BlobDumpTool();
 
   Status Run(const std::string& filename, DisplayType show_key,
-             DisplayType show_blob, DisplayType show_uncompressed_blob,
-             bool show_summary);
+             DisplayType show_blob, bool show_summary);
 
  private:
   std::unique_ptr<RandomAccessFileReader> reader_;
@@ -37,14 +36,11 @@ class BlobDumpTool {
   size_t buffer_size_;
 
   Status Read(uint64_t offset, size_t size, Slice* result);
-  Status DumpBlobLogHeader(uint64_t* offset, CompressionType* compression);
+  Status DumpBlobLogHeader(uint64_t* offset);
   Status DumpBlobLogFooter(uint64_t file_size, uint64_t* footer_offset);
   Status DumpRecord(DisplayType show_key, DisplayType show_blob,
-                    DisplayType show_uncompressed_blob, bool show_summary,
-                    CompressionType compression, uint64_t* offset,
-                    uint64_t* total_records, uint64_t* total_key_size,
-                    uint64_t* total_blob_size,
-                    uint64_t* total_uncompressed_blob_size);
+                    uint64_t* offset, uint64_t* total_records,
+                    uint64_t* total_key_size, uint64_t* total_blob_size);
   void DumpSlice(const Slice s, DisplayType type);
 
   template <class T>
diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc
index 5a479dc8bd4b..38b65c297bd6 100644
--- a/utilities/blob_db/blob_file.cc
+++ b/utilities/blob_db/blob_file.cc
@@ -25,18 +25,16 @@ BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn,
     : parent_(p), path_to_dir_(bdir), file_number_(fn), info_log_(info_log) {}
 
 BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn,
-                   Logger* info_log, uint32_t column_family_id,
-                   CompressionType compression, bool has_ttl,
+                   Logger* info_log, uint32_t column_family_id, bool has_ttl,
                    const ExpirationRange& expiration_range)
     : parent_(p),
       path_to_dir_(bdir),
       file_number_(fn),
       info_log_(info_log),
       column_family_id_(column_family_id),
-      compression_(compression),
       has_ttl_(has_ttl),
       expiration_range_(expiration_range),
-      header_(column_family_id, compression, has_ttl, expiration_range),
+      header_(column_family_id, kNoCompression, has_ttl, expiration_range),
       header_valid_(true) {}
 
 BlobFile::~BlobFile() {
@@ -50,8 +48,6 @@ BlobFile::~BlobFile() {
   }
 }
 
-uint32_t BlobFile::GetColumnFamilyId() const { return column_family_id_; }
-
 std::string BlobFile::PathName() const {
   return BlobFileName(path_to_dir_, file_number_);
 }
@@ -259,7 +255,6 @@ Status BlobFile::ReadMetadata(const std::shared_ptr<FileSystem>& fs,
     return s;
   }
   column_family_id_ = header.column_family_id;
-  compression_ = header.compression;
   has_ttl_ = header.has_ttl;
   if (has_ttl_) {
     expiration_range_ = header.expiration_range;
diff --git a/utilities/blob_db/blob_file.h b/utilities/blob_db/blob_file.h
index f0ec83ebe8af..4110234d0a06 100644
--- a/utilities/blob_db/blob_file.h
+++ b/utilities/blob_db/blob_file.h
@@ -51,9 +51,6 @@ class BlobFile {
   // Column family id.
   uint32_t column_family_id_{std::numeric_limits<uint32_t>::max()};
 
-  // Compression type of blobs in the file
-  CompressionType compression_{kNoCompression};
-
   // If true, the keys in this file all has TTL. Otherwise all keys don't
   // have TTL.
   bool has_ttl_{false};
@@ -108,14 +105,11 @@ class BlobFile {
            Logger* info_log);
 
   BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum,
-           Logger* info_log, uint32_t column_family_id,
-           CompressionType compression, bool has_ttl,
+           Logger* info_log, uint32_t column_family_id, bool has_ttl,
            const ExpirationRange& expiration_range);
 
   ~BlobFile();
 
-  uint32_t GetColumnFamilyId() const;
-
   // Returns log file's absolute pathname.
   std::string PathName() const;
 
@@ -201,8 +195,6 @@ class BlobFile {
 
   void SetHasTTL(bool has_ttl) { has_ttl_ = has_ttl; }
 
-  CompressionType GetCompressionType() const { return compression_; }
-
   std::shared_ptr<BlobLogWriter> GetWriter() const { return log_writer_; }
 
   // Read blob file header and footer. Return corruption if file header is
diff --git a/utilities/cache_dump_load_impl.cc b/utilities/cache_dump_load_impl.cc
index 042ed534112c..40552ce12066 100644
--- a/utilities/cache_dump_load_impl.cc
+++ b/utilities/cache_dump_load_impl.cc
@@ -24,7 +24,7 @@ namespace ROCKSDB_NAMESPACE {
 // DBs and we may only want to dump out the blocks belonging to certain DB(s).
 // Therefore, a filter is need to decide if the key of the block satisfy the
 // requirement.
-Status CacheDumperImpl::SetDumpFilter(std::vector<DB*> db_list) {
+Status CacheDumperImpl::SetDumpFilter(const std::vector<DB*>& db_list) {
   Status s = Status::OK();
   dump_all_keys_ = false;
   for (size_t i = 0; i < db_list.size(); i++) {
diff --git a/utilities/cache_dump_load_impl.h b/utilities/cache_dump_load_impl.h
index ee892f47488e..b9b62df2a4b3 100644
--- a/utilities/cache_dump_load_impl.h
+++ b/utilities/cache_dump_load_impl.h
@@ -100,7 +100,7 @@ class CacheDumperImpl : public CacheDumper {
     dumped_size_bytes_ = 0;
   }
   ~CacheDumperImpl() { writer_.reset(); }
-  Status SetDumpFilter(std::vector<DB*> db_list) override;
+  Status SetDumpFilter(const std::vector<DB*>& db_list) override;
   IOStatus DumpCacheEntriesToWriter() override;
 
  private:
diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc
index 28fba3acb88a..f9d9aa4afbf8 100644
--- a/utilities/cassandra/cassandra_functional_test.cc
+++ b/utilities/cassandra/cassandra_functional_test.cc
@@ -25,7 +25,7 @@ const std::string kDbName = test::PerThreadDBPath("cassandra_functional_test");
 
 class CassandraStore {
  public:
-  explicit CassandraStore(std::shared_ptr<DB> db)
+  explicit CassandraStore(UnownedPtr<DB> db)
       : db_(db), write_option_(), get_option_() {
     assert(db);
   }
@@ -87,7 +87,7 @@ class CassandraStore {
   }
 
  private:
-  std::shared_ptr<DB> db_;
+  UnownedPtr<DB> db_;
   WriteOptions write_option_;
   ReadOptions get_option_;
 
@@ -122,8 +122,7 @@ class CassandraFunctionalTest : public testing::Test {
         DestroyDB(kDbName, Options()));  // Start each test with a fresh DB
   }
 
-  std::shared_ptr<DB> OpenDb() {
-    DB* db;
+  std::unique_ptr<DB> OpenDb() {
     Options options;
     options.create_if_missing = true;
     options.merge_operator.reset(
@@ -131,8 +130,9 @@ class CassandraFunctionalTest : public testing::Test {
     auto* cf_factory = new TestCompactionFilterFactory(
         purge_ttl_on_expiration_, gc_grace_period_in_seconds_);
     options.compaction_filter_factory.reset(cf_factory);
+    std::unique_ptr<DB> db;
     EXPECT_OK(DB::Open(options, kDbName, &db));
-    return std::shared_ptr<DB>(db);
+    return db;
   }
 
   bool purge_ttl_on_expiration_ = false;
@@ -142,7 +142,8 @@ class CassandraFunctionalTest : public testing::Test {
 // THE TEST CASES BEGIN HERE
 
 TEST_F(CassandraFunctionalTest, SimpleMergeTest) {
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Append(
@@ -190,7 +191,8 @@ constexpr int64_t kTestTimeoutSecs = 600;
 
 TEST_F(CassandraFunctionalTest,
        CompactionShouldConvertExpiredColumnsToTombstone) {
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Append(
@@ -232,7 +234,8 @@ TEST_F(CassandraFunctionalTest,
 TEST_F(CassandraFunctionalTest,
        CompactionShouldPurgeExpiredColumnsIfPurgeTtlIsOn) {
   purge_ttl_on_expiration_ = true;
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Append(
@@ -271,7 +274,8 @@ TEST_F(CassandraFunctionalTest,
 TEST_F(CassandraFunctionalTest,
        CompactionShouldRemoveRowWhenAllColumnsExpiredIfPurgeTtlIsOn) {
   purge_ttl_on_expiration_ = true;
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Append("k1", CreateTestRowValue({
@@ -296,7 +300,8 @@ TEST_F(CassandraFunctionalTest,
 TEST_F(CassandraFunctionalTest,
        CompactionShouldRemoveTombstoneExceedingGCGracePeriod) {
   purge_ttl_on_expiration_ = true;
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Append("k1",
@@ -327,7 +332,8 @@ TEST_F(CassandraFunctionalTest,
 
 TEST_F(CassandraFunctionalTest, CompactionShouldRemoveTombstoneFromPut) {
   purge_ttl_on_expiration_ = true;
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Put("k1",
diff --git a/utilities/cassandra/test_utils.cc b/utilities/cassandra/test_utils.cc
index 3615813500a8..a596ea98869b 100644
--- a/utilities/cassandra/test_utils.cc
+++ b/utilities/cassandra/test_utils.cc
@@ -51,7 +51,7 @@ RowValue CreateRowTombstone(int64_t timestamp) {
 }
 
 void VerifyRowValueColumns(
-    const std::vector<std::shared_ptr<ColumnBase>> &columns,
+    const std::vector<std::shared_ptr<ColumnBase>>& columns,
     std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index,
     int64_t expected_timestamp) {
   EXPECT_EQ(expected_timestamp, columns[index_of_vector]->Timestamp());
diff --git a/utilities/cassandra/test_utils.h b/utilities/cassandra/test_utils.h
index be23f707606f..1fd789c27e8f 100644
--- a/utilities/cassandra/test_utils.h
+++ b/utilities/cassandra/test_utils.h
@@ -32,7 +32,7 @@ RowValue CreateTestRowValue(
 RowValue CreateRowTombstone(int64_t timestamp);
 
 void VerifyRowValueColumns(
-    const std::vector<std::shared_ptr<ColumnBase>> &columns,
+    const std::vector<std::shared_ptr<ColumnBase>>& columns,
     std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index,
     int64_t expected_timestamp);
 
diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc
index dcedfd2ddf65..c7ed298cf02f 100644
--- a/utilities/checkpoint/checkpoint_impl.cc
+++ b/utilities/checkpoint/checkpoint_impl.cc
@@ -340,6 +340,7 @@ Status CheckpointImpl::ExportColumnFamily(
   s = db_->GetEnv()->CreateDir(tmp_export_dir);
 
   if (s.ok()) {
+    // FIXME: should respect atomic_flush and flush all CFs if needed.
     s = db_->Flush(ROCKSDB_NAMESPACE::FlushOptions(), handle);
   }
 
diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc
index e71c795f654b..9e0729b64cd0 100644
--- a/utilities/checkpoint/checkpoint_test.cc
+++ b/utilities/checkpoint/checkpoint_test.cc
@@ -46,7 +46,7 @@ class CheckpointTest : public testing::Test {
   std::string dbname_;
   std::string alternative_wal_dir_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
   Options last_options_;
   std::vector<ColumnFamilyHandle*> handles_;
   std::string snapshot_name_;
@@ -65,7 +65,7 @@ class CheckpointTest : public testing::Test {
     EXPECT_OK(DestroyDB(dbname_, delete_options));
     // Destroy it for not alternative WAL dir is used.
     EXPECT_OK(DestroyDB(dbname_, options));
-    db_ = nullptr;
+    db_.reset();
     snapshot_name_ = test::PerThreadDBPath(env_, "snapshot");
     std::string snapshot_tmp_name = snapshot_name_ + ".tmp";
     EXPECT_OK(DestroyDB(snapshot_name_, options));
@@ -102,6 +102,8 @@ class CheckpointTest : public testing::Test {
     DestroyDir(env_, export_path_).PermitUncheckedError();
   }
 
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
+
   // Return the current option configuration.
   Options CurrentOptions() {
     Options options;
@@ -170,8 +172,7 @@ class CheckpointTest : public testing::Test {
       delete h;
     }
     handles_.clear();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
   }
 
   void DestroyAndReopen(const Options& options) {
@@ -268,14 +269,12 @@ class CheckpointTest : public testing::Test {
 TEST_F(CheckpointTest, GetSnapshotLink) {
   for (uint64_t log_size_for_flush : {0, 1000000}) {
     Options options;
-    DB* snapshotDB;
     ReadOptions roptions;
     std::string result;
     Checkpoint* checkpoint;
 
     options = CurrentOptions();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
 
     // Create a database
@@ -284,7 +283,7 @@ TEST_F(CheckpointTest, GetSnapshotLink) {
     std::string key = std::string("foo");
     ASSERT_OK(Put(key, "v1"));
     // Take a snapshot
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
     ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, log_size_for_flush));
     ASSERT_OK(Put(key, "v2"));
     ASSERT_EQ("v2", Get(key));
@@ -292,13 +291,12 @@ TEST_F(CheckpointTest, GetSnapshotLink) {
     ASSERT_EQ("v2", Get(key));
     // Open snapshot and verify contents while DB is running
     options.create_if_missing = false;
+    std::unique_ptr<DB> snapshotDB;
     ASSERT_OK(DB::Open(options, snapshot_name_, &snapshotDB));
     ASSERT_OK(snapshotDB->Get(roptions, key, &result));
     ASSERT_EQ("v1", result);
-    delete snapshotDB;
-    snapshotDB = nullptr;
-    delete db_;
-    db_ = nullptr;
+    snapshotDB.reset();
+    db_.reset();
 
     // Destroy original DB
     ASSERT_OK(DestroyDB(dbname_, options));
@@ -308,8 +306,7 @@ TEST_F(CheckpointTest, GetSnapshotLink) {
     dbname_ = snapshot_name_;
     ASSERT_OK(DB::Open(options, dbname_, &db_));
     ASSERT_EQ("v1", Get(key));
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
     delete checkpoint;
 
@@ -335,7 +332,7 @@ TEST_F(CheckpointTest, CheckpointWithBlob) {
 
   // Create a checkpoint
   Checkpoint* checkpoint = nullptr;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
 
   std::unique_ptr<Checkpoint> checkpoint_guard(checkpoint);
 
@@ -360,11 +357,9 @@ TEST_F(CheckpointTest, CheckpointWithBlob) {
 
   // Make sure the checkpoint can be opened and the blob value read
   options.create_if_missing = false;
-  DB* checkpoint_db = nullptr;
+  std::unique_ptr<DB> checkpoint_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &checkpoint_db));
 
-  std::unique_ptr<DB> checkpoint_db_guard(checkpoint_db);
-
   PinnableSlice value;
   ASSERT_OK(checkpoint_db->Get(
       ReadOptions(), checkpoint_db->DefaultColumnFamily(), key, &value));
@@ -393,7 +388,7 @@ TEST_F(CheckpointTest, ExportColumnFamilyWithLinks) {
     ASSERT_OK(Put(key, "v1"));
 
     Checkpoint* checkpoint;
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
 
     // Export the Tables and verify
     ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(),
@@ -427,7 +422,7 @@ TEST_F(CheckpointTest, ExportColumnFamilyWithLinks) {
     ASSERT_OK(db_->Put(WriteOptions(), cfh_reverse_comp_, key, "v1"));
 
     Checkpoint* checkpoint;
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
 
     // Export the Tables and verify
     ASSERT_OK(checkpoint->ExportColumnFamily(cfh_reverse_comp_, export_path_,
@@ -449,7 +444,7 @@ TEST_F(CheckpointTest, ExportColumnFamilyNegativeTest) {
   ASSERT_OK(Put(key, "v1"));
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
 
   // Export onto existing directory
   ASSERT_OK(env_->CreateDirIfMissing(export_path_));
@@ -482,7 +477,6 @@ TEST_F(CheckpointTest, CheckpointCF) {
   ASSERT_OK(Put(4, "four", "four"));
   ASSERT_OK(Put(5, "five", "five"));
 
-  DB* snapshotDB;
   ReadOptions roptions;
   std::string result;
   std::vector<ColumnFamilyHandle*> cphandles;
@@ -490,7 +484,7 @@ TEST_F(CheckpointTest, CheckpointCF) {
   // Take a snapshot
   ROCKSDB_NAMESPACE::port::Thread t([&]() {
     Checkpoint* checkpoint;
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
     ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
     delete checkpoint;
   });
@@ -519,6 +513,7 @@ TEST_F(CheckpointTest, CheckpointCF) {
   for (size_t i = 0; i < cfs.size(); ++i) {
     column_families.emplace_back(cfs[i], options);
   }
+  std::unique_ptr<DB> snapshotDB;
   ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles,
                      &snapshotDB));
   ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result));
@@ -530,8 +525,7 @@ TEST_F(CheckpointTest, CheckpointCF) {
     delete h;
   }
   cphandles.clear();
-  delete snapshotDB;
-  snapshotDB = nullptr;
+  snapshotDB.reset();
 }
 
 TEST_F(CheckpointTest, CheckpointCFNoFlush) {
@@ -545,7 +539,6 @@ TEST_F(CheckpointTest, CheckpointCFNoFlush) {
   ASSERT_OK(Flush());
   ASSERT_OK(Put(2, "two", "two"));
 
-  DB* snapshotDB;
   ReadOptions roptions;
   std::string result;
   std::vector<ColumnFamilyHandle*> cphandles;
@@ -558,7 +551,7 @@ TEST_F(CheckpointTest, CheckpointCFNoFlush) {
       });
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, 1000000));
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 
@@ -577,6 +570,7 @@ TEST_F(CheckpointTest, CheckpointCFNoFlush) {
   for (size_t i = 0; i < cfs.size(); ++i) {
     column_families.emplace_back(cfs[i], options);
   }
+  std::unique_ptr<DB> snapshotDB;
   ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles,
                      &snapshotDB));
   ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result));
@@ -589,13 +583,13 @@ TEST_F(CheckpointTest, CheckpointCFNoFlush) {
     delete h;
   }
   cphandles.clear();
-  delete snapshotDB;
-  snapshotDB = nullptr;
+  snapshotDB.reset();
 }
 
 TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) {
   Options options = CurrentOptions();
   options.max_manifest_file_size = 0;  // always rollover manifest for file add
+  options.max_manifest_space_amp_pct = 0;
   Reopen(options);
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
@@ -614,7 +608,7 @@ TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) {
 
   ROCKSDB_NAMESPACE::port::Thread t([&]() {
     Checkpoint* checkpoint;
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
     ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
     delete checkpoint;
   });
@@ -626,12 +620,10 @@ TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) {
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 
-  DB* snapshotDB;
   // Successful Open() implies that CURRENT pointed to the manifest in the
   // checkpoint.
+  std::unique_ptr<DB> snapshotDB;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshotDB));
-  delete snapshotDB;
-  snapshotDB = nullptr;
 }
 
 TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing2PC) {
@@ -751,7 +743,7 @@ TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing2PC) {
 TEST_F(CheckpointTest, CheckpointInvalidDirectoryName) {
   for (std::string checkpoint_dir : {"", "/", "////"}) {
     Checkpoint* checkpoint;
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
     ASSERT_TRUE(
         checkpoint->CreateCheckpoint(checkpoint_dir).IsInvalidArgument());
     delete checkpoint;
@@ -764,7 +756,7 @@ TEST_F(CheckpointTest, CheckpointWithParallelWrites) {
   ASSERT_OK(Put("key1", "val1"));
   port::Thread thread([this]() { ASSERT_OK(Put("key2", "val2")); });
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
   delete checkpoint;
   thread.join();
@@ -815,24 +807,24 @@ TEST_P(CheckpointTestWithWalParams, CheckpointWithUnsyncedDataDropped) {
     // * one active WAL, not synced
     // with a single thread, so that we have at least one that can be hard
     // linked, etc.
-    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->PauseBackgroundWork());
-    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+    ASSERT_OK(dbfull()->PauseBackgroundWork());
+    ASSERT_OK(dbfull()->TEST_SwitchMemtable());
     ASSERT_OK(db_->SyncWAL());
   }
   ASSERT_OK(Put("key2", "val2"));
   if (GetLogSizeForFlush() > 0) {
-    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+    ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   }
   ASSERT_OK(Put("key3", "val3"));
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, GetLogSizeForFlush()));
   delete checkpoint;
   ASSERT_OK(fault_fs->DropUnsyncedFileData());
   // make sure it's openable even though whatever data that wasn't synced got
   // dropped.
   options.env = env_;
-  DB* snapshot_db;
+  std::unique_ptr<DB> snapshot_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
   ReadOptions read_opts;
   std::string get_result;
@@ -842,57 +834,8 @@ TEST_P(CheckpointTestWithWalParams, CheckpointWithUnsyncedDataDropped) {
   ASSERT_EQ("val2", get_result);
   ASSERT_OK(snapshot_db->Get(read_opts, "key3", &get_result));
   ASSERT_EQ("val3", get_result);
-  delete snapshot_db;
-  delete db_;
-  db_ = nullptr;
-}
-
-TEST_F(CheckpointTest, CheckpointOptionsFileFailedToPersist) {
-  // Regression test for a bug where checkpoint failed on a DB where persisting
-  // OPTIONS file failed and the DB was opened with
-  // `fail_if_options_file_error == false`.
-  Options options = CurrentOptions();
-  options.fail_if_options_file_error = false;
-  auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
-
-  // Setup `FaultInjectionTestFS` and `SyncPoint` callbacks to fail one
-  // operation when inside the OPTIONS file persisting code.
-  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
-  fault_fs->SetThreadLocalErrorContext(
-      FaultInjectionIOType::kWrite, 7 /* seed*/, 1 /* one_in */,
-      false /* retryable */, false /* has_data_loss*/);
-  SyncPoint::GetInstance()->SetCallBack(
-      "PersistRocksDBOptions:start", [fault_fs](void* /* arg */) {
-        fault_fs->EnableThreadLocalErrorInjection(
-            FaultInjectionIOType::kMetadataWrite);
-      });
-  SyncPoint::GetInstance()->SetCallBack(
-      "FaultInjectionTestFS::InjectMetadataWriteError:Injected",
-      [fault_fs](void* /* arg */) {
-        fault_fs->DisableThreadLocalErrorInjection(
-            FaultInjectionIOType::kMetadataWrite);
-      });
-  options.env = fault_fs_env.get();
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  Reopen(options);
-  ASSERT_OK(Put("key1", "val1"));
-  Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
-  ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
-  delete checkpoint;
-
-  // Make sure it's usable.
-  options.env = env_;
-  DB* snapshot_db;
-  ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
-  ReadOptions read_opts;
-  std::string get_result;
-  ASSERT_OK(snapshot_db->Get(read_opts, "key1", &get_result));
-  ASSERT_EQ("val1", get_result);
-  delete snapshot_db;
-  delete db_;
-  db_ = nullptr;
+  snapshot_db.reset();
+  db_.reset();
 }
 
 TEST_F(CheckpointTest, CheckpointReadOnlyDB) {
@@ -902,18 +845,17 @@ TEST_F(CheckpointTest, CheckpointReadOnlyDB) {
   Options options = CurrentOptions();
   ASSERT_OK(ReadOnlyReopen(options));
   Checkpoint* checkpoint = nullptr;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
   delete checkpoint;
   checkpoint = nullptr;
   Close();
-  DB* snapshot_db = nullptr;
+  std::unique_ptr<DB> snapshot_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
   ReadOptions read_opts;
   std::string get_result;
   ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result));
   ASSERT_EQ("foo_value", get_result);
-  delete snapshot_db;
 }
 
 TEST_F(CheckpointTest, CheckpointWithLockWAL) {
@@ -923,7 +865,7 @@ TEST_F(CheckpointTest, CheckpointWithLockWAL) {
   ASSERT_OK(db_->LockWAL());
 
   Checkpoint* checkpoint = nullptr;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
   delete checkpoint;
   checkpoint = nullptr;
@@ -931,13 +873,12 @@ TEST_F(CheckpointTest, CheckpointWithLockWAL) {
   ASSERT_OK(db_->UnlockWAL());
   Close();
 
-  DB* snapshot_db = nullptr;
+  std::unique_ptr<DB> snapshot_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
   ReadOptions read_opts;
   std::string get_result;
   ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result));
   ASSERT_EQ("foo_value", get_result);
-  delete snapshot_db;
 }
 
 TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) {
@@ -952,7 +893,7 @@ TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) {
       {kDefaultColumnFamilyName, "pikachu", "eevee"}, options);
   ASSERT_OK(s);
   Checkpoint* checkpoint = nullptr;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
   delete checkpoint;
   checkpoint = nullptr;
@@ -962,7 +903,7 @@ TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) {
       {kDefaultColumnFamilyName, options},
       {"pikachu", options},
       {"eevee", options}};
-  DB* snapshot_db = nullptr;
+  std::unique_ptr<DB> snapshot_db;
   std::vector<ColumnFamilyHandle*> snapshot_handles;
   s = DB::Open(options, snapshot_name_, column_families, &snapshot_handles,
                &snapshot_db);
@@ -979,7 +920,6 @@ TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) {
     delete snapshot_h;
   }
   snapshot_handles.clear();
-  delete snapshot_db;
 }
 
 TEST_F(CheckpointTest, CheckpointWithDbPath) {
@@ -989,7 +929,7 @@ TEST_F(CheckpointTest, CheckpointWithDbPath) {
   ASSERT_OK(Put("key1", "val1"));
   ASSERT_OK(Flush());
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   // Currently not supported
   ASSERT_TRUE(checkpoint->CreateCheckpoint(snapshot_name_).IsNotSupported());
   delete checkpoint;
@@ -1011,7 +951,7 @@ TEST_F(CheckpointTest, CheckpointWithArchievedLog) {
   ASSERT_OK(Put("key2", std::string(1024, 'a')));
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   TEST_SYNC_POINT("CheckpointTest:CheckpointWithArchievedLog");
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, 1024 * 1024));
   // unflushed log size < 1024 * 1024 < total file size including archived log,
@@ -1020,7 +960,7 @@ TEST_F(CheckpointTest, CheckpointWithArchievedLog) {
   delete checkpoint;
   checkpoint = nullptr;
 
-  DB* snapshot_db;
+  std::unique_ptr<DB> snapshot_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
   ReadOptions read_opts;
   std::string get_result;
@@ -1029,7 +969,6 @@ TEST_F(CheckpointTest, CheckpointWithArchievedLog) {
   get_result.clear();
   ASSERT_OK(snapshot_db->Get(read_opts, "key2", &get_result));
   ASSERT_EQ(std::string(1024, 'a'), get_result);
-  delete snapshot_db;
 }
 
 class CheckpointDestroyTest : public CheckpointTest,
@@ -1060,7 +999,7 @@ TEST_P(CheckpointDestroyTest, DisableEnableSlowDeletion) {
   ASSERT_EQ(NumTableFilesAtLevel(1), 2);
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
 
   delete checkpoint;
@@ -1070,7 +1009,7 @@ TEST_P(CheckpointDestroyTest, DisableEnableSlowDeletion) {
   ASSERT_EQ(NumTableFilesAtLevel(0), 0);
   ASSERT_EQ(NumTableFilesAtLevel(1), 2);
 
-  DB* snapshot_db;
+  std::unique_ptr<DB> snapshot_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
   ReadOptions read_opts;
   std::string get_result;
@@ -1078,11 +1017,10 @@ TEST_P(CheckpointDestroyTest, DisableEnableSlowDeletion) {
   ASSERT_EQ("a", get_result);
   ASSERT_OK(snapshot_db->Get(read_opts, "bar", &get_result));
   ASSERT_EQ("val9", get_result);
-  delete snapshot_db;
+  snapshot_db.reset();
 
   // Make sure original obsolete files for hard linked files are all deleted.
-  DBImpl* db_impl = static_cast_with_check<DBImpl>(db_);
-  db_impl->TEST_DeleteObsoleteFiles();
+  dbfull()->TEST_DeleteObsoleteFiles();
   auto sfm = static_cast_with_check<SstFileManagerImpl>(
       options.sst_file_manager.get());
   ASSERT_NE(nullptr, sfm);
diff --git a/utilities/debug.cc b/utilities/debug.cc
index 89e1487faad4..4c35c0ed52c3 100644
--- a/utilities/debug.cc
+++ b/utilities/debug.cc
@@ -7,6 +7,7 @@
 
 #include "db/db_impl/db_impl.h"
 #include "rocksdb/utilities/options_type.h"
+#include "util/cast_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -53,7 +54,7 @@ std::string KeyVersion::GetTypeName() const {
   }
 }
 
-Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+Status GetAllKeyVersions(DB* db, OptSlice begin_key, OptSlice end_key,
                          size_t max_num_ikeys,
                          std::vector<KeyVersion>* key_versions) {
   if (nullptr == db) {
@@ -63,8 +64,8 @@ Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
                            max_num_ikeys, key_versions);
 }
 
-Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
-                         Slice end_key, size_t max_num_ikeys,
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, OptSlice begin_key,
+                         OptSlice end_key, size_t max_num_ikeys,
                          std::vector<KeyVersion>* key_versions) {
   if (nullptr == db) {
     return Status::InvalidArgument("db cannot be null.");
@@ -77,7 +78,7 @@ Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
   }
   key_versions->clear();
 
-  DBImpl* idb = static_cast<DBImpl*>(db->GetRootDB());
+  DBImpl* idb = static_cast_with_check<DBImpl>(db->GetRootDB());
   auto icmp = InternalKeyComparator(idb->GetOptions(cfh).comparator);
   ReadOptions read_options;
   Arena arena;
@@ -87,15 +88,10 @@ Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
   const Comparator* ucmp = icmp.user_comparator();
   size_t ts_sz = ucmp->timestamp_size();
 
-  Slice from_slice = begin_key;
-  bool has_begin = !begin_key.empty();
-  Slice end_slice = end_key;
-  bool has_end = !end_key.empty();
   std::string begin_key_buf, end_key_buf;
-  auto [from, end] = MaybeAddTimestampsToRange(
-      has_begin ? &from_slice : nullptr, has_end ? &end_slice : nullptr, ts_sz,
-      &begin_key_buf, &end_key_buf);
-  if (has_begin) {
+  auto [from, end] = MaybeAddTimestampsToRange(begin_key, end_key, ts_sz,
+                                               &begin_key_buf, &end_key_buf);
+  if (begin_key.has_value()) {
     assert(from.has_value());
     InternalKey ikey;
     ikey.SetMinPossibleForUserKey(from.value());
@@ -113,7 +109,7 @@ Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
       return pik_status;
     }
 
-    if (has_end && end.has_value() &&
+    if (end_key.has_value() && end.has_value() &&
         icmp.user_comparator()->Compare(ikey.user_key, end.value()) > 0) {
       break;
     }
diff --git a/utilities/env_mirror.cc b/utilities/env_mirror.cc
index 8e128c9d0617..1b559bceaf37 100644
--- a/utilities/env_mirror.cc
+++ b/utilities/env_mirror.cc
@@ -94,6 +94,16 @@ class RandomAccessFileMirror : public RandomAccessFile {
     // NOTE: not verified
     return a_->GetUniqueId(id, max_size);
   }
+
+  Status GetFileSize(uint64_t* file_size) override {
+    uint64_t asize = 0, bsize = 0;
+    Status as = a_->GetFileSize(&asize);
+    Status bs = b_->GetFileSize(&bsize);
+    assert(as == bs);
+    assert(asize == bsize);
+    *file_size = asize;
+    return as;
+  }
 };
 
 class WritableFileMirror : public WritableFile {
diff --git a/utilities/fault_injection_env.cc b/utilities/fault_injection_env.cc
index fb443cc87f30..6aedb87ab634 100644
--- a/utilities/fault_injection_env.cc
+++ b/utilities/fault_injection_env.cc
@@ -159,6 +159,11 @@ Status TestRandomAccessFile::MultiRead(ReadRequest* reqs, size_t num_reqs) {
   return target_->MultiRead(reqs, num_reqs);
 }
 
+Status TestRandomAccessFile::GetFileSize(uint64_t* file_size) {
+  assert(target_);
+  return target_->GetFileSize(file_size);
+}
+
 TestWritableFile::TestWritableFile(const std::string& fname,
                                    std::unique_ptr<WritableFile>&& f,
                                    FaultInjectionTestEnv* env)
diff --git a/utilities/fault_injection_env.h b/utilities/fault_injection_env.h
index 5612718c6c79..eaece031848d 100644
--- a/utilities/fault_injection_env.h
+++ b/utilities/fault_injection_env.h
@@ -59,6 +59,8 @@ class TestRandomAccessFile : public RandomAccessFile {
 
   Status MultiRead(ReadRequest* reqs, size_t num_reqs) override;
 
+  Status GetFileSize(uint64_t* file_size) override;
+
  private:
   std::unique_ptr<RandomAccessFile> target_;
   FaultInjectionTestEnv* env_;
diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc
index 82d3217258d0..e658f114f860 100644
--- a/utilities/fault_injection_fs.cc
+++ b/utilities/fault_injection_fs.cc
@@ -399,10 +399,10 @@ IOStatus TestFSWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
   return io_s;
 }
 
-TestFSRandomRWFile::TestFSRandomRWFile(const std::string& /*fname*/,
+TestFSRandomRWFile::TestFSRandomRWFile(const std::string& fname,
                                        std::unique_ptr<FSRandomRWFile>&& f,
                                        FaultInjectionTestFS* fs)
-    : target_(std::move(f)), file_opened_(true), fs_(fs) {
+    : fname_(fname), target_(std::move(f)), file_opened_(true), fs_(fs) {
   assert(target_ != nullptr);
 }
 
@@ -433,6 +433,7 @@ IOStatus TestFSRandomRWFile::Read(uint64_t offset, size_t n,
 
 IOStatus TestFSRandomRWFile::Close(const IOOptions& options,
                                    IODebugContext* dbg) {
+  fs_->RandomRWFileClosed(fname_);
   if (!fs_->IsFilesystemActive()) {
     return fs_->GetError();
   }
@@ -457,9 +458,9 @@ IOStatus TestFSRandomRWFile::Sync(const IOOptions& options,
 }
 
 TestFSRandomAccessFile::TestFSRandomAccessFile(
-    const std::string& /*fname*/, std::unique_ptr<FSRandomAccessFile>&& f,
+    const std::string& fname, std::unique_ptr<FSRandomAccessFile>&& f,
     FaultInjectionTestFS* fs)
-    : target_(std::move(f)), fs_(fs) {
+    : target_(std::move(f)), fs_(fs), is_sst_(EndsWith(fname, ".sst")) {
   assert(target_ != nullptr);
 }
 
@@ -562,6 +563,14 @@ size_t TestFSRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
   }
 }
 
+IOStatus TestFSRandomAccessFile::GetFileSize(uint64_t* file_size) {
+  if (is_sst_ && fs_->ShouldFailRandomAccessGetFileSizeSst()) {
+    return IOStatus::IOError("FSRandomAccessFile::GetFileSize failed");
+  } else {
+    return target_->GetFileSize(file_size);
+  }
+}
+
 namespace {
 // Modifies `result` to start at the beginning of `scratch` if not already,
 // copying data there if needed.
@@ -1056,6 +1065,9 @@ IOStatus FaultInjectionTestFS::GetFileSize(const std::string& f,
                                            const IOOptions& options,
                                            uint64_t* file_size,
                                            IODebugContext* dbg) {
+  if (EndsWith(f, ".sst") && ShouldFailFilesystemGetFileSizeSst()) {
+    return IOStatus::IOError("FileSystem::GetFileSize failed");
+  }
   if (!IsFilesystemActive()) {
     return GetError();
   }
@@ -1265,6 +1277,13 @@ IOStatus FaultInjectionTestFS::AbortIO(std::vector<void*>& io_handles) {
   return target()->AbortIO(io_handles);
 }
 
+void FaultInjectionTestFS::RandomRWFileClosed(const std::string& fname) {
+  MutexLock l(&mutex_);
+  if (open_managed_files_.find(fname) != open_managed_files_.end()) {
+    open_managed_files_.erase(fname);
+  }
+}
+
 void FaultInjectionTestFS::WritableFileClosed(const FSFileState& state) {
   MutexLock l(&mutex_);
   if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
@@ -1379,7 +1398,7 @@ IOStatus FaultInjectionTestFS::MaybeInjectThreadLocalReadError(
   ErrorContext* ctx =
       static_cast<ErrorContext*>(injected_thread_local_read_error_.Get());
   if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in ||
-      ShouldIOActivtiesExcludedFromFaultInjection(io_options.io_activity)) {
+      ShouldIOActivitiesExcludedFromFaultInjection(io_options.io_activity)) {
     return IOStatus::OK();
   }
 
@@ -1465,7 +1484,7 @@ IOStatus FaultInjectionTestFS::MaybeInjectThreadLocalError(
 
   ErrorContext* ctx = GetErrorContextFromFaultInjectionIOType(type);
   if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in ||
-      ShouldIOActivtiesExcludedFromFaultInjection(io_options.io_activity) ||
+      ShouldIOActivitiesExcludedFromFaultInjection(io_options.io_activity) ||
       (type == FaultInjectionIOType::kWrite &&
        ShouldExcludeFromWriteFaultInjection(file_name))) {
     return IOStatus::OK();
diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h
index 9ea2a3bb963f..b4cb122273d6 100644
--- a/utilities/fault_injection_fs.h
+++ b/utilities/fault_injection_fs.h
@@ -106,8 +106,8 @@ class TestFSWritableFile : public FSWritableFile {
   const bool unsync_data_loss_;
 };
 
-// A wrapper around WritableFileWriter* file
-// is written to or sync'ed.
+// A wrapper around FSRandomRWFile* file
+// is read from/write to or sync'ed.
 class TestFSRandomRWFile : public FSRandomRWFile {
  public:
   explicit TestFSRandomRWFile(const std::string& fname,
@@ -128,6 +128,9 @@ class TestFSRandomRWFile : public FSRandomRWFile {
   bool use_direct_io() const override { return target_->use_direct_io(); }
 
  private:
+  // keep a copy of file name, so we can untrack it in File system, when it is
+  // closed
+  std::string fname_;
   std::unique_ptr<FSRandomRWFile> target_;
   bool file_opened_;
   FaultInjectionTestFS* fs_;
@@ -155,9 +158,12 @@ class TestFSRandomAccessFile : public FSRandomAccessFile {
 
   size_t GetUniqueId(char* id, size_t max_size) const override;
 
+  IOStatus GetFileSize(uint64_t* file_size) override;
+
  private:
   std::unique_ptr<FSRandomAccessFile> target_;
   FaultInjectionTestFS* fs_;
+  const bool is_sst_;
 };
 
 class TestFSSequentialFile : public FSSequentialFileOwnerWrapper {
@@ -217,21 +223,31 @@ class FaultInjectionTestFS : public FileSystemWrapper {
         injected_thread_local_metadata_write_error_(
             DeleteThreadLocalErrorContext),
         ingest_data_corruption_before_write_(false),
-        checksum_handoff_func_type_(kCRC32c),
-        fail_get_file_unique_id_(false) {}
+        checksum_handoff_func_type_(kCRC32c) {}
   virtual ~FaultInjectionTestFS() override { fs_error_.PermitUncheckedError(); }
 
   static const char* kClassName() { return "FaultInjectionTestFS"; }
   const char* Name() const override { return kClassName(); }
 
-  static bool IsInjectedError(const Status& s) {
-    assert(!s.ok());
-    return std::strstr(s.getState(), kInjected.c_str());
+  static bool IsInjectedError(const Status& s,
+                              const std::string& specific_error_marker = "") {
+    if (s.ok()) {
+      return false;
+    }
+    const char* state = s.getState();
+    if (state == nullptr) {
+      return false;
+    }
+    bool is_injected_error = std::strstr(state, kInjected.c_str()) != nullptr;
+    bool is_specific_error =
+        specific_error_marker.empty() ||
+        std::strstr(state, specific_error_marker.c_str()) != nullptr;
+
+    return is_injected_error && is_specific_error;
   }
 
   static bool IsFailedToWriteToWALError(const Status& s) {
-    assert(!s.ok());
-    return std::strstr(s.getState(), kFailedToWriteToWAL.c_str());
+    return IsInjectedError(s, kFailedToWriteToWAL);
   }
 
   IOStatus NewDirectory(const std::string& name, const IOOptions& options,
@@ -338,6 +354,8 @@ class FaultInjectionTestFS : public FileSystemWrapper {
 
   void WritableFileAppended(const FSFileState& state);
 
+  void RandomRWFileClosed(const std::string& fname);
+
   IOStatus DropUnsyncedFileData();
 
   IOStatus DropRandomUnsyncedFileData(Random* rnd);
@@ -424,10 +442,11 @@ class FaultInjectionTestFS : public FileSystemWrapper {
     allow_link_open_file_ = allow_link_open_file;
   }
 
-  bool ShouldIOActivtiesExcludedFromFaultInjection(Env::IOActivity io_activty) {
+  bool ShouldIOActivitiesExcludedFromFaultInjection(
+      Env::IOActivity io_activity) {
     MutexLock l(&mutex_);
-    return io_activties_excluded_from_fault_injection.find(io_activty) !=
-           io_activties_excluded_from_fault_injection.end();
+    return io_activities_excluded_from_fault_injection.find(io_activity) !=
+           io_activities_excluded_from_fault_injection.end();
   }
 
   void AssertNoOpenFile() { assert(open_managed_files_.empty()); }
@@ -476,6 +495,26 @@ class FaultInjectionTestFS : public FileSystemWrapper {
     return fail_get_file_unique_id_;
   }
 
+  void SetFailRandomAccessGetFileSizeSst(bool flag) {
+    MutexLock l(&mutex_);
+    fail_random_access_get_file_size_sst_ = flag;
+  }
+
+  bool ShouldFailRandomAccessGetFileSizeSst() {
+    MutexLock l(&mutex_);
+    return fail_random_access_get_file_size_sst_;
+  }
+
+  void SetFailFilesystemGetFileSizeSst(bool flag) {
+    MutexLock l(&mutex_);
+    fail_fs_get_file_size_sst_ = flag;
+  }
+
+  bool ShouldFailFilesystemGetFileSizeSst() {
+    MutexLock l(&mutex_);
+    return fail_fs_get_file_size_sst_;
+  }
+
   // Specify what the operation, so we can inject the right type of error
   enum ErrorOperation : char {
     kRead = 0,
@@ -520,10 +559,10 @@ class FaultInjectionTestFS : public FileSystemWrapper {
     return count;
   }
 
-  void SetIOActivtiesExcludedFromFaultInjection(
-      const std::set<Env::IOActivity>& io_activties) {
+  void SetIOActivitiesExcludedFromFaultInjection(
+      const std::set<Env::IOActivity>& io_activities) {
     MutexLock l(&mutex_);
-    io_activties_excluded_from_fault_injection = io_activties;
+    io_activities_excluded_from_fault_injection = io_activities;
   }
 
   void SetFileTypesExcludedFromWriteFaultInjection(
@@ -627,14 +666,16 @@ class FaultInjectionTestFS : public FileSystemWrapper {
   };
 
   std::set<FileType> file_types_excluded_from_write_fault_injection_;
-  std::set<Env::IOActivity> io_activties_excluded_from_fault_injection;
+  std::set<Env::IOActivity> io_activities_excluded_from_fault_injection;
   ThreadLocalPtr injected_thread_local_read_error_;
   ThreadLocalPtr injected_thread_local_write_error_;
   ThreadLocalPtr injected_thread_local_metadata_read_error_;
   ThreadLocalPtr injected_thread_local_metadata_write_error_;
   bool ingest_data_corruption_before_write_;
   ChecksumType checksum_handoff_func_type_;
-  bool fail_get_file_unique_id_;
+  bool fail_get_file_unique_id_ = false;
+  bool fail_random_access_get_file_size_sst_ = false;
+  bool fail_fs_get_file_size_sst_ = false;
 
   // Inject an error. For a READ operation, a status of IOError(), a
   // corruption in the contents of scratch, or truncation of slice
diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc
index c9e782212984..a177e40c360a 100644
--- a/utilities/memory/memory_test.cc
+++ b/utilities/memory/memory_test.cc
@@ -24,7 +24,9 @@ class MemoryTest : public testing::Test {
 
   std::string GetDBName(int id) { return kDbDir + "db_" + std::to_string(id); }
 
-  void UpdateUsagesHistory(const std::vector<DB*>& dbs) {
+  using DBVec = std::vector<std::unique_ptr<DB>>;
+
+  void UpdateUsagesHistory(const DBVec& dbs) {
     std::map<MemoryUtil::UsageType, uint64_t> usage_by_type;
     ASSERT_OK(GetApproximateMemoryUsageByType(dbs, &usage_by_type));
     for (int i = 0; i < MemoryUtil::kNumUsageTypes; ++i) {
@@ -33,16 +35,17 @@ class MemoryTest : public testing::Test {
     }
   }
 
-  void GetCachePointers(const std::vector<DB*>& dbs,
+  void GetCachePointers(const DBVec& dbs,
                         std::unordered_set<const Cache*>* cache_set) {
     cache_set->clear();
 
-    for (auto* db : dbs) {
+    for (auto& db : dbs) {
       assert(db);
 
       // Cache from DBImpl
-      StackableDB* sdb = dynamic_cast<StackableDB*>(db);
-      DBImpl* db_impl = dynamic_cast<DBImpl*>(sdb ? sdb->GetBaseDB() : db);
+      StackableDB* sdb = dynamic_cast<StackableDB*>(db.get());
+      DBImpl* db_impl =
+          dynamic_cast<DBImpl*>(sdb ? sdb->GetBaseDB() : db.get());
       if (db_impl != nullptr) {
         cache_set->insert(db_impl->TEST_table_cache());
       }
@@ -58,7 +61,7 @@ class MemoryTest : public testing::Test {
   }
 
   Status GetApproximateMemoryUsageByType(
-      const std::vector<DB*>& dbs,
+      const DBVec& dbs,
       std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type) {
     std::unordered_set<const Cache*> cache_set;
     GetCachePointers(dbs, &cache_set);
@@ -73,7 +76,7 @@ class MemoryTest : public testing::Test {
 };
 
 TEST_F(MemoryTest, SharedBlockCacheTotal) {
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   std::vector<uint64_t> usage_by_type;
   const int kNumDBs = 10;
   const int kKeySize = 100;
@@ -88,9 +91,7 @@ TEST_F(MemoryTest, SharedBlockCacheTotal) {
   bbt_opts.block_cache = NewLRUCache(4096 * 1000 * 10);
   for (int i = 0; i < kNumDBs; ++i) {
     ASSERT_OK(DestroyDB(GetDBName(i), opt));
-    DB* db = nullptr;
-    ASSERT_OK(DB::Open(opt, GetDBName(i), &db));
-    dbs.push_back(db);
+    ASSERT_OK(DB::Open(opt, GetDBName(i), &dbs.emplace_back()));
   }
 
   std::vector<std::string> keys_by_db[kNumDBs];
@@ -119,13 +120,10 @@ TEST_F(MemoryTest, SharedBlockCacheTotal) {
     ASSERT_EQ(usage_history_[MemoryUtil::kTableReadersTotal][i],
               usage_history_[MemoryUtil::kTableReadersTotal][i - 1]);
   }
-  for (int i = 0; i < kNumDBs; ++i) {
-    delete dbs[i];
-  }
 }
 
 TEST_F(MemoryTest, MemTableAndTableReadersTotal) {
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   std::vector<uint64_t> usage_by_type;
   std::vector<std::vector<ColumnFamilyHandle*>> vec_handles;
   const int kNumDBs = 10;
@@ -150,10 +148,9 @@ TEST_F(MemoryTest, MemTableAndTableReadersTotal) {
   for (int i = 0; i < kNumDBs; ++i) {
     ASSERT_OK(DestroyDB(GetDBName(i), opt));
     std::vector<ColumnFamilyHandle*> handles;
-    dbs.emplace_back();
     vec_handles.emplace_back();
     ASSERT_OK(DB::Open(DBOptions(opt), GetDBName(i), cf_descs,
-                       &vec_handles.back(), &dbs.back()));
+                       &vec_handles.back(), &dbs.emplace_back()));
   }
 
   // Fill one memtable per Put to make memtable use more memory.
@@ -237,7 +234,6 @@ TEST_F(MemoryTest, MemTableAndTableReadersTotal) {
     for (auto* handle : vec_handles[i]) {
       delete handle;
     }
-    delete dbs[i];
   }
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/memory/memory_util.cc b/utilities/memory/memory_util.cc
index c7bf30bfb716..c252f46c4eb7 100644
--- a/utilities/memory/memory_util.cc
+++ b/utilities/memory/memory_util.cc
@@ -9,14 +9,15 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+template <typename DBPtr>
 Status MemoryUtil::GetApproximateMemoryUsageByType(
-    const std::vector<DB*>& dbs,
+    const std::vector<DBPtr>& dbs,
     const std::unordered_set<const Cache*> cache_set,
     std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type) {
   usage_by_type->clear();
 
   // MemTable
-  for (auto* db : dbs) {
+  for (auto& db : dbs) {
     uint64_t usage = 0;
     if (db->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables,
                                      &usage)) {
@@ -29,7 +30,7 @@ Status MemoryUtil::GetApproximateMemoryUsageByType(
   }
 
   // Table Readers
-  for (auto* db : dbs) {
+  for (auto& db : dbs) {
     uint64_t usage = 0;
     if (db->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem,
                                      &usage)) {
@@ -46,4 +47,16 @@ Status MemoryUtil::GetApproximateMemoryUsageByType(
 
   return Status::OK();
 }
+
+template Status MemoryUtil::GetApproximateMemoryUsageByType<DB*>(
+    const std::vector<DB*>& dbs,
+    const std::unordered_set<const Cache*> cache_set,
+    std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type);
+
+template Status
+MemoryUtil::GetApproximateMemoryUsageByType<std::unique_ptr<DB>>(
+    const std::vector<std::unique_ptr<DB>>& dbs,
+    const std::unordered_set<const Cache*> cache_set,
+    std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type);
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc
index acc71c8e49c1..2b52b8d901f1 100644
--- a/utilities/merge_operators/string_append/stringappend_test.cc
+++ b/utilities/merge_operators/string_append/stringappend_test.cc
@@ -23,6 +23,7 @@
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/utilities/db_ttl.h"
 #include "test_util/testharness.h"
+#include "util/cast_util.h"
 #include "util/random.h"
 #include "utilities/merge_operators.h"
 #include "utilities/merge_operators/string_append/stringappend2.h"
@@ -34,8 +35,7 @@ const std::string kDbName = test::PerThreadDBPath("stringappend_test");
 
 namespace {
 // OpenDb opens a (possibly new) rocksdb database with a StringAppendOperator
-std::shared_ptr<DB> OpenNormalDb(const std::string& delim) {
-  DB* db;
+std::unique_ptr<DB> OpenNormalDb(const std::string& delim) {
   Options options;
   options.create_if_missing = true;
   MergeOperator* mergeOperator;
@@ -45,12 +45,13 @@ std::shared_ptr<DB> OpenNormalDb(const std::string& delim) {
     mergeOperator = new StringAppendOperator(delim);
   }
   options.merge_operator.reset(mergeOperator);
+  std::unique_ptr<DB> db;
   EXPECT_OK(DB::Open(options, kDbName, &db));
-  return std::shared_ptr<DB>(db);
+  return db;
 }
 
 // Open a TtlDB with a non-associative StringAppendTESTOperator
-std::shared_ptr<DB> OpenTtlDb(const std::string& delim) {
+std::unique_ptr<DB> OpenTtlDb(const std::string& delim) {
   DBWithTTL* db;
   Options options;
   options.create_if_missing = true;
@@ -62,7 +63,7 @@ std::shared_ptr<DB> OpenTtlDb(const std::string& delim) {
   }
   options.merge_operator.reset(mergeOperator);
   EXPECT_OK(DBWithTTL::Open(options, kDbName, &db, 123456));
-  return std::shared_ptr<DB>(db);
+  return std::unique_ptr<DB>(db);
 }
 }  // namespace
 
@@ -72,8 +73,7 @@ class StringLists {
  public:
   // Constructor: specifies the rocksdb db
   /* implicit */
-  StringLists(std::shared_ptr<DB> db)
-      : db_(db), merge_option_(), get_option_() {
+  StringLists(UnownedPtr<DB> db) : db_(db), merge_option_(), get_option_() {
     assert(db);
   }
 
@@ -113,7 +113,7 @@ class StringLists {
   }
 
  private:
-  std::shared_ptr<DB> db_;
+  UnownedPtr<DB> db_;
   WriteOptions merge_option_;
   ReadOptions get_option_;
 };
@@ -138,7 +138,7 @@ class StringAppendOperatorTest : public testing::Test,
     StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb);
   }
 
-  using OpenFuncPtr = std::shared_ptr<DB> (*)(const std::string&);
+  using OpenFuncPtr = std::unique_ptr<DB> (*)(const std::string&);
 
   // Allows user to open databases with different configurations.
   // e.g.: Can open a DB or a TtlDB, etc.
@@ -154,7 +154,7 @@ StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb =
 
 TEST_P(StringAppendOperatorTest, IteratorTest) {
   auto db_ = OpenDb(",");
-  StringLists slists(db_);
+  StringLists slists(db_.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -249,7 +249,7 @@ TEST_P(StringAppendOperatorTest, IteratorTest) {
 
 TEST_P(StringAppendOperatorTest, SimpleTest) {
   auto db = OpenDb(",");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -262,7 +262,7 @@ TEST_P(StringAppendOperatorTest, SimpleTest) {
 
 TEST_P(StringAppendOperatorTest, SimpleDelimiterTest) {
   auto db = OpenDb("|");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -275,7 +275,7 @@ TEST_P(StringAppendOperatorTest, SimpleDelimiterTest) {
 
 TEST_P(StringAppendOperatorTest, EmptyDelimiterTest) {
   auto db = OpenDb("");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -288,7 +288,7 @@ TEST_P(StringAppendOperatorTest, EmptyDelimiterTest) {
 
 TEST_P(StringAppendOperatorTest, MultiCharDelimiterTest) {
   auto db = OpenDb("<>");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -302,7 +302,7 @@ TEST_P(StringAppendOperatorTest, MultiCharDelimiterTest) {
 TEST_P(StringAppendOperatorTest, DelimiterIsDefensivelyCopiedTest) {
   std::string delimiter = "<>";
   auto db = OpenDb(delimiter);
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -316,7 +316,7 @@ TEST_P(StringAppendOperatorTest, DelimiterIsDefensivelyCopiedTest) {
 
 TEST_P(StringAppendOperatorTest, OneValueNoDelimiterTest) {
   auto db = OpenDb("!");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("random_key", "single_val");
 
@@ -327,7 +327,7 @@ TEST_P(StringAppendOperatorTest, OneValueNoDelimiterTest) {
 
 TEST_P(StringAppendOperatorTest, VariousKeys) {
   auto db = OpenDb("\n");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("c", "asdasd");
   slists.Append("a", "x");
@@ -353,7 +353,7 @@ TEST_P(StringAppendOperatorTest, VariousKeys) {
 // Generate semi random keys/words from a small distribution.
 TEST_P(StringAppendOperatorTest, RandomMixGetAppend) {
   auto db = OpenDb(" ");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   // Generate a list of random keys and values
   const int kWordCount = 15;
@@ -402,7 +402,7 @@ TEST_P(StringAppendOperatorTest, RandomMixGetAppend) {
 
 TEST_P(StringAppendOperatorTest, BIGRandomMixGetAppend) {
   auto db = OpenDb(" ");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   // Generate a list of random keys and values
   const int kWordCount = 15;
@@ -453,7 +453,7 @@ TEST_P(StringAppendOperatorTest, PersistentVariousKeys) {
   // Perform the following operations in limited scope
   {
     auto db = OpenDb("\n");
-    StringLists slists(db);
+    StringLists slists(db.get());
 
     slists.Append("c", "asdasd");
     slists.Append("a", "x");
@@ -476,7 +476,7 @@ TEST_P(StringAppendOperatorTest, PersistentVariousKeys) {
   // Reopen the database (the previous changes should persist / be remembered)
   {
     auto db = OpenDb("\n");
-    StringLists slists(db);
+    StringLists slists(db.get());
 
     slists.Append("c", "bbnagnagsx");
     slists.Append("a", "sa");
@@ -502,7 +502,7 @@ TEST_P(StringAppendOperatorTest, PersistentVariousKeys) {
   // Reopen the database (the previous changes should persist / be remembered)
   {
     auto db = OpenDb("\n");
-    StringLists slists(db);
+    StringLists slists(db.get());
 
     // All changes should be on disk. This will test VersionSet Get()
     std::string a, b, c;
@@ -520,7 +520,7 @@ TEST_P(StringAppendOperatorTest, PersistentFlushAndCompaction) {
   // Perform the following operations in limited scope
   {
     auto db = OpenDb("\n");
-    StringLists slists(db);
+    StringLists slists(db.get());
     std::string a, b, c;
 
     // Append, Flush, Get
@@ -559,7 +559,7 @@ TEST_P(StringAppendOperatorTest, PersistentFlushAndCompaction) {
   // Reopen the database (the previous changes should persist / be remembered)
   {
     auto db = OpenDb("\n");
-    StringLists slists(db);
+    StringLists slists(db.get());
     std::string a, b, c;
 
     // Get (Quick check for persistence of previous database)
@@ -607,7 +607,7 @@ TEST_P(StringAppendOperatorTest, PersistentFlushAndCompaction) {
 
 TEST_P(StringAppendOperatorTest, SimpleTestNullDelimiter) {
   auto db = OpenDb(std::string(1, '\0'));
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
diff --git a/utilities/object_registry.cc b/utilities/object_registry.cc
index 105d52bf5af3..2b9e4d85aa29 100644
--- a/utilities/object_registry.cc
+++ b/utilities/object_registry.cc
@@ -15,7 +15,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 namespace {
-bool MatchesInteger(const std::string &target, size_t start, size_t pos) {
+bool MatchesInteger(const std::string& target, size_t start, size_t pos) {
   // If it is numeric, everything up to the match must be a number
   int digits = 0;
   if (target[start] == '-') {
@@ -31,7 +31,7 @@ bool MatchesInteger(const std::string &target, size_t start, size_t pos) {
   return (digits > 0);
 }
 
-bool MatchesDecimal(const std::string &target, size_t start, size_t pos) {
+bool MatchesDecimal(const std::string& target, size_t start, size_t pos) {
   int digits = 0;
   if (target[start] == '-') {
     start++;  // Allow negative numbers
@@ -54,8 +54,8 @@ bool MatchesDecimal(const std::string &target, size_t start, size_t pos) {
 }  // namespace
 
 size_t ObjectLibrary::PatternEntry::MatchSeparatorAt(
-    size_t start, Quantifier mode, const std::string &target, size_t tlen,
-    const std::string &separator) const {
+    size_t start, Quantifier mode, const std::string& target, size_t tlen,
+    const std::string& separator) const {
   size_t slen = separator.size();
   // See if there is enough space.  If so, find the separator
   if (tlen < start + slen) {
@@ -87,9 +87,9 @@ size_t ObjectLibrary::PatternEntry::MatchSeparatorAt(
   }
 }
 
-bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string &name,
+bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string& name,
                                                 size_t nlen,
-                                                const std::string &target,
+                                                const std::string& target,
                                                 size_t tlen) const {
   if (separators_.empty()) {
     assert(optional_);  // If there are no separators, it must be only a name
@@ -109,7 +109,7 @@ bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string &name,
     size_t start = nlen;
     auto mode = kMatchExact;
     for (size_t idx = 0; idx < separators_.size(); ++idx) {
-      const auto &separator = separators_[idx];
+      const auto& separator = separators_[idx];
       start = MatchSeparatorAt(start, mode, target, tlen, separator.first);
       if (start == std::string::npos) {
         return false;
@@ -132,12 +132,12 @@ bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string &name,
   return true;
 }
 
-bool ObjectLibrary::PatternEntry::Matches(const std::string &target) const {
+bool ObjectLibrary::PatternEntry::Matches(const std::string& target) const {
   auto tlen = target.size();
   if (MatchesTarget(name_, nlength_, target, tlen)) {
     return true;
   } else if (!names_.empty()) {
-    for (const auto &alt : names_) {
+    for (const auto& alt : names_) {
       if (MatchesTarget(alt, alt.size(), target, tlen)) {
         return true;
       }
@@ -146,17 +146,17 @@ bool ObjectLibrary::PatternEntry::Matches(const std::string &target) const {
   return false;
 }
 
-size_t ObjectLibrary::GetFactoryCount(size_t *types) const {
+size_t ObjectLibrary::GetFactoryCount(size_t* types) const {
   std::unique_lock<std::mutex> lock(mu_);
   *types = factories_.size();
   size_t factories = 0;
-  for (const auto &e : factories_) {
+  for (const auto& e : factories_) {
     factories += e.second.size();
   }
   return factories;
 }
 
-size_t ObjectLibrary::GetFactoryCount(const std::string &type) const {
+size_t ObjectLibrary::GetFactoryCount(const std::string& type) const {
   std::unique_lock<std::mutex> lock(mu_);
   auto iter = factories_.find(type);
   if (iter != factories_.end()) {
@@ -166,36 +166,36 @@ size_t ObjectLibrary::GetFactoryCount(const std::string &type) const {
   }
 }
 
-void ObjectLibrary::GetFactoryNames(const std::string &type,
-                                    std::vector<std::string> *names) const {
+void ObjectLibrary::GetFactoryNames(const std::string& type,
+                                    std::vector<std::string>* names) const {
   assert(names);
   std::unique_lock<std::mutex> lock(mu_);
   auto iter = factories_.find(type);
   if (iter != factories_.end()) {
-    for (const auto &f : iter->second) {
+    for (const auto& f : iter->second) {
       names->push_back(f->Name());
     }
   }
 }
 
 void ObjectLibrary::GetFactoryTypes(
-    std::unordered_set<std::string> *types) const {
+    std::unordered_set<std::string>* types) const {
   assert(types);
   std::unique_lock<std::mutex> lock(mu_);
-  for (const auto &iter : factories_) {
+  for (const auto& iter : factories_) {
     types->insert(iter.first);
   }
 }
 
-void ObjectLibrary::Dump(Logger *logger) const {
+void ObjectLibrary::Dump(Logger* logger) const {
   std::unique_lock<std::mutex> lock(mu_);
   if (logger != nullptr && !factories_.empty()) {
     ROCKS_LOG_HEADER(logger, "    Registered Library: %s\n", id_.c_str());
-    for (const auto &iter : factories_) {
+    for (const auto& iter : factories_) {
       ROCKS_LOG_HEADER(logger, "    Registered factories for type[%s] ",
                        iter.first.c_str());
       bool printed_one = false;
-      for (const auto &e : iter.second) {
+      for (const auto& e : iter.second) {
         ROCKS_LOG_HEADER(logger, "%c %s", (printed_one) ? ',' : ':', e->Name());
         printed_one = true;
       }
@@ -205,7 +205,7 @@ void ObjectLibrary::Dump(Logger *logger) const {
 
 // Returns the Default singleton instance of the ObjectLibrary
 // This instance will contain most of the "standard" registered objects
-std::shared_ptr<ObjectLibrary> &ObjectLibrary::Default() {
+std::shared_ptr<ObjectLibrary>& ObjectLibrary::Default() {
   // Use avoid destruction here so the default ObjectLibrary will not be
   // statically destroyed and long-lived.
   STATIC_AVOID_DESTRUCTION(std::shared_ptr<ObjectLibrary>, instance)
@@ -213,9 +213,9 @@ std::shared_ptr<ObjectLibrary> &ObjectLibrary::Default() {
   return instance;
 }
 
-ObjectRegistry::ObjectRegistry(const std::shared_ptr<ObjectLibrary> &library) {
+ObjectRegistry::ObjectRegistry(const std::shared_ptr<ObjectLibrary>& library) {
   libraries_.push_back(library);
-  for (const auto &b : builtins_) {
+  for (const auto& b : builtins_) {
     RegisterPlugin(b.first, b.second);
   }
 }
@@ -233,13 +233,13 @@ std::shared_ptr<ObjectRegistry> ObjectRegistry::NewInstance() {
 }
 
 std::shared_ptr<ObjectRegistry> ObjectRegistry::NewInstance(
-    const std::shared_ptr<ObjectRegistry> &parent) {
+    const std::shared_ptr<ObjectRegistry>& parent) {
   return std::make_shared<ObjectRegistry>(parent);
 }
 
 Status ObjectRegistry::SetManagedObject(
-    const std::string &type, const std::string &id,
-    const std::shared_ptr<Customizable> &object) {
+    const std::string& type, const std::string& id,
+    const std::shared_ptr<Customizable>& object) {
   std::string object_key = ToManagedObjectKey(type, id);
   std::shared_ptr<Customizable> curr;
   if (parent_ != nullptr) {
@@ -267,7 +267,7 @@ Status ObjectRegistry::SetManagedObject(
 }
 
 std::shared_ptr<Customizable> ObjectRegistry::GetManagedObject(
-    const std::string &type, const std::string &id) const {
+    const std::string& type, const std::string& id) const {
   {
     std::unique_lock<std::mutex> lock(objects_mutex_);
     auto iter = managed_objects_.find(ToManagedObjectKey(type, id));
@@ -283,8 +283,8 @@ std::shared_ptr<Customizable> ObjectRegistry::GetManagedObject(
 }
 
 Status ObjectRegistry::ListManagedObjects(
-    const std::string &type, const std::string &name,
-    std::vector<std::shared_ptr<Customizable>> *results) const {
+    const std::string& type, const std::string& name,
+    std::vector<std::shared_ptr<Customizable>>* results) const {
   {
     std::string key = ToManagedObjectKey(type, name);
     std::unique_lock<std::mutex> lock(objects_mutex_);
@@ -309,50 +309,50 @@ Status ObjectRegistry::ListManagedObjects(
 // Returns the number of registered types for this registry.
 // If specified (not-null), types is updated to include the names of the
 // registered types.
-size_t ObjectRegistry::GetFactoryCount(const std::string &type) const {
+size_t ObjectRegistry::GetFactoryCount(const std::string& type) const {
   size_t count = 0;
   if (parent_ != nullptr) {
     count = parent_->GetFactoryCount(type);
   }
   std::unique_lock<std::mutex> lock(library_mutex_);
-  for (const auto &library : libraries_) {
+  for (const auto& library : libraries_) {
     count += library->GetFactoryCount(type);
   }
   return count;
 }
 
-void ObjectRegistry::GetFactoryNames(const std::string &type,
-                                     std::vector<std::string> *names) const {
+void ObjectRegistry::GetFactoryNames(const std::string& type,
+                                     std::vector<std::string>* names) const {
   assert(names);
   names->clear();
   if (parent_ != nullptr) {
     parent_->GetFactoryNames(type, names);
   }
   std::unique_lock<std::mutex> lock(library_mutex_);
-  for (const auto &library : libraries_) {
+  for (const auto& library : libraries_) {
     library->GetFactoryNames(type, names);
   }
 }
 
 void ObjectRegistry::GetFactoryTypes(
-    std::unordered_set<std::string> *types) const {
+    std::unordered_set<std::string>* types) const {
   assert(types);
   if (parent_ != nullptr) {
     parent_->GetFactoryTypes(types);
   }
   std::unique_lock<std::mutex> lock(library_mutex_);
-  for (const auto &library : libraries_) {
+  for (const auto& library : libraries_) {
     library->GetFactoryTypes(types);
   }
 }
 
-void ObjectRegistry::Dump(Logger *logger) const {
+void ObjectRegistry::Dump(Logger* logger) const {
   if (logger != nullptr) {
     std::unique_lock<std::mutex> lock(library_mutex_);
     if (!plugins_.empty()) {
       ROCKS_LOG_HEADER(logger, "    Registered Plugins:");
       bool printed_one = false;
-      for (const auto &plugin : plugins_) {
+      for (const auto& plugin : plugins_) {
         ROCKS_LOG_HEADER(logger, "%s%s", (printed_one) ? ", " : " ",
                          plugin.c_str());
         printed_one = true;
@@ -368,8 +368,8 @@ void ObjectRegistry::Dump(Logger *logger) const {
   }
 }
 
-int ObjectRegistry::RegisterPlugin(const std::string &name,
-                                   const RegistrarFunc &func) {
+int ObjectRegistry::RegisterPlugin(const std::string& name,
+                                   const RegistrarFunc& func) {
   if (!name.empty() && func != nullptr) {
     plugins_.push_back(name);
     return AddLibrary(name)->Register(func, name);
diff --git a/utilities/option_change_migration/option_change_migration.cc b/utilities/option_change_migration/option_change_migration.cc
index a08c5b59292c..4de1ff85107e 100644
--- a/utilities/option_change_migration/option_change_migration.cc
+++ b/utilities/option_change_migration/option_change_migration.cc
@@ -23,147 +23,336 @@ Options GetNoCompactionOptions(const Options& opts) {
   return ret_opts;
 }
 
-Status OpenDb(const Options& options, const std::string& dbname,
-              std::unique_ptr<DB>* db) {
-  db->reset();
-  DB* tmpdb;
-  Status s = DB::Open(options, dbname, &tmpdb);
-  if (s.ok()) {
-    db->reset(tmpdb);
-  }
-  return s;
-}
+// Compact a specific CF to a specific level
+//  cf_handle should not be null
+Status CompactToLevel(DB* db, ColumnFamilyHandle* cf_handle, int dest_level) {
+  assert(cf_handle != nullptr);
 
-// l0_file_size specifies size of file on L0. Files will be range partitioned
-// after a full compaction so they are likely qualified to put on L0. If
-// left as 0, the files are compacted in a single file and put to L0. Otherwise,
-// will try to compact the files as size l0_file_size.
-Status CompactToLevel(const Options& options, const std::string& dbname,
-                      int dest_level, uint64_t l0_file_size, bool need_reopen) {
-  std::unique_ptr<DB> db;
-  Options no_compact_opts = GetNoCompactionOptions(options);
-  if (dest_level == 0) {
-    if (l0_file_size == 0) {
-      // Single file.
-      l0_file_size = 999999999999999;
-    }
-    // L0 has strict sequenceID requirements to files to it. It's safer
-    // to only put one compacted file to there.
-    // This is only used for converting to universal compaction with
-    // only one level. In this case, compacting to one file is also
-    // optimal.
-    no_compact_opts.target_file_size_base = l0_file_size;
-    no_compact_opts.max_compaction_bytes = l0_file_size;
-  }
-  Status s = OpenDb(no_compact_opts, dbname, &db);
-  if (!s.ok()) {
-    return s;
-  }
   CompactRangeOptions cro;
   cro.change_level = true;
   cro.target_level = dest_level;
+
   if (dest_level == 0) {
     // cannot use kForceOptimized because the compaction is expected to
-    // generate one output file
+    // generate one output file so to force the full compaction to skip trivial
+    // move to L0
     cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   }
-  s = db->CompactRange(cro, nullptr, nullptr);
 
-  if (s.ok() && need_reopen) {
-    // Need to restart DB to rewrite the manifest file.
-    // In order to open a DB with specific num_levels, the manifest file should
-    // contain no record that mentiones any level beyond num_levels. Issuing a
-    // full compaction will move all the data to a level not exceeding
-    // num_levels, but the manifest may still contain previous record mentioning
-    // a higher level. Reopening the DB will force the manifest to be rewritten
-    // so that those records will be cleared.
-    db.reset();
-    s = OpenDb(no_compact_opts, dbname, &db);
-  }
-  return s;
+  return db->CompactRange(cro, cf_handle, nullptr, nullptr);
 }
 
-Status MigrateToUniversal(std::string dbname, const Options& old_opts,
-                          const Options& new_opts) {
-  if (old_opts.num_levels <= new_opts.num_levels ||
-      old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
-    return Status::OK();
-  } else {
-    bool need_compact = false;
-    {
-      std::unique_ptr<DB> db;
-      Options opts = GetNoCompactionOptions(old_opts);
-      Status s = OpenDb(opts, dbname, &db);
-      if (!s.ok()) {
-        return s;
-      }
-      ColumnFamilyMetaData metadata;
-      db->GetColumnFamilyMetaData(&metadata);
-      if (!metadata.levels.empty() &&
-          metadata.levels.back().level >= new_opts.num_levels) {
-        need_compact = true;
-      }
-    }
-    if (need_compact) {
-      return CompactToLevel(old_opts, dbname, new_opts.num_levels - 1,
-                            /*l0_file_size=*/0, true);
-    }
+Status MigrateToUniversal(DB* db, ColumnFamilyHandle* cf_handle,
+                          int old_num_levels, int new_num_levels) {
+  assert(cf_handle != nullptr);
+
+  if (old_num_levels <= new_num_levels) {
     return Status::OK();
   }
+
+  // Check if compaction is needed
+  ColumnFamilyMetaData metadata;
+  db->GetColumnFamilyMetaData(cf_handle, &metadata);
+
+  if (!metadata.levels.empty() &&
+      metadata.levels.back().level >= new_num_levels) {
+    // Need to compact to fit new num_levels
+    return CompactToLevel(db, cf_handle, new_num_levels - 1);
+  }
+
+  return Status::OK();
 }
 
-Status MigrateToLevelBase(std::string dbname, const Options& old_opts,
-                          const Options& new_opts) {
-  if (!new_opts.level_compaction_dynamic_level_bytes) {
-    if (old_opts.num_levels == 1) {
+Status MigrateToLevelBase(DB* db, ColumnFamilyHandle* cf_handle,
+                          int old_num_levels, int new_num_levels,
+                          bool dynamic_level_bytes) {
+  assert(cf_handle != nullptr);
+
+  if (!dynamic_level_bytes) {
+    // Non-dynamic level mode
+    if (old_num_levels == 1) {
       return Status::OK();
     }
-    // Compact everything to level 1 to guarantee it can be safely opened.
-    Options opts = old_opts;
-    opts.target_file_size_base = new_opts.target_file_size_base;
-    // Although sometimes we can open the DB with the new option without error,
-    // We still want to compact the files to avoid the LSM tree to stuck
-    // in bad shape. For example, if the user changed the level size
-    // multiplier from 4 to 8, with the same data, we will have fewer
-    // levels. Unless we issue a full comaction, the LSM tree may stuck
-    // with more levels than needed and it won't recover automatically.
-    return CompactToLevel(opts, dbname, 1, /*l0_file_size=*/0, true);
+    // Compact to L1
+    return CompactToLevel(db, cf_handle, 1);
+
   } else {
-    // Compact everything to the last level to guarantee it can be safely
-    // opened.
-    if (old_opts.num_levels == 1) {
+    // Dynamic level mode
+    if (old_num_levels == 1) {
       return Status::OK();
-    } else if (new_opts.num_levels > old_opts.num_levels) {
-      // Dynamic level mode requires data to be put in the last level first.
-      return CompactToLevel(new_opts, dbname, new_opts.num_levels - 1,
-                            /*l0_file_size=*/0, false);
-    } else {
-      Options opts = old_opts;
-      opts.target_file_size_base = new_opts.target_file_size_base;
-      return CompactToLevel(opts, dbname, new_opts.num_levels - 1,
-                            /*l0_file_size=*/0, true);
     }
+    // Compact to last level
+    return CompactToLevel(db, cf_handle, new_num_levels - 1);
   }
 }
-}  // namespace
 
-Status OptionChangeMigration(std::string dbname, const Options& old_opts,
-                             const Options& new_opts) {
+Status MigrateToFIFO(DB* db, ColumnFamilyHandle* cf_handle) {
+  assert(cf_handle != nullptr);
+  return CompactToLevel(db, cf_handle, 0);
+}
+
+Status MigrateSingleColumnFamily(DB* db, ColumnFamilyHandle* cf_handle,
+                                 const Options& old_opts,
+                                 const Options& new_opts) {
+  assert(cf_handle != nullptr);
+
   if (old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
-    // LSM generated by FIFO compaction can be opened by any compaction.
     return Status::OK();
-  } else if (new_opts.compaction_style ==
-             CompactionStyle::kCompactionStyleUniversal) {
-    return MigrateToUniversal(dbname, old_opts, new_opts);
+  }
+
+  if (new_opts.compaction_style == CompactionStyle::kCompactionStyleUniversal) {
+    return MigrateToUniversal(db, cf_handle, old_opts.num_levels,
+                              new_opts.num_levels);
   } else if (new_opts.compaction_style ==
              CompactionStyle::kCompactionStyleLevel) {
-    return MigrateToLevelBase(dbname, old_opts, new_opts);
+    return MigrateToLevelBase(db, cf_handle, old_opts.num_levels,
+                              new_opts.num_levels,
+                              new_opts.level_compaction_dynamic_level_bytes);
   } else if (new_opts.compaction_style ==
              CompactionStyle::kCompactionStyleFIFO) {
-    return CompactToLevel(old_opts, dbname, 0, 0 /* l0_file_size */, true);
+    return MigrateToFIFO(db, cf_handle);
+  }
+
+  return Status::NotSupported(
+      "Do not know how to migrate to this compaction style");
+}
+
+Status ValidateCFDescriptors(
+    const std::vector<ColumnFamilyDescriptor>& old_cf_descs,
+    const std::vector<ColumnFamilyDescriptor>& new_cf_descs) {
+  if (old_cf_descs.size() != new_cf_descs.size()) {
+    return Status::InvalidArgument(
+        "old_cf_descs and new_cf_descs must have the same number of column "
+        "families. Got " +
+        std::to_string(old_cf_descs.size()) + " old CFs and " +
+        std::to_string(new_cf_descs.size()) +
+        " new CFs. Adding or dropping CFs is not supported.");
+  }
+
+  for (size_t i = 0; i < old_cf_descs.size(); ++i) {
+    if (old_cf_descs[i].name != new_cf_descs[i].name) {
+      return Status::InvalidArgument(
+          "Column family mismatch at index " + std::to_string(i) + ": " +
+          "old has '" + old_cf_descs[i].name + "', " + "new has '" +
+          new_cf_descs[i].name + "'. CF names and order must match exactly.");
+    }
+  }
+
+  return Status::OK();
+}
+
+struct BaseOptionsResult {
+  ColumnFamilyOptions base_opts;
+  bool need_reopen = true;
+};
+
+BaseOptionsResult DetermineBaseOptions(const ColumnFamilyOptions& old_opts,
+                                       const ColumnFamilyOptions& new_opts) {
+  BaseOptionsResult result;
+
+  if (new_opts.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+    if (!new_opts.level_compaction_dynamic_level_bytes) {
+      result.base_opts = old_opts;
+      result.base_opts.target_file_size_base = new_opts.target_file_size_base;
+    } else {
+      if (new_opts.num_levels > old_opts.num_levels) {
+        result.base_opts = new_opts;
+        result.need_reopen = false;
+      } else {
+        result.base_opts = old_opts;
+        result.base_opts.target_file_size_base = new_opts.target_file_size_base;
+      }
+    }
   } else {
-    return Status::NotSupported(
-        "Do not how to migrate to this compaction style");
+    result.base_opts = old_opts;
+  }
+
+  return result;
+}
+
+void ApplySpecialSingleLevelSettings(const ColumnFamilyOptions& new_opts,
+                                     ColumnFamilyOptions* base_opts) {
+  if (((new_opts.compaction_style ==
+            CompactionStyle::kCompactionStyleUniversal ||
+        new_opts.compaction_style == CompactionStyle::kCompactionStyleLevel) &&
+       new_opts.num_levels == 1) ||
+      new_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    base_opts->target_file_size_base = 999999999999999;
+    base_opts->max_compaction_bytes = 999999999999999;
+  }
+}
+
+std::vector<ColumnFamilyDescriptor> PrepareNoCompactionCFDescriptors(
+    const DBOptions& old_db_opts,
+    const std::vector<ColumnFamilyDescriptor>& old_cf_descs,
+    const std::vector<ColumnFamilyDescriptor>& new_cf_descs,
+    bool* any_need_reopen) {
+  assert(old_cf_descs.size() == new_cf_descs.size());
+
+  std::vector<ColumnFamilyDescriptor> no_compact_cf_descs;
+  *any_need_reopen = false;
+
+  for (size_t i = 0; i < old_cf_descs.size(); ++i) {
+    const std::string& cf_name = old_cf_descs[i].name;
+    const ColumnFamilyOptions& old_opts = old_cf_descs[i].options;
+    const ColumnFamilyOptions& new_opts = new_cf_descs[i].options;
+
+    BaseOptionsResult result = DetermineBaseOptions(old_opts, new_opts);
+    ColumnFamilyOptions base_opts = result.base_opts;
+
+    if (result.need_reopen) {
+      *any_need_reopen = true;
+    }
+
+    ApplySpecialSingleLevelSettings(new_opts, &base_opts);
+
+    Options tmp_opts(old_db_opts, base_opts);
+    Options no_compact_opts = GetNoCompactionOptions(tmp_opts);
+
+    no_compact_cf_descs.emplace_back(cf_name,
+                                     ColumnFamilyOptions(no_compact_opts));
+  }
+
+  return no_compact_cf_descs;
+}
+
+Status OpenDBWithCFs(const DBOptions& db_opts, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& cf_descs,
+                     std::unique_ptr<DB>* db,
+                     std::vector<ColumnFamilyHandle*>* handles) {
+  handles->clear();
+  Status s = DB::Open(db_opts, dbname, cf_descs, handles, db);
+
+  if (!s.ok()) {
+    for (auto* handle : *handles) {
+      delete handle;
+    }
+    handles->clear();
+  }
+
+  return s;
+}
+
+Status CleanupCFHandles(DB* db, std::vector<ColumnFamilyHandle*>* handles) {
+  Status s;
+  for (auto* handle : *handles) {
+    if (handle != db->DefaultColumnFamily()) {
+      Status destroy_status = db->DestroyColumnFamilyHandle(handle);
+      if (!destroy_status.ok() && s.ok()) {
+        s = destroy_status;
+      }
+    }
+  }
+  handles->clear();
+  return s;
+}
+
+Status MigrateAllCFs(DB* db, const std::vector<ColumnFamilyHandle*>& handles,
+                     const DBOptions& old_db_opts, const DBOptions& new_db_opts,
+                     const std::vector<ColumnFamilyDescriptor>& old_cf_descs,
+                     const std::vector<ColumnFamilyDescriptor>& new_cf_descs) {
+  assert(handles.size() == old_cf_descs.size());
+  assert(old_cf_descs.size() == new_cf_descs.size());
+
+  for (size_t i = 0; i < handles.size(); ++i) {
+    const ColumnFamilyOptions& old_cf_opts = old_cf_descs[i].options;
+    const ColumnFamilyOptions& new_cf_opts = new_cf_descs[i].options;
+
+    Options old_opts(old_db_opts, old_cf_opts);
+    Options new_opts(new_db_opts, new_cf_opts);
+
+    Status s = MigrateSingleColumnFamily(db, handles[i], old_opts, new_opts);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+Status OptionChangeMigration(
+    const std::string& dbname, const DBOptions& old_db_opts,
+    const std::vector<ColumnFamilyDescriptor>& old_cf_descs,
+    const DBOptions& new_db_opts,
+    const std::vector<ColumnFamilyDescriptor>& new_cf_descs) {
+  // Step 1: Validate that old and new have same CFs in same order
+  Status s = ValidateCFDescriptors(old_cf_descs, new_cf_descs);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Step 2: Prepare no-compaction CF descriptors
+  bool any_need_reopen = false;
+  std::vector<ColumnFamilyDescriptor> no_compact_cf_descs =
+      PrepareNoCompactionCFDescriptors(old_db_opts, old_cf_descs, new_cf_descs,
+                                       &any_need_reopen);
+
+  // Step 3: Open DB with all CFs
+  std::unique_ptr<DB> db;
+  std::vector<ColumnFamilyHandle*> handles;
+  s = OpenDBWithCFs(old_db_opts, dbname, no_compact_cf_descs, &db, &handles);
+  if (!s.ok()) {
+    return s;
   }
+  assert(db != nullptr);
+
+  // Step 4: Migrate all CFs
+  s = MigrateAllCFs(db.get(), handles, old_db_opts, new_db_opts, old_cf_descs,
+                    new_cf_descs);
+
+  // Step 5: Cleanup CF handles
+  Status cleanup_status = CleanupCFHandles(db.get(), &handles);
+  if (s.ok() && !cleanup_status.ok()) {
+    s = cleanup_status;
+  }
+
+  // Step 6: Close and reopen DB if needed to rewrite manifest
+  if (s.ok() && any_need_reopen) {
+    Status close_status = db->Close();
+    if (!close_status.ok()) {
+      return close_status;
+    }
+    db.reset();
+
+    s = OpenDBWithCFs(old_db_opts, dbname, no_compact_cf_descs, &db, &handles);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // Cleanup CF handles before final close
+    cleanup_status = CleanupCFHandles(db.get(), &handles);
+    if (!cleanup_status.ok() && s.ok()) {
+      s = cleanup_status;
+    }
+  }
+
+  // Final step: Close DB (either after reopening or without reopening)
+  Status close_status = db->Close();
+  if (!close_status.ok() && s.ok()) {
+    s = close_status;
+  }
+
+  db.reset();
+
+  return s;
+}
+
+Status OptionChangeMigration(const std::string& dbname, const Options& old_opts,
+                             const Options& new_opts) {
+  DBOptions old_db_opts(old_opts);
+  DBOptions new_db_opts(new_opts);
+
+  ColumnFamilyOptions old_cf_opts(old_opts);
+  ColumnFamilyOptions new_cf_opts(new_opts);
+
+  std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+      {kDefaultColumnFamilyName, old_cf_opts}};
+
+  std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+      {kDefaultColumnFamilyName, new_cf_opts}};
+
+  return OptionChangeMigration(dbname, old_db_opts, old_cf_descs, new_db_opts,
+                               new_cf_descs);
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/option_change_migration/option_change_migration_test.cc b/utilities/option_change_migration/option_change_migration_test.cc
index 9984f0dd456e..4a78e9fe1111 100644
--- a/utilities/option_change_migration/option_change_migration_test.cc
+++ b/utilities/option_change_migration/option_change_migration_test.cc
@@ -556,6 +556,385 @@ TEST_F(DBOptionChangeMigrationTest, CompactedSrcToUniversal) {
   }
 }
 
+class DBOptionChangeMigrationMultiCFTest : public DBTestBase {
+ public:
+  DBOptionChangeMigrationMultiCFTest()
+      : DBTestBase("db_option_change_migration_multi_cf_test",
+                   /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBOptionChangeMigrationMultiCFTest, BasicMultiCF) {
+  Options options = CurrentOptions();
+  options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.num_levels = 4;
+  options.write_buffer_size = 64 * 1024;
+  options.target_file_size_base = 128 * 1024;
+
+  // Create DB with default CF
+  Reopen(options);
+
+  // Create additional CF
+  ColumnFamilyHandle* cf_handle;
+  ASSERT_OK(db_->CreateColumnFamily(options, "cf1", &cf_handle));
+
+  // Write data to both CFs
+  Random rnd(301);
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), rnd.RandomString(900)));
+    ASSERT_OK(db_->Put(WriteOptions(), cf_handle, "key" + std::to_string(i),
+                       rnd.RandomString(900)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Flush(FlushOptions(), cf_handle));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Collect keys from both CFs
+  std::set<std::string> keys_default;
+  std::set<std::string> keys_cf1;
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_default.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), cf_handle));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_cf1.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+
+  delete cf_handle;
+  Close();
+
+  // Prepare old and new options
+  DBOptions old_db_opts(options);
+  ColumnFamilyOptions old_cf_opts(options);
+
+  std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+      {kDefaultColumnFamilyName, old_cf_opts}, {"cf1", old_cf_opts}};
+
+  // New options: migrate to Universal compaction
+  Options new_options = options;
+  new_options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+  new_options.num_levels = 5;
+  new_options.target_file_size_base = 256 * 1024;
+
+  DBOptions new_db_opts(new_options);
+  ColumnFamilyOptions new_cf_opts(new_options);
+
+  std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+      {kDefaultColumnFamilyName, new_cf_opts}, {"cf1", new_cf_opts}};
+
+  // Perform multi-CF migration
+  ASSERT_OK(OptionChangeMigration(dbname_, old_db_opts, old_cf_descs,
+                                  new_db_opts, new_cf_descs));
+
+  // Reopen with new options
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(new_db_opts, dbname_, new_cf_descs, &handles, &db_));
+  ASSERT_EQ(handles.size(), 2);
+
+  // Verify data in default CF
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (const std::string& key : keys_default) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Verify data in cf1
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), handles[1]));
+    it->SeekToFirst();
+    for (const std::string& key : keys_cf1) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Cleanup
+  for (auto* handle : handles) {
+    if (handle != db_->DefaultColumnFamily()) {
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(handle));
+    }
+  }
+}
+
+TEST_F(DBOptionChangeMigrationMultiCFTest, DifferentStylesPerCF) {
+  // Create DB with 2 CFs, both using Level compaction
+  Options options1 = CurrentOptions();
+  options1.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  options1.num_levels = 4;
+  options1.write_buffer_size = 64 * 1024;
+
+  Reopen(options1);
+
+  ColumnFamilyHandle* cf_handle;
+  ASSERT_OK(db_->CreateColumnFamily(options1, "cf1", &cf_handle));
+
+  // Write data
+  Random rnd(301);
+  for (int i = 0; i < 50; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), rnd.RandomString(900)));
+    ASSERT_OK(db_->Put(WriteOptions(), cf_handle, "key" + std::to_string(i),
+                       rnd.RandomString(900)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Flush(FlushOptions(), cf_handle));
+
+  // Collect keys from both CFs
+  std::set<std::string> keys_default;
+  std::set<std::string> keys_cf1;
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_default.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), cf_handle));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_cf1.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+
+  delete cf_handle;
+  Close();
+
+  // Old descriptors
+  DBOptions old_db_opts(options1);
+  ColumnFamilyOptions old_cf_opts(options1);
+
+  std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+      {kDefaultColumnFamilyName, old_cf_opts}, {"cf1", old_cf_opts}};
+
+  // New descriptors: default CF to Universal, cf1 to Level with dynamic
+  Options new_opts_default = options1;
+  new_opts_default.compaction_style =
+      CompactionStyle::kCompactionStyleUniversal;
+  new_opts_default.num_levels = 5;
+
+  Options new_opts_cf1 = options1;
+  new_opts_cf1.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  new_opts_cf1.level_compaction_dynamic_level_bytes = true;
+  new_opts_cf1.num_levels = 5;
+
+  DBOptions new_db_opts(new_opts_default);
+
+  std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+      {kDefaultColumnFamilyName, ColumnFamilyOptions(new_opts_default)},
+      {"cf1", ColumnFamilyOptions(new_opts_cf1)}};
+
+  // Perform migration
+  ASSERT_OK(OptionChangeMigration(dbname_, old_db_opts, old_cf_descs,
+                                  new_db_opts, new_cf_descs));
+
+  // Reopen and verify
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(new_db_opts, dbname_, new_cf_descs, &handles, &db_));
+  ASSERT_EQ(handles.size(), 2);
+
+  // Verify data in default CF
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (const std::string& key : keys_default) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Verify data in cf1
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), handles[1]));
+    it->SeekToFirst();
+    for (const std::string& key : keys_cf1) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Cleanup
+  for (auto* handle : handles) {
+    if (handle != db_->DefaultColumnFamily()) {
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(handle));
+    }
+  }
+}
+
+TEST_F(DBOptionChangeMigrationMultiCFTest, ValidationMismatched) {
+  Options options = CurrentOptions();
+  DBOptions db_opts(options);
+  ColumnFamilyOptions cf_opts(options);
+
+  // Test 1: Mismatched CF count (missing cf1)
+  {
+    std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+        {kDefaultColumnFamilyName, cf_opts}, {"cf1", cf_opts}};
+
+    std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+        {kDefaultColumnFamilyName, cf_opts}};  // Missing cf1
+
+    Status s = OptionChangeMigration(dbname_, db_opts, old_cf_descs, db_opts,
+                                     new_cf_descs);
+    ASSERT_TRUE(s.IsInvalidArgument());
+    ASSERT_TRUE(s.ToString().find("same number") != std::string::npos);
+  }
+
+  // Test 2: Mismatched CF names (cf2 instead of cf1)
+  {
+    std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+        {kDefaultColumnFamilyName, cf_opts}, {"cf1", cf_opts}};
+
+    std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+        {kDefaultColumnFamilyName, cf_opts},
+        {"cf2", cf_opts}};  // Different name
+
+    Status s = OptionChangeMigration(dbname_, db_opts, old_cf_descs, db_opts,
+                                     new_cf_descs);
+    ASSERT_TRUE(s.IsInvalidArgument());
+    ASSERT_TRUE(s.ToString().find("mismatch") != std::string::npos);
+  }
+
+  // Test 3: Mismatched CF order (swapped)
+  {
+    std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+        {kDefaultColumnFamilyName, cf_opts}, {"cf1", cf_opts}};
+
+    std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+        {"cf1", cf_opts},  // Swapped order
+        {kDefaultColumnFamilyName, cf_opts}};
+
+    Status s = OptionChangeMigration(dbname_, db_opts, old_cf_descs, db_opts,
+                                     new_cf_descs);
+    ASSERT_TRUE(s.IsInvalidArgument());
+    ASSERT_TRUE(s.ToString().find("mismatch") != std::string::npos);
+  }
+}
+
+TEST_F(DBOptionChangeMigrationMultiCFTest, FromFIFOMultiCF) {
+  Options options = CurrentOptions();
+  options.compaction_style = CompactionStyle::kCompactionStyleFIFO;
+  options.num_levels = 1;
+  options.max_open_files = -1;
+
+  Reopen(options);
+
+  ColumnFamilyHandle* cf_handle;
+  ASSERT_OK(db_->CreateColumnFamily(options, "cf1", &cf_handle));
+
+  // Write some data
+  Random rnd(301);
+  for (int i = 0; i < 50; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), rnd.RandomString(900)));
+    ASSERT_OK(db_->Put(WriteOptions(), cf_handle, "key" + std::to_string(i),
+                       rnd.RandomString(900)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Flush(FlushOptions(), cf_handle));
+
+  // Collect keys from both CFs
+  std::set<std::string> keys_default;
+  std::set<std::string> keys_cf1;
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_default.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), cf_handle));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_cf1.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+
+  delete cf_handle;
+  Close();
+
+  // Migrate from FIFO to Level
+  DBOptions old_db_opts(options);
+  ColumnFamilyOptions old_cf_opts(options);
+
+  std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+      {kDefaultColumnFamilyName, old_cf_opts}, {"cf1", old_cf_opts}};
+
+  Options new_options = options;
+  new_options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  new_options.num_levels = 4;
+  new_options.max_open_files = 1000;
+
+  DBOptions new_db_opts(new_options);
+  ColumnFamilyOptions new_cf_opts(new_options);
+
+  std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+      {kDefaultColumnFamilyName, new_cf_opts}, {"cf1", new_cf_opts}};
+
+  // Migration should succeed (FIFO is special case)
+  ASSERT_OK(OptionChangeMigration(dbname_, old_db_opts, old_cf_descs,
+                                  new_db_opts, new_cf_descs));
+
+  // Reopen and verify
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(new_db_opts, dbname_, new_cf_descs, &handles, &db_));
+  ASSERT_EQ(handles.size(), 2);
+
+  // Verify data in default CF
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (const std::string& key : keys_default) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Verify data in cf1
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), handles[1]));
+    it->SeekToFirst();
+    for (const std::string& key : keys_cf1) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Cleanup
+  for (auto* handle : handles) {
+    if (handle != db_->DefaultColumnFamily()) {
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(handle));
+    }
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc
index 193142d67c20..0bfcf704f5f5 100644
--- a/utilities/options/options_util_test.cc
+++ b/utilities/options/options_util_test.cc
@@ -216,8 +216,6 @@ class DummySliceTransform : public SliceTransform {
   // determine whether this is a valid src upon the function applies
   bool InDomain(const Slice& /*src*/) const override { return false; }
 
-  // determine whether dst=Transform(src) for some src
-  bool InRange(const Slice& /*dst*/) const override { return false; }
 };
 
 }  // namespace
@@ -243,7 +241,7 @@ TEST_F(OptionsUtilTest, SanityCheck) {
   db_opt.create_if_missing = true;
 
   ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_descs[0].options)));
-  DB* db;
+  std::unique_ptr<DB> db;
   std::vector<ColumnFamilyHandle*> handles;
   // open and persist the options
   ASSERT_OK(DB::Open(db_opt, dbname_, cf_descs, &handles, &db));
@@ -252,7 +250,7 @@ TEST_F(OptionsUtilTest, SanityCheck) {
   for (auto* handle : handles) {
     delete handle;
   }
-  delete db;
+  db.reset();
 
   ConfigOptions config_options;
   config_options.ignore_unknown_options = false;
@@ -435,7 +433,7 @@ TEST_F(OptionsUtilTest, LoadLatestOptions) {
   DBOptions db_opts;
   std::vector<ColumnFamilyDescriptor> cf_descs;
   std::vector<ColumnFamilyHandle*> handles;
-  DB* db;
+  std::unique_ptr<DB> db;
   options.create_if_missing = true;
 
   ASSERT_OK(DestroyDB(dbname_, options));
@@ -495,7 +493,7 @@ TEST_F(OptionsUtilTest, LoadLatestOptions) {
   for (auto* handle : handles) {
     delete handle;
   }
-  delete db;
+  db.reset();
   ASSERT_OK(DestroyDB(dbname_, options, cf_descs));
 }
 
@@ -639,7 +637,7 @@ TEST_F(OptionsUtilTest, BadLatestOptions) {
 }
 
 TEST_F(OptionsUtilTest, RenameDatabaseDirectory) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   DBOptions db_opts;
   std::vector<ColumnFamilyDescriptor> cf_descs;
@@ -652,7 +650,7 @@ TEST_F(OptionsUtilTest, RenameDatabaseDirectory) {
 
   ASSERT_OK(DB::Open(options, dbname_, &db));
   ASSERT_OK(db->Put(WriteOptions(), "foo", "value0"));
-  delete db;
+  db.reset();
 
   auto new_dbname = dbname_ + "_2";
 
@@ -669,14 +667,14 @@ TEST_F(OptionsUtilTest, RenameDatabaseDirectory) {
   for (auto* handle : handles) {
     delete handle;
   }
-  delete db;
+  db.reset();
   Options new_options(db_opts, cf_descs[0].options);
   ASSERT_OK(DestroyDB(new_dbname, new_options, cf_descs));
   ASSERT_OK(DestroyDB(dbname_, options));
 }
 
 TEST_F(OptionsUtilTest, WalDirSettings) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   DBOptions db_opts;
   std::vector<ColumnFamilyDescriptor> cf_descs;
@@ -689,14 +687,14 @@ TEST_F(OptionsUtilTest, WalDirSettings) {
 
   // Open a DB with no wal dir set.  The wal_dir should stay empty
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, "");
 
   // Open a DB with wal_dir == dbname.  The wal_dir should be set to empty
   options.wal_dir = dbname_;
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, "");
 
@@ -705,7 +703,7 @@ TEST_F(OptionsUtilTest, WalDirSettings) {
   options.wal_dir = "";
   options.db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, "");
 
@@ -714,7 +712,7 @@ TEST_F(OptionsUtilTest, WalDirSettings) {
   options.wal_dir = dbname_ + "/";
   options.db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, "");
   ASSERT_OK(DestroyDB(dbname_, options));
@@ -725,7 +723,7 @@ TEST_F(OptionsUtilTest, WalDirSettings) {
   options.db_paths.emplace_back(dbname_ + "_0",
                                 std::numeric_limits<uint64_t>::max());
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, dbname_);
   ASSERT_OK(DestroyDB(dbname_, options));
@@ -734,14 +732,14 @@ TEST_F(OptionsUtilTest, WalDirSettings) {
   options.wal_dir = dbname_ + "/wal";
   options.db_paths.clear();
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, dbname_ + "/wal");
   ASSERT_OK(DestroyDB(dbname_, options));
 }
 
 TEST_F(OptionsUtilTest, WalDirInOptins) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   DBOptions db_opts;
   std::vector<ColumnFamilyDescriptor> cf_descs;
@@ -755,7 +753,7 @@ TEST_F(OptionsUtilTest, WalDirInOptins) {
   options.create_if_missing = true;
   options.wal_dir = "";
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   options.wal_dir = dbname_;
   std::string options_file;
   ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &options_file));
@@ -766,7 +764,7 @@ TEST_F(OptionsUtilTest, WalDirInOptins) {
   ASSERT_EQ(db_opts.wal_dir, dbname_);
   options.wal_dir = "";
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, "");
 }
diff --git a/utilities/persistent_cache/block_cache_tier_file.cc b/utilities/persistent_cache/block_cache_tier_file.cc
index 493b92236753..110e74c9e0de 100644
--- a/utilities/persistent_cache/block_cache_tier_file.cc
+++ b/utilities/persistent_cache/block_cache_tier_file.cc
@@ -254,7 +254,7 @@ bool RandomAccessCacheFile::ParseRec(const LBA& lba, Slice* key, Slice* val,
 
   CacheRecord rec;
   if (!rec.Deserialize(data)) {
-    assert(!"Error deserializing data");
+    assert(false && "Error deserializing data");
     Error(log_, "Error de-serializing record from file %s off %d",
           Path().c_str(), lba.off_);
     return false;
@@ -339,7 +339,7 @@ bool WriteableCacheFile::Append(const Slice& key, const Slice& val, LBA* lba) {
   CacheRecord rec(key, val);
   if (!rec.Serialize(&bufs_, &buf_woff_)) {
     // unexpected error: unable to serialize the data
-    assert(!"Error serializing record");
+    assert(false && "Error serializing record");
     return false;
   }
 
diff --git a/utilities/persistent_cache/block_cache_tier_file.h b/utilities/persistent_cache/block_cache_tier_file.h
index 7f329695f52c..82ee40d07369 100644
--- a/utilities/persistent_cache/block_cache_tier_file.h
+++ b/utilities/persistent_cache/block_cache_tier_file.h
@@ -101,14 +101,14 @@ class BlockCacheFile : public LRUElement<BlockCacheFile> {
   // append key/value to file and return LBA locator to user
   virtual bool Append(const Slice& /*key*/, const Slice& /*val*/,
                       LBA* const /*lba*/) {
-    assert(!"not implemented");
+    assert(false && "not implemented");
     return false;
   }
 
   // read from the record locator (LBA) and return key, value and status
   virtual bool Read(const LBA& /*lba*/, Slice* /*key*/, Slice* /*block*/,
                     char* /*scratch*/) {
-    assert(!"not implemented");
+    assert(false && "not implemented");
     return false;
   }
 
diff --git a/utilities/persistent_cache/hash_table_test.cc b/utilities/persistent_cache/hash_table_test.cc
index 7ae6a4a643dc..76c70813fb23 100644
--- a/utilities/persistent_cache/hash_table_test.cc
+++ b/utilities/persistent_cache/hash_table_test.cc
@@ -132,6 +132,7 @@ TEST_F(HashTableTest, TestErase) {
 }
 
 TEST_F(EvictableHashTableTest, TestEvict) {
+#ifndef __clang_analyzer__
   const uint64_t max_keys = 1024 * 1024;
 
   // insert
@@ -148,6 +149,7 @@ TEST_F(EvictableHashTableTest, TestEvict) {
     assert(val->val_ == std::string(1000, val->key_ % 255));
     delete val;
   }
+#endif  // __clang_analyzer__
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/persistent_cache/volatile_tier_impl.cc b/utilities/persistent_cache/volatile_tier_impl.cc
index eea119e6094b..44b6187d0417 100644
--- a/utilities/persistent_cache/volatile_tier_impl.cc
+++ b/utilities/persistent_cache/volatile_tier_impl.cc
@@ -106,7 +106,7 @@ Status VolatileCacheTier::Lookup(const Slice& page_key,
 }
 
 bool VolatileCacheTier::Erase(const Slice& /*key*/) {
-  assert(!"not supported");
+  assert(false && "not supported");
   return true;
 }
 
diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.cc b/utilities/table_properties_collectors/compact_on_deletion_collector.cc
index a175d0a016f2..348cd849a87d 100644
--- a/utilities/table_properties_collectors/compact_on_deletion_collector.cc
+++ b/utilities/table_properties_collectors/compact_on_deletion_collector.cc
@@ -17,16 +17,19 @@
 namespace ROCKSDB_NAMESPACE {
 
 CompactOnDeletionCollector::CompactOnDeletionCollector(
-    size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio)
+    size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio,
+    uint64_t min_file_size)
     : bucket_size_((sliding_window_size + kNumBuckets - 1) / kNumBuckets),
       current_bucket_(0),
       num_keys_in_current_bucket_(0),
       num_deletions_in_observation_window_(0),
       deletion_trigger_(deletion_trigger),
       deletion_ratio_(deletion_ratio),
+      min_file_size_(min_file_size),
+      cur_file_size_(0),
+      max_deletion_in_window_(0),
       deletion_ratio_enabled_(deletion_ratio > 0 && deletion_ratio <= 1),
-      need_compaction_(false),
-      finished_(false) {
+      need_compaction_(false) {
   memset(num_deletions_in_buckets_, 0, sizeof(size_t) * kNumBuckets);
 }
 
@@ -39,7 +42,7 @@ Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/,
                                               const Slice& /*value*/,
                                               EntryType type,
                                               SequenceNumber /*seq*/,
-                                              uint64_t /*file_size*/) {
+                                              uint64_t file_size) {
   assert(!finished_);
   if (!bucket_size_ && !deletion_ratio_enabled_) {
     // This collector is effectively disabled
@@ -51,11 +54,14 @@ Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/,
     return Status::OK();
   }
 
+  const bool is_delete = (type == kEntryDelete || type == kEntrySingleDelete ||
+                          type == kEntryDeleteWithTimestamp);
   if (deletion_ratio_enabled_) {
     total_entries_++;
-    if (type == kEntryDelete) {
+    if (is_delete) {
       deletion_entries_++;
     }
+    cur_file_size_ = file_size;
   }
 
   if (bucket_size_) {
@@ -76,13 +82,20 @@ Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/,
     }
 
     num_keys_in_current_bucket_++;
-    if (type == kEntryDelete) {
+    if (is_delete) {
       num_deletions_in_observation_window_++;
       num_deletions_in_buckets_[current_bucket_]++;
-      if (num_deletions_in_observation_window_ >= deletion_trigger_) {
-        need_compaction_ = true;
+      if (num_deletions_in_observation_window_ >= max_deletion_in_window_) {
+        max_deletion_in_window_ = num_deletions_in_observation_window_;
       }
     }
+
+    // The file may qualify for compaction based on file size constraints,
+    // even if max_deletion_in_window_ is not updated.
+    if (max_deletion_in_window_ >= deletion_trigger_ &&
+        file_size >= min_file_size_) {
+      need_compaction_ = true;
+    }
   }
 
   return Status::OK();
@@ -90,7 +103,8 @@ Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/,
 
 Status CompactOnDeletionCollector::Finish(
     UserCollectedProperties* /*properties*/) {
-  if (!need_compaction_ && deletion_ratio_enabled_ && total_entries_ > 0) {
+  if (!need_compaction_ && deletion_ratio_enabled_ && total_entries_ > 0 &&
+      cur_file_size_ >= min_file_size_) {
     double ratio = static_cast<double>(deletion_entries_) / total_entries_;
     need_compaction_ = ratio >= deletion_ratio_;
   }
@@ -153,23 +167,43 @@ static std::unordered_map<std::string, OptionTypeInfo>
             return Status::OK();
           },
           nullptr}},
+        {"min_file_size",
+         {0, OptionType::kUnknown, OptionVerificationType::kNormal,
+          OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable,
+          [](const ConfigOptions&, const std::string&, const std::string& value,
+             void* addr) {
+            auto* factory =
+                static_cast<CompactOnDeletionCollectorFactory*>(addr);
+            factory->SetMinFileSize(ParseUint64(value));
+            return Status::OK();
+          },
+          [](const ConfigOptions&, const std::string&, const void* addr,
+             std::string* value) {
+            const auto* factory =
+                static_cast<const CompactOnDeletionCollectorFactory*>(addr);
+            *value = std::to_string(factory->GetMinFileSize());
+            return Status::OK();
+          },
+          nullptr}},
 
 };
 
 CompactOnDeletionCollectorFactory::CompactOnDeletionCollectorFactory(
-    size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio)
+    size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio,
+    uint64_t min_file_size)
     : sliding_window_size_(sliding_window_size),
       deletion_trigger_(deletion_trigger),
-      deletion_ratio_(deletion_ratio) {
+      deletion_ratio_(deletion_ratio),
+      min_file_size_(min_file_size) {
   RegisterOptions("", this, &on_deletion_collector_type_info);
 }
 
 TablePropertiesCollector*
 CompactOnDeletionCollectorFactory::CreateTablePropertiesCollector(
     TablePropertiesCollectorFactory::Context /*context*/) {
-  return new CompactOnDeletionCollector(sliding_window_size_.load(),
-                                        deletion_trigger_.load(),
-                                        deletion_ratio_.load());
+  return new CompactOnDeletionCollector(
+      sliding_window_size_.load(), deletion_trigger_.load(),
+      deletion_ratio_.load(), min_file_size_.load());
 }
 
 std::string CompactOnDeletionCollectorFactory::ToString() const {
@@ -183,10 +217,12 @@ std::string CompactOnDeletionCollectorFactory::ToString() const {
 std::shared_ptr<CompactOnDeletionCollectorFactory>
 NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
                                      size_t deletion_trigger,
-                                     double deletion_ratio) {
+                                     double deletion_ratio,
+                                     uint64_t min_file_size) {
   return std::shared_ptr<CompactOnDeletionCollectorFactory>(
       new CompactOnDeletionCollectorFactory(sliding_window_size,
-                                            deletion_trigger, deletion_ratio));
+                                            deletion_trigger, deletion_ratio,
+                                            min_file_size));
 }
 
 namespace {
diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.h b/utilities/table_properties_collectors/compact_on_deletion_collector.h
index 1ccfa7becdf7..a800760dcb82 100644
--- a/utilities/table_properties_collectors/compact_on_deletion_collector.h
+++ b/utilities/table_properties_collectors/compact_on_deletion_collector.h
@@ -11,7 +11,8 @@ namespace ROCKSDB_NAMESPACE {
 class CompactOnDeletionCollector : public TablePropertiesCollector {
  public:
   CompactOnDeletionCollector(size_t sliding_window_size,
-                             size_t deletion_trigger, double deletion_raatio);
+                             size_t deletion_trigger, double deletion_ratio,
+                             uint64_t min_file_size);
 
   // AddUserKey() will be called when a new key/value pair is inserted into the
   // table.
@@ -36,7 +37,7 @@ class CompactOnDeletionCollector : public TablePropertiesCollector {
   // The name of the properties collector can be used for debugging purpose.
   const char* Name() const override { return "CompactOnDeletionCollector"; }
 
-  // EXPERIMENTAL Return whether the output file should be further compacted
+  // Return whether the output file should be further compacted
   bool NeedCompact() const override { return need_compaction_; }
 
   static const int kNumBuckets = 128;
@@ -48,18 +49,21 @@ class CompactOnDeletionCollector : public TablePropertiesCollector {
   // "bucket_size_" keys.
   size_t num_deletions_in_buckets_[kNumBuckets];
   // the number of keys in a bucket
-  size_t bucket_size_;
+  const size_t bucket_size_;
 
   size_t current_bucket_;
   size_t num_keys_in_current_bucket_;
   size_t num_deletions_in_observation_window_;
-  size_t deletion_trigger_;
+  const size_t deletion_trigger_;
   const double deletion_ratio_;
-  const bool deletion_ratio_enabled_;
   size_t total_entries_ = 0;
   size_t deletion_entries_ = 0;
+  const size_t min_file_size_;
+  size_t cur_file_size_;
+  size_t max_deletion_in_window_;
+  const bool deletion_ratio_enabled_;
   // true if the current SST file needs to be compacted.
   bool need_compaction_;
-  bool finished_;
+  bool finished_ = false;
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
index 9fec089fc13f..5fabb9856eba 100644
--- a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
+++ b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
@@ -232,6 +232,61 @@ TEST(CompactOnDeletionCollector, SlidingWindow) {
   }
 }
 
+TEST(CompactOnDeletionCollector, MinFileSize) {
+  TablePropertiesCollectorFactory::Context context;
+  context.column_family_id =
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+  context.last_level_inclusive_max_seqno_threshold = kMaxSequenceNumber;
+
+  const size_t kWindowSize = 1000;
+  const size_t kDeletionTrigger = 800;
+  const double kDeletionRatio = 0.9;
+  const uint64_t kMinFileSize = 1 << 20;
+
+  for (uint64_t file_size : {(uint64_t)0, kMinFileSize - 1, kMinFileSize}) {
+    {
+      auto factory = NewCompactOnDeletionCollectorFactory(
+          kWindowSize, kDeletionTrigger, 0, kMinFileSize);
+      std::unique_ptr<TablePropertiesCollector> collector(
+          factory->CreateTablePropertiesCollector(context));
+
+      // Add enough deletions to meet the sliding window triggers
+      for (size_t i = 0; i < kWindowSize; i++) {
+        if (i < kDeletionTrigger) {
+          ASSERT_OK(collector->AddUserKey("key", "value", kEntryDelete, 0,
+                                          file_size));
+        } else {
+          ASSERT_OK(
+              collector->AddUserKey("key", "value", kEntryPut, 0, file_size));
+        }
+      }
+      ASSERT_OK(collector->Finish(nullptr));
+      ASSERT_EQ(collector->NeedCompact(), file_size >= kMinFileSize);
+    }
+
+    {
+      auto factory = NewCompactOnDeletionCollectorFactory(
+          kWindowSize, kDeletionTrigger, kDeletionRatio, kMinFileSize);
+
+      std::unique_ptr<TablePropertiesCollector> collector(
+          factory->CreateTablePropertiesCollector(context));
+
+      const size_t kTotalEntries = 100;
+      // Add all deletions to maximize tombstone ratio
+      for (size_t i = 0; i < kTotalEntries - 1; i++) {
+        ASSERT_OK(
+            collector->AddUserKey("key", "value", kEntrySingleDelete, 0, 0));
+      }
+      // Give update file size
+      ASSERT_OK(collector->AddUserKey("key", "value", kEntrySingleDelete, 0,
+                                      file_size));
+
+      ASSERT_OK(collector->Finish(nullptr));
+      ASSERT_EQ(collector->NeedCompact(), file_size >= kMinFileSize);
+    }
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/utilities/transactions/lock/lock_manager.cc b/utilities/transactions/lock/lock_manager.cc
index 7bcbf6f9d804..f4828ec59069 100644
--- a/utilities/transactions/lock/lock_manager.cc
+++ b/utilities/transactions/lock/lock_manager.cc
@@ -17,8 +17,11 @@ std::shared_ptr<LockManager> NewLockManager(PessimisticTransactionDB* db,
     auto mgr = opt.lock_mgr_handle->getLockManager();
     return std::shared_ptr<LockManager>(opt.lock_mgr_handle, mgr);
   } else {
-    // Use a point lock manager by default
-    return std::shared_ptr<LockManager>(new PointLockManager(db, opt));
+    if (opt.use_per_key_point_lock_mgr) {
+      return std::make_shared<PerKeyPointLockManager>(db, opt);
+    } else {
+      return std::make_shared<PointLockManager>(db, opt);
+    }
   }
 }
 
diff --git a/utilities/transactions/lock/point/any_lock_manager_test.h b/utilities/transactions/lock/point/any_lock_manager_test.h
new file mode 100644
index 000000000000..9ea9114b9264
--- /dev/null
+++ b/utilities/transactions/lock/point/any_lock_manager_test.h
@@ -0,0 +1,244 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "utilities/transactions/lock/point/point_lock_manager_test.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using init_func_t = void (*)(PointLockManagerTest*);
+
+class AnyLockManagerTest : public PointLockManagerTest,
+                           public testing::WithParamInterface<init_func_t> {
+ public:
+  void SetUp() override {
+    // If a custom setup function was provided, use it. Otherwise, use what we
+    // have inherited.
+    auto init_func = GetParam();
+    if (init_func) {
+      (*init_func)(this);
+    } else {
+      PointLockManagerTest::SetUp();
+    }
+  }
+};
+
+TEST_P(AnyLockManagerTest, ReentrantExclusiveLock) {
+  // Tests that a txn can acquire exclusive lock on the same key repeatedly.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, ReentrantSharedLock) {
+  // Tests that a txn can acquire shared lock on the same key repeatedly.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+
+  // Cleanup
+  if (dynamic_cast<PointLockManager*>(locker_.get()) != nullptr &&
+      dynamic_cast<PerKeyPointLockManager*>(locker_.get()) == nullptr) {
+    // PointLockManager would create 2 entries in the lock manager, so it needs
+    // to unlock it twice.
+    locker_->UnLock(txn, 1, "k", env_);
+  }
+  locker_->UnLock(txn, 1, "k", env_);
+
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockUpgrade) {
+  // Tests that a txn can upgrade from a shared lock to an exclusive lock.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockDowngrade) {
+  // Tests that a txn can acquire a shared lock after acquiring an exclusive
+  // lock on the same key.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockConflict) {
+  // Tests that lock conflicts lead to lock timeout.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn1 = NewTxn();
+  auto txn2 = NewTxn();
+
+  {
+    // exclusive-exclusive conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+    auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  {
+    // exclusive-shared conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true));
+    auto s = locker_->TryLock(txn2, 1, "k2", env_, false);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  {
+    // shared-exclusive conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, false));
+    auto s = locker_->TryLock(txn2, 1, "k2", env_, true);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k1", env_);
+  locker_->UnLock(txn1, 1, "k2", env_);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(AnyLockManagerTest, SharedLocks) {
+  // Tests that shared locks can be concurrently held by multiple transactions.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn1 = NewTxn();
+  auto txn2 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k", env_);
+  locker_->UnLock(txn2, 1, "k", env_);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(AnyLockManagerTest, Deadlock) {
+  // Tests that deadlock can be detected.
+  // Deadlock scenario:
+  // txn1 exclusively locks k1, and wants to lock k2;
+  // txn2 exclusively locks k2, and wants to lock k1.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  // disable dead lock timeout, so that the dead lock detection behavior is
+  // consistent. This prevents the test to be flaky
+  txn_opt.deadlock_timeout_us = 0;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = 1000000;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true));
+
+  // txn1 tries to lock k2, will be blocked.
+  port::Thread t;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t, [&]() {
+    // block because txn2 is holding a lock on k2.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true));
+  });
+
+  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+  ASSERT_TRUE(s.IsBusy());
+  ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
+
+  std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
+  ASSERT_EQ(deadlock_paths.size(), 1u);
+  ASSERT_FALSE(deadlock_paths[0].limit_exceeded);
+
+  std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path;
+  ASSERT_EQ(deadlocks.size(), 2u);
+
+  ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID());
+  ASSERT_EQ(deadlocks[0].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[0].m_exclusive);
+  ASSERT_EQ(deadlocks[0].m_waiting_key, "k2");
+
+  ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID());
+  ASSERT_EQ(deadlocks[1].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[1].m_exclusive);
+  ASSERT_EQ(deadlocks[1].m_waiting_key, "k1");
+
+  locker_->UnLock(txn2, 1, "k2", env_);
+  t.join();
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k1", env_);
+  locker_->UnLock(txn1, 1, "k2", env_);
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(AnyLockManagerTest, GetWaitingTxns_MultipleTxns) {
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+
+  auto txn1 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+
+  auto txn2 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
+
+  auto txn3 = NewTxn();
+  txn3->SetLockTimeout(10000);
+  port::Thread t1;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t1, [&]() {
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k", env_, true));
+    locker_->UnLock(txn3, 1, "k", env_);
+  });
+
+  // Ok, now txn3 is waiting for lock on "k", which is owned by two
+  // transactions. Check that GetWaitingTxns reports this correctly
+  uint32_t wait_cf_id;
+  std::string wait_key;
+  auto waiters = txn3->GetWaitingTxns(&wait_cf_id, &wait_key);
+
+  ASSERT_EQ(wait_cf_id, 1u);
+  ASSERT_EQ(wait_key, "k");
+  ASSERT_EQ(waiters.size(), 2);
+  bool waits_correct =
+      (waiters[0] == txn1->GetID() && waiters[1] == txn2->GetID()) ||
+      (waiters[1] == txn1->GetID() && waiters[0] == txn2->GetID());
+  ASSERT_EQ(waits_correct, true);
+
+  // Release locks so txn3 can proceed with execution
+  locker_->UnLock(txn1, 1, "k", env_);
+  locker_->UnLock(txn2, 1, "k", env_);
+
+  // Wait until txn3 finishes
+  t1.join();
+
+  delete txn1;
+  delete txn2;
+  delete txn3;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/point/point_lock_bench.cc b/utilities/transactions/lock/point/point_lock_bench.cc
new file mode 100644
index 000000000000..2867738fdf1e
--- /dev/null
+++ b/utilities/transactions/lock/point/point_lock_bench.cc
@@ -0,0 +1,18 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+#include "rocksdb/point_lock_bench_tool.h"
+int main(int argc, char** argv) {
+  return ROCKSDB_NAMESPACE::point_lock_bench_tool(argc, argv);
+}
+#endif  // GFLAGS
diff --git a/utilities/transactions/lock/point/point_lock_bench_tool.cc b/utilities/transactions/lock/point/point_lock_bench_tool.cc
new file mode 100644
index 000000000000..b9d55c34deaa
--- /dev/null
+++ b/utilities/transactions/lock/point/point_lock_bench_tool.cc
@@ -0,0 +1,159 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifdef GFLAGS
+
+#include <cstdio>
+#include <iostream>
+#include <memory>
+
+#include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/gflags_compat.h"
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+#include "utilities/transactions/lock/point/point_lock_validation_test_runner.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+namespace ROCKSDB_NAMESPACE {
+
+DEFINE_string(db_dir, "/tmp/point_lock_manager_test",
+              "DB path for running the benchmark");
+DEFINE_uint32(stripe_count, 16, "Number of stripes in point lock manager");
+DEFINE_bool(is_per_key_point_lock_manager, false,
+            "Use PerKeyPointLockManager or PointLockManager");
+DEFINE_uint32(thread_count, 64,
+              "Number of threads to acquire release locks concurrently");
+DEFINE_uint32(key_count, 16, "Number of keys to acquire release locks upon");
+DEFINE_uint32(max_num_keys_to_lock_per_txn, 8,
+              "Max Number of keys to lock in a transaction");
+DEFINE_uint32(execution_time_sec, 10,
+              "Number of seconds to execute the benchmark");
+DEFINE_uint32(lock_type, 2,
+              "Lock type to test, 0: exclusive lock only; 1: shared lock only; "
+              "2: both shared and exclusive locks");
+DEFINE_int64(lock_timeout_ms, 1000,
+             "Lock acquisition request timeout in milliseconds.");
+DEFINE_int64(deadlock_timeout_us, 500,
+             "DeadLock detection timeout in microseconds.");
+DEFINE_int64(lock_expiration_ms, 100,
+             "Acquired Lock expiration time in milliseconds.");
+DEFINE_bool(allow_non_deadlock_error, true,
+            "Allow returned error code other than deadlock, such as timeout.");
+DEFINE_uint32(
+    max_sleep_after_lock_acquisition_ms, 5,
+    "Max number of milliseconds to sleep after acquiring all the locks in the "
+    "transaction. The actuall sleep time will be randomized from 0 to max. It "
+    "is used to simulate some useful work performed.");
+DEFINE_bool(check_thread_stuck, false,
+            "Check thread periodically to see whether they are stuck or not. "
+            "This is useful for detecting stuck transaction quickly. But it "
+            "could have false-positive when running with ASAN or running with "
+            "high thread count on a small number of CPUs");
+
+namespace {  // anonymous namespace
+
+class PointLockManagerBenchmark {
+ public:
+  PointLockManagerBenchmark() {
+    env_ = Env::Default();
+    env_->CreateDir(FLAGS_db_dir);
+
+    Options opt;
+    opt.create_if_missing = true;
+    txndb_opt_.num_stripes = FLAGS_stripe_count;
+
+    db_ = nullptr;
+
+    auto s = TransactionDB::Open(opt, txndb_opt_, FLAGS_db_dir, &db_);
+    ASSERT_OK(s);
+
+    if (FLAGS_is_per_key_point_lock_manager) {
+      locker_ = std::make_shared<PerKeyPointLockManager>(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_);
+    } else {
+      locker_ = std::make_shared<PointLockManager>(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_);
+    }
+
+    txn_opt_.deadlock_detect = true;
+    txn_opt_.lock_timeout = FLAGS_lock_timeout_ms;
+    txn_opt_.deadlock_timeout_us = FLAGS_deadlock_timeout_us;
+    txn_opt_.expiration = FLAGS_lock_expiration_ms;
+  }
+
+  // Disable copy and assignment
+  PointLockManagerBenchmark(const PointLockManagerBenchmark&) = delete;
+  PointLockManagerBenchmark& operator=(const PointLockManagerBenchmark&) =
+      delete;
+  PointLockManagerBenchmark(PointLockManagerBenchmark&&) = delete;
+  PointLockManagerBenchmark& operator=(PointLockManagerBenchmark&&) = delete;
+
+  ~PointLockManagerBenchmark() {
+    delete db_;
+    auto s = DestroyDir(env_, FLAGS_db_dir);
+    ASSERT_OK(s);
+  }
+
+  void run() {
+    PointLockValidationTestRunner test_runner(
+        env_, txndb_opt_, locker_, db_, txn_opt_, FLAGS_thread_count,
+        FLAGS_key_count, FLAGS_max_num_keys_to_lock_per_txn,
+        FLAGS_execution_time_sec, static_cast<LockTypeToTest>(FLAGS_lock_type),
+        FLAGS_allow_non_deadlock_error,
+        FLAGS_max_sleep_after_lock_acquisition_ms, FLAGS_check_thread_stuck);
+    test_runner.run();
+  }
+
+ private:
+  Env* env_;
+  TransactionDBOptions txndb_opt_;
+  std::shared_ptr<LockManager> locker_;
+  TransactionDB* db_;
+  TransactionOptions txn_opt_;
+};
+
+}  // anonymous namespace
+
+int point_lock_bench_tool(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  // Print test configuration
+  std::vector<gflags::CommandLineFlagInfo> all_flags;
+  gflags::GetAllFlags(&all_flags);
+
+  for (const auto& flag : all_flags) {
+    // only show the flags defined in this file
+    if (flag.filename.find("point_lock_bench_tool.cc") != std::string::npos) {
+      std::cout << "-" << flag.name << "=";
+      if (flag.type == "bool") {
+        std::cout << (gflags::GetCommandLineFlagInfoOrDie(flag.name.c_str())
+                                  .current_value == "true"
+                          ? "true"
+                          : "false");
+      } else {
+        std::cout << gflags::GetCommandLineFlagInfoOrDie(flag.name.c_str())
+                         .current_value;
+      }
+      std::cout << " ";
+    }
+  }
+  std::cout << std::endl;
+
+  // Run the benchmark
+  PointLockManagerBenchmark benchmark;
+  benchmark.run();
+
+  return 0;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
diff --git a/utilities/transactions/lock/point/point_lock_manager.cc b/utilities/transactions/lock/point/point_lock_manager.cc
index 85916b86f9af..05386b16bce0 100644
--- a/utilities/transactions/lock/point/point_lock_manager.cc
+++ b/utilities/transactions/lock/point/point_lock_manager.cc
@@ -13,7 +13,6 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/utilities/transaction_db_mutex.h"
 #include "test_util/sync_point.h"
-#include "util/cast_util.h"
 #include "util/hash.h"
 #include "util/thread_local.h"
 #include "utilities/transactions/pessimistic_transaction_db.h"
@@ -21,36 +20,275 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+constexpr bool kDebugLog = false;
+
+// KeyLockWaiter represents a waiter for a key lock. It contains a conditional
+// variable to allow waiter to wait for the key lock. It also contains other
+// metadata about the waiter such as transaction id, lock type etc.
+struct KeyLockWaiter {
+  KeyLockWaiter(std::shared_ptr<TransactionDBCondVar> c, TransactionID i,
+                bool ex)
+      : id(i), exclusive(ex), ready(false), cv(std::move(c)) {}
+
+  // disable copy constructor and assignment operator, move and move
+  // assignment
+  KeyLockWaiter(const KeyLockWaiter&) = delete;
+  KeyLockWaiter& operator=(const KeyLockWaiter&) = delete;
+  KeyLockWaiter(KeyLockWaiter&&) = delete;
+  KeyLockWaiter& operator=(KeyLockWaiter&&) = delete;
+
+  ~KeyLockWaiter() = default;
+
+  // Reset the waiter to be used again
+  void Reset(TransactionID i, bool e) {
+    id = i;
+    exclusive = e;
+    ready = false;
+  }
+
+  // Check whether the waiter has been notified that it is its turn to take the
+  // lock
+  bool IsReady() const { return ready; }
+
+  // Wait until its turn to take the lock forever
+  Status Wait(std::shared_ptr<TransactionDBMutex>& mutex) {
+    // Mutex is already locked by caller
+    // Check ready flag before wait
+    if (ready) {
+      return Status::OK();
+    }
+    return AfterWait(cv->Wait(mutex));
+  }
+
+  // Wait until its turn to take the lock within timeout_us
+  Status WaitFor(std::shared_ptr<TransactionDBMutex>& mutex,
+                 int64_t timeout_us) {
+    // Mutex is already locked by caller
+    // Check ready flag before wait
+    if (ready) {
+      return Status::OK();
+    }
+    return AfterWait(cv->WaitFor(mutex, timeout_us));
+  }
+
+  // Notify the waiter to take the lock
+  void Notify() {
+    // Mutex is already locked by caller
+    ready = true;
+    cv->Notify();
+  }
+
+  TransactionID id;
+  bool exclusive;
+
+ private:
+  Status AfterWait(Status wait_result) {
+    if (wait_result.ok() || wait_result.IsTimedOut()) {
+      // check ready again after wake up.
+      if (ready) {
+        return Status::OK();
+      } else {
+        return Status::TimedOut(Status::SubCode::kMutexTimeout);
+      }
+    } else {
+      return wait_result;
+    }
+  }
+
+  // Track whether the waiter has been woken up explicitly.
+  bool ready;
+  // TODO(Xingbo), Switch to std::binary_semaphore, once we have c++20
+  // semaphore is likely more performant than mutex + cv.
+  // Although we will also need to implement TransactionDBSemaphore, which would
+  // be required if external system wants to do instrumented lock wait tracking
+  std::shared_ptr<TransactionDBCondVar> cv;
+};
+
 struct LockInfo {
+  LockInfo(TransactionID id, uint64_t time, bool ex)
+      : exclusive(ex), expiration_time(time) {
+    txn_ids.push_back(id);
+  }
+
+  DECLARE_DEFAULT_MOVES(LockInfo);
+
   bool exclusive;
   autovector<TransactionID> txn_ids;
 
   // Transaction locks are not valid after this time in us
   uint64_t expiration_time;
 
-  LockInfo(TransactionID id, uint64_t time, bool ex)
-      : exclusive(ex), expiration_time(time) {
-    txn_ids.push_back(id);
+  // waiter queue for this key
+  // TODO xingbo, use intrusive list to avoid extra memory allocation
+  std::unique_ptr<std::list<KeyLockWaiter*>> waiter_queue;
+};
+
+// Print debug info for lock waiter wake up action.
+void DebugWakeUpWaiter(TransactionID txn_id, TransactionID waiter_id,
+                       const std::string& key, const std::string& msg) {
+  if (kDebugLog) {
+    // print which waiter got woken up
+    fprintf(stderr,
+            "Txn %" PRIu64 ": wake up next waiter on %s Txn %" PRIu64
+            " on key %s\n",
+            txn_id, msg.c_str(), waiter_id, key.c_str());
+    fflush(stderr);
+  }
+}
+
+// Key lock waiter context, used for free the lock automatically
+struct KeyLockWaiterContext {
+  // When a lock waiter is aborted due to dead lock or time out, this function
+  // is used to wake up the waiters after it, if they could proceed.
+  void TryWakeUpNextWaiters(const LockInfo& lock_info, const std::string& key) {
+    if (waiter_queue != nullptr && lock_waiter != waiter_queue->end()) {
+      bool wake_up_next_shared_waiters = false;
+
+      if (lock_waiter == waiter_queue->begin()) {
+        // if lock waiter is at the head of the queue, check the current lock
+        // status. If it is exclusive lock, no waiter should be woken up. other
+        // wise, try to wake up shared lock waiters on the right side of itself.
+        wake_up_next_shared_waiters = !lock_info.exclusive;
+      } else {
+        // if lock waiter is not at the head of the queue, check the previous
+        // lock status. If it is active and shared, it should try to wake up the
+        // shared lock waiter on the right side of itself.
+        auto lock_waiter_prev = lock_waiter;
+        lock_waiter_prev--;
+        wake_up_next_shared_waiters =
+            (*lock_waiter_prev)->IsReady() && !(*lock_waiter_prev)->exclusive;
+      }
+
+      if (wake_up_next_shared_waiters) {
+        // Go through all the waiters on the right side of the lock waiter and
+        // wake up the shared lock waiter until the end of the queue or
+        // encountered an exclusive lock waiter.
+        auto lock_waiter_next = lock_waiter;
+        lock_waiter_next++;
+        while (lock_waiter_next != waiter_queue->end() &&
+               !(*lock_waiter_next)->exclusive) {
+          (*lock_waiter_next)->Notify();
+          DebugWakeUpWaiter((*lock_waiter)->id, (*lock_waiter_next)->id, key,
+                            "TryWakeUpNextWaiters");
+          lock_waiter_next++;
+        }
+      }
+    }
   }
-  LockInfo(const LockInfo& lock_info)
 
-      = default;
-  void operator=(const LockInfo& lock_info) {
-    exclusive = lock_info.exclusive;
-    txn_ids = lock_info.txn_ids;
-    expiration_time = lock_info.expiration_time;
+  ~KeyLockWaiterContext() {
+    if (waiter_queue != nullptr && lock_waiter != waiter_queue->end()) {
+      waiter_queue->erase(lock_waiter);
+      lock_waiter = waiter_queue->end();
+    }
+    waiter_queue = nullptr;
   }
-  DECLARE_DEFAULT_MOVES(LockInfo);
+
+  // The waiter queue the lock waiter joined. Used for remove the waiter from
+  // the waiter queue.
+  std::list<KeyLockWaiter*>* waiter_queue = nullptr;
+  // The stable iterator that tracks the position of the waiter in the waiter
+  // queue. Used for remove the waiter from the waiter queue.
+  std::list<KeyLockWaiter*>::iterator lock_waiter;
 };
 
 struct LockMapStripe {
-  explicit LockMapStripe(std::shared_ptr<TransactionDBMutexFactory> factory) {
-    stripe_mutex = factory->AllocateMutex();
-    stripe_cv = factory->AllocateCondVar();
+  explicit LockMapStripe(std::shared_ptr<TransactionDBMutexFactory> factory,
+                         ThreadLocalPtr& key_lock_waiter)
+      : mutex_factory_(std::move(factory)), key_lock_waiter_(key_lock_waiter) {
+    stripe_mutex = mutex_factory_->AllocateMutex();
+    stripe_cv = mutex_factory_->AllocateCondVar();
+
     assert(stripe_mutex);
     assert(stripe_cv);
   }
 
+  LockInfo* GetLockInfo(const std::string& key) {
+    auto lock_info_iter = keys.find(key);
+    if (lock_info_iter != keys.end()) {
+      return &lock_info_iter->second;
+    } else {
+      return nullptr;
+    }
+  }
+
+  // Wait until its turn to take the lock of this key within timeout_us.
+  // By default timeout_us == 0, which means wait forever
+  void JoinWaitQueue(LockInfo& lock_info, TransactionID id, bool exclusive,
+                     bool isUpgrade, KeyLockWaiterContext& waiter_context) {
+    if (lock_info.waiter_queue == nullptr) {
+      // no waiter queue yet, create a new one
+      lock_info.waiter_queue = std::make_unique<std::list<KeyLockWaiter*>>();
+    }
+
+    auto waiter_queue = lock_info.waiter_queue.get();
+
+    // by default insert the new lock waiter at the end of the queue.
+    auto insert_point = waiter_queue->end();
+
+    if (isUpgrade) {
+      // If transaction is upgrading a shared lock to exclusive lock, prioritize
+      // it by moving its lock waiter before the first exclusive lock in the
+      // queue if there is one, or end of the queue if not exist. It will be
+      // able to acquire the lock after the other shared locks waiters at the
+      // front of queue acquired and released locks. This reduces the chance of
+      // deadlock, which makes transaction run more efficiently.
+
+      if (waiter_context.waiter_queue != nullptr) {
+        // If waiter_context is already initialized, it means current
+        // transaction already joined the lock queue. Don't move the lock
+        // position if it is already at the head of the queue or the lock
+        // waiters before it are ready to take the lock.
+        if (waiter_context.lock_waiter == waiter_queue->begin()) {
+          return;
+        }
+
+        auto prev_lock_waiter = waiter_context.lock_waiter;
+        prev_lock_waiter--;
+        if ((*prev_lock_waiter)->IsReady()) {
+          return;
+        }
+
+        // Remove existing lock waiter
+        waiter_queue->erase(waiter_context.lock_waiter);
+      }
+
+      // For upgrade, insert waiter either at the end of the queue or before the
+      // first exlusive lock waiter.
+      insert_point = waiter_queue->begin();
+      while ((insert_point != waiter_queue->end()) &&
+             (!(*insert_point)->exclusive)) {
+        insert_point++;
+      }
+    }
+
+    // Insert the new lock waiter
+    waiter_context.lock_waiter =
+        waiter_queue->insert(insert_point, GetKeyLockWaiter(id, exclusive));
+
+    waiter_context.waiter_queue = waiter_queue;
+  }
+
+  // Wait on an existing KeyLockWaiter until its turn to take the lock or
+  // timeout
+  Status WaitOnLock(std::list<KeyLockWaiter*>::iterator& lock_waiter,
+                    int64_t timeout_us = 0) {
+    Status ret;
+    if (timeout_us == 0) {
+      ret = (*lock_waiter)->Wait(stripe_mutex);
+    } else {
+      ret = (*lock_waiter)->WaitFor(stripe_mutex, timeout_us);
+    }
+    return ret;
+  }
+
+  void ReleaseLastLockHolder(
+      LockInfo& lock_info,
+      UnorderedMap<std::string, LockInfo>::iterator stripe_iter,
+      LockMap* lock_map, TransactionID txn_id, const std::string& key,
+      const int64_t max_num_locks, autovector<TransactionID>& txns,
+      autovector<TransactionID>::iterator& txn_it);
+
   // Mutex must be held before modifying keys map
   std::shared_ptr<TransactionDBMutex> stripe_mutex;
 
@@ -60,16 +298,39 @@ struct LockMapStripe {
   // Locked keys mapped to the info about the transactions that locked them.
   // TODO(agiardullo): Explore performance of other data structures.
   UnorderedMap<std::string, LockInfo> keys;
+
+ private:
+  std::shared_ptr<TransactionDBMutexFactory> mutex_factory_;
+
+  // key lock waiter, wrapped in thread local for reusing it across
+  // transactions.
+  ThreadLocalPtr& key_lock_waiter_;
+
+  // Return key lock waiter stored in thread local var, create on first use
+  KeyLockWaiter* GetKeyLockWaiter(TransactionID id, bool exclusive) {
+    KeyLockWaiter* waiter = nullptr;
+    if (key_lock_waiter_.Get() == nullptr) {
+      // create key lock waiter
+      key_lock_waiter_.Reset(
+          new KeyLockWaiter(mutex_factory_->AllocateCondVar(), id, exclusive));
+      waiter = static_cast<KeyLockWaiter*>(key_lock_waiter_.Get());
+    } else {
+      waiter = static_cast<KeyLockWaiter*>(key_lock_waiter_.Get());
+      waiter->Reset(id, exclusive);
+    }
+    return waiter;
+  }
 };
 
 // Map of #num_stripes LockMapStripes
 struct LockMap {
   explicit LockMap(size_t num_stripes,
-                   std::shared_ptr<TransactionDBMutexFactory> factory)
-      : num_stripes_(num_stripes) {
+                   std::shared_ptr<TransactionDBMutexFactory> factory,
+                   ThreadLocalPtr& key_lock_waiter)
+      : num_stripes_(num_stripes), key_lock_waiter_(key_lock_waiter) {
     lock_map_stripes_.reserve(num_stripes);
     for (size_t i = 0; i < num_stripes; i++) {
-      LockMapStripe* stripe = new LockMapStripe(factory);
+      LockMapStripe* stripe = new LockMapStripe(factory, key_lock_waiter_);
       lock_map_stripes_.push_back(stripe);
     }
   }
@@ -78,20 +339,80 @@ struct LockMap {
     for (auto stripe : lock_map_stripes_) {
       delete stripe;
     }
+    // Validate total locked key count is 0, when lock map is destructed.
+    assert(locked_key_cnt.LoadRelaxed() == 0);
   }
 
   // Number of sepearate LockMapStripes to create, each with their own Mutex
   const size_t num_stripes_;
+  ThreadLocalPtr& key_lock_waiter_;
 
   // Count of keys that are currently locked in this column family.
+  // Note that multiple shared locks on the same key is counted as 1 lock.
   // (Only maintained if PointLockManager::max_num_locks_ is positive.)
-  std::atomic<int64_t> lock_cnt{0};
+  RelaxedAtomic<int64_t> locked_key_cnt{0};
 
   std::vector<LockMapStripe*> lock_map_stripes_;
 
   size_t GetStripe(const std::string& key) const;
 };
 
+inline void RemoveTransaction(autovector<TransactionID>& txns,
+                              autovector<TransactionID>::iterator& txn_it) {
+  if (txns.size() > 1) {
+    auto last_it = txns.end() - 1;
+    if (txn_it != last_it) {
+      *txn_it = *last_it;
+    }
+  }
+  txns.pop_back();
+}
+
+void LockMapStripe::ReleaseLastLockHolder(
+    LockInfo& lock_info,
+    UnorderedMap<std::string, LockInfo>::iterator stripe_iter,
+    LockMap* lock_map, TransactionID txn_id, const std::string& key,
+    const int64_t max_num_locks, autovector<TransactionID>& txns,
+    autovector<TransactionID>::iterator& txn_it) {
+  // check whether there is other waiting transactions
+  if (lock_info.waiter_queue == nullptr || lock_info.waiter_queue->empty()) {
+    keys.erase(stripe_iter);
+    if (max_num_locks > 0) {
+      // Maintain lock count if there is a limit on the number of
+      // locks.
+      assert(lock_map->locked_key_cnt.LoadRelaxed() > 0);
+      lock_map->locked_key_cnt.FetchSubRelaxed(1);
+    }
+  } else {
+    // there are waiters in the queue, so we need to wake the next
+    // one up
+    RemoveTransaction(txns, txn_it);
+    // loop through the waiter queue and wake up all the shared lock
+    // waiters until the first exclusive lock waiter, or wake up the
+    // first waiter, if it is waiting for an exclusive lock.
+    bool first_waiter = true;
+    for (auto& waiter : *lock_info.waiter_queue) {
+      if (waiter->exclusive) {
+        if (first_waiter) {
+          // the first waiter is an exclusive lock waiter, wake it
+          // up Note that they are only notified, but not removed
+          // from the waiter queue. This allows new transaction to
+          // be aware that there are waiters ahead of them.
+          waiter->Notify();
+          DebugWakeUpWaiter(txn_id, waiter->id, key, "UnlockKey X waiter");
+        }
+        // found the first exclusive lock waiter, stop
+        break;
+      } else {
+        // wake up the shared lock waiter
+        waiter->Notify();
+        DebugWakeUpWaiter(txn_id, waiter->id, key, "UnlockKey S waiter");
+      }
+      first_waiter = false;
+    }
+  }
+}
+
 namespace {
 void UnrefLockMapsCache(void* ptr) {
   // Called when a thread exits or a ThreadLocalPtr gets destroyed.
@@ -99,6 +420,10 @@ void UnrefLockMapsCache(void* ptr) {
       static_cast<UnorderedMap<uint32_t, std::shared_ptr<LockMap>>*>(ptr);
   delete lock_maps_cache;
 }
+void UnrefKeyLockWaiter(void* ptr) {
+  auto key_lock_waiter = static_cast<KeyLockWaiter*>(ptr);
+  delete key_lock_waiter;
+}
 }  // anonymous namespace
 
 PointLockManager::PointLockManager(PessimisticTransactionDB* txn_db,
@@ -107,6 +432,7 @@ PointLockManager::PointLockManager(PessimisticTransactionDB* txn_db,
       default_num_stripes_(opt.num_stripes),
       max_num_locks_(opt.max_num_locks),
       lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)),
+      key_lock_waiter_(&UnrefKeyLockWaiter),
       dlock_buffer_(opt.max_num_deadlocks),
       mutex_factory_(opt.custom_mutex_factory
                          ? opt.custom_mutex_factory
@@ -122,7 +448,8 @@ void PointLockManager::AddColumnFamily(const ColumnFamilyHandle* cf) {
 
   if (lock_maps_.find(cf->GetID()) == lock_maps_.end()) {
     lock_maps_.emplace(cf->GetID(), std::make_shared<LockMap>(
-                                        default_num_stripes_, mutex_factory_));
+                                        default_num_stripes_, mutex_factory_,
+                                        key_lock_waiter_));
   } else {
     // column_family already exists in lock map
     assert(false);
@@ -242,16 +569,18 @@ Status PointLockManager::TryLock(PessimisticTransaction* txn,
 
   LockInfo lock_info(txn->GetID(), txn->GetExpirationTime(), exclusive);
   int64_t timeout = txn->GetLockTimeout();
+  int64_t deadlock_timeout_us = txn->GetDeadlockTimeout();
 
   return AcquireWithTimeout(txn, lock_map, stripe, column_family_id, key, env,
-                            timeout, lock_info);
+                            timeout, deadlock_timeout_us, lock_info);
 }
 
 // Helper function for TryLock().
 Status PointLockManager::AcquireWithTimeout(
     PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe,
     ColumnFamilyId column_family_id, const std::string& key, Env* env,
-    int64_t timeout, const LockInfo& lock_info) {
+    int64_t timeout, int64_t /*deadlock_timeout_us*/,
+    const LockInfo& lock_info) {
   Status result;
   uint64_t end_time = 0;
 
@@ -277,13 +606,13 @@ Status PointLockManager::AcquireWithTimeout(
   autovector<TransactionID> wait_ids;
   result = AcquireLocked(lock_map, stripe, key, env, lock_info,
                          &expire_time_hint, &wait_ids);
-
   if (!result.ok() && timeout != 0) {
     PERF_TIMER_GUARD(key_lock_wait_time);
     PERF_COUNTER_ADD(key_lock_wait_count, 1);
     // If we weren't able to acquire the lock, we will keep retrying as long
     // as the timeout allows.
     bool timed_out = false;
+    bool cv_wait_fail = false;
     do {
       // Decide how long to wait
       int64_t cv_end_time = -1;
@@ -294,8 +623,7 @@ Status PointLockManager::AcquireWithTimeout(
       } else if (end_time > 0) {
         cv_end_time = end_time;
       }
-
-      assert(result.IsBusy() || wait_ids.size() != 0);
+      assert(result.IsLockLimit() == wait_ids.empty());
 
       // We are dependent on a transaction to finish, so perform deadlock
       // detection.
@@ -315,11 +643,20 @@ Status PointLockManager::AcquireWithTimeout(
       if (cv_end_time < 0) {
         // Wait indefinitely
         result = stripe->stripe_cv->Wait(stripe->stripe_mutex);
+        cv_wait_fail = !result.ok();
       } else {
+        // FIXME: in this case, cv_end_time could be `expire_time_hint` from the
+        // current lock holder, a time out does not mean we reached the current
+        // transaction's timeout, and we should continue to retry locking
+        // instead of exiting this while loop below.
         uint64_t now = env->NowMicros();
         if (static_cast<uint64_t>(cv_end_time) > now) {
           result = stripe->stripe_cv->WaitFor(stripe->stripe_mutex,
                                               cv_end_time - now);
+          cv_wait_fail = !result.ok() && !result.IsTimedOut();
+        } else {
+          // now >= cv_end_time, we already timed out
+          result = Status::TimedOut(Status::SubCode::kLockTimeout);
         }
       }
 
@@ -329,6 +666,9 @@ Status PointLockManager::AcquireWithTimeout(
           DecrementWaiters(txn, wait_ids);
         }
       }
+      if (cv_wait_fail) {
+        break;
+      }
 
       if (result.IsTimedOut()) {
         timed_out = true;
@@ -336,19 +676,145 @@ Status PointLockManager::AcquireWithTimeout(
         // acquire lock below (it is possible the lock expired and we
         // were never signaled).
       }
-
-      if (result.ok() || result.IsTimedOut()) {
-        result = AcquireLocked(lock_map, stripe, key, env, lock_info,
-                               &expire_time_hint, &wait_ids);
-      }
+      assert(result.ok() || result.IsTimedOut());
+      wait_ids.clear();
+      result = AcquireLocked(lock_map, stripe, key, env, lock_info,
+                             &expire_time_hint, &wait_ids);
     } while (!result.ok() && !timed_out);
   }
 
   stripe->stripe_mutex->UnLock();
 
+  // On timeout, persist the lock information so we can debug the contention
+  if (result.IsTimedOut()) {
+    txn->SetWaitingTxn(wait_ids, column_family_id, &key, true);
+  }
+
+  return result;
+}
+
+// Try to lock this key after we have acquired the mutex.
+// Sets *expire_time to the expiration time in microseconds
+//  or 0 if no expiration.
+//
+// Returns Status::TimeOut if the lock cannot be acquired due to it being
+// held by other transactions, `txn_ids` will be populated with the id of
+// transactions that hold the lock, excluding lock_info.txn_ids[0].
+// Returns Status::Aborted(kLockLimit) if the lock cannot be acquired due to
+// reaching per CF limit on the number of locks.
+//
+// REQUIRED:  Stripe mutex must be held. txn_ids must be empty.
+Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+                                       const std::string& key, Env* env,
+                                       const LockInfo& txn_lock_info,
+                                       uint64_t* expire_time,
+                                       autovector<TransactionID>* txn_ids) {
+  assert(txn_lock_info.txn_ids.size() == 1);
+  assert(txn_ids && txn_ids->empty());
+
+  Status result;
+  // Check if this key is already locked
+  auto stripe_iter = stripe->keys.find(key);
+  if (stripe_iter != stripe->keys.end()) {
+    // Lock already held
+    auto& lock_info = stripe_iter->second;
+    assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive);
+
+    if (lock_info.exclusive || txn_lock_info.exclusive) {
+      if (lock_info.txn_ids.size() == 1 &&
+          lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) {
+        // The list contains one txn and we're it, so just take it.
+        lock_info.exclusive = txn_lock_info.exclusive;
+        lock_info.expiration_time = txn_lock_info.expiration_time;
+      } else {
+        // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case
+        // it's there for a shared lock with multiple holders which was not
+        // caught in the first case.
+        if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env,
+                          expire_time)) {
+          // lock is expired, can steal it
+          lock_info.txn_ids = txn_lock_info.txn_ids;
+          lock_info.exclusive = txn_lock_info.exclusive;
+          lock_info.expiration_time = txn_lock_info.expiration_time;
+          // lock_cnt does not change
+        } else {
+          result = Status::TimedOut(Status::SubCode::kLockTimeout);
+          for (auto id : lock_info.txn_ids) {
+            // A transaction is not blocked by itself
+            if (id != txn_lock_info.txn_ids[0]) {
+              txn_ids->push_back(id);
+            }
+          }
+        }
+      }
+    } else {
+      // We are requesting shared access to a shared lock, so just grant it.
+      lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]);
+      // Using std::max means that expiration time never goes down even when
+      // a transaction is removed from the list. The correct solution would be
+      // to track expiry for every transaction, but this would also work for
+      // now.
+      lock_info.expiration_time =
+          std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
+    }
+  } else {
+    // Lock not held.
+    // Check lock limit
+    if (max_num_locks_ > 0 &&
+        lock_map->locked_key_cnt.LoadRelaxed() >= max_num_locks_) {
+      result = Status::LockLimit();
+    } else {
+      // acquire lock
+      stripe->keys.try_emplace(key, txn_lock_info.txn_ids[0],
+                               txn_lock_info.expiration_time,
+                               txn_lock_info.exclusive);
+
+      // Maintain lock count if there is a limit on the number of locks
+      if (max_num_locks_ > 0) {
+        lock_map->locked_key_cnt.FetchAddRelaxed(1);
+      }
+    }
+  }
+
   return result;
 }
 
+void PointLockManager::UnLockKey(PessimisticTransaction* txn,
+                                 const std::string& key, LockMapStripe* stripe,
+                                 LockMap* lock_map, Env* env) {
+  (void)env;
+  TransactionID txn_id = txn->GetID();
+
+  auto stripe_iter = stripe->keys.find(key);
+  if (stripe_iter != stripe->keys.end()) {
+    auto& txns = stripe_iter->second.txn_ids;
+    auto txn_it = std::find(txns.begin(), txns.end(), txn_id);
+    // Found the key we locked.  unlock it.
+    if (txn_it != txns.end()) {
+      if (txns.size() == 1) {
+        stripe->keys.erase(stripe_iter);
+      } else {
+        auto last_it = txns.end() - 1;
+        if (txn_it != last_it) {
+          *txn_it = *last_it;
+        }
+        txns.pop_back();
+      }
+
+      if (max_num_locks_ > 0) {
+        // Maintain lock count if there is a limit on the number of locks.
+        assert(lock_map->locked_key_cnt.LoadRelaxed() > 0);
+        lock_map->locked_key_cnt.FetchSubRelaxed(1);
+      }
+    }
+  } else {
+    // This key is either not locked or locked by someone else.  This should
+    // only happen if the unlocking transaction has expired.
+    assert(txn->GetExpirationTime() > 0 &&
+           txn->GetExpirationTime() < env->NowMicros());
+  }
+}
+
 void PointLockManager::DecrementWaiters(
     const PessimisticTransaction* txn,
     const autovector<TransactionID>& wait_ids) {
@@ -466,130 +932,22 @@ bool PointLockManager::IncrementWaiters(
   return true;
 }
 
-// Try to lock this key after we have acquired the mutex.
-// Sets *expire_time to the expiration time in microseconds
-//  or 0 if no expiration.
-// REQUIRED:  Stripe mutex must be held.
-Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
-                                       const std::string& key, Env* env,
-                                       const LockInfo& txn_lock_info,
-                                       uint64_t* expire_time,
-                                       autovector<TransactionID>* txn_ids) {
-  assert(txn_lock_info.txn_ids.size() == 1);
+void PointLockManager::UnLock(PessimisticTransaction* txn,
+                              ColumnFamilyId column_family_id,
+                              const std::string& key, Env* env) {
+  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+  LockMap* lock_map = lock_map_ptr.get();
+  if (lock_map == nullptr) {
+    // Column Family must have been dropped.
+    return;
+  }
 
-  Status result;
-  // Check if this key is already locked
-  auto stripe_iter = stripe->keys.find(key);
-  if (stripe_iter != stripe->keys.end()) {
-    // Lock already held
-    LockInfo& lock_info = stripe_iter->second;
-    assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive);
+  // Lock the mutex for the stripe that this key hashes to
+  size_t stripe_num = lock_map->GetStripe(key);
+  assert(lock_map->lock_map_stripes_.size() > stripe_num);
+  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
 
-    if (lock_info.exclusive || txn_lock_info.exclusive) {
-      if (lock_info.txn_ids.size() == 1 &&
-          lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) {
-        // The list contains one txn and we're it, so just take it.
-        lock_info.exclusive = txn_lock_info.exclusive;
-        lock_info.expiration_time = txn_lock_info.expiration_time;
-      } else {
-        // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case
-        // it's there for a shared lock with multiple holders which was not
-        // caught in the first case.
-        if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env,
-                          expire_time)) {
-          // lock is expired, can steal it
-          lock_info.txn_ids = txn_lock_info.txn_ids;
-          lock_info.exclusive = txn_lock_info.exclusive;
-          lock_info.expiration_time = txn_lock_info.expiration_time;
-          // lock_cnt does not change
-        } else {
-          result = Status::TimedOut(Status::SubCode::kLockTimeout);
-          *txn_ids = lock_info.txn_ids;
-        }
-      }
-    } else {
-      // We are requesting shared access to a shared lock, so just grant it.
-      lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]);
-      // Using std::max means that expiration time never goes down even when
-      // a transaction is removed from the list. The correct solution would be
-      // to track expiry for every transaction, but this would also work for
-      // now.
-      lock_info.expiration_time =
-          std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
-    }
-  } else {  // Lock not held.
-    // Check lock limit
-    if (max_num_locks_ > 0 &&
-        lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) {
-      result = Status::Busy(Status::SubCode::kLockLimit);
-    } else {
-      // acquire lock
-      stripe->keys.emplace(key, txn_lock_info);
-
-      // Maintain lock count if there is a limit on the number of locks
-      if (max_num_locks_) {
-        lock_map->lock_cnt++;
-      }
-    }
-  }
-
-  return result;
-}
-
-void PointLockManager::UnLockKey(PessimisticTransaction* txn,
-                                 const std::string& key, LockMapStripe* stripe,
-                                 LockMap* lock_map, Env* env) {
-#ifdef NDEBUG
-  (void)env;
-#endif
-  TransactionID txn_id = txn->GetID();
-
-  auto stripe_iter = stripe->keys.find(key);
-  if (stripe_iter != stripe->keys.end()) {
-    auto& txns = stripe_iter->second.txn_ids;
-    auto txn_it = std::find(txns.begin(), txns.end(), txn_id);
-    // Found the key we locked.  unlock it.
-    if (txn_it != txns.end()) {
-      if (txns.size() == 1) {
-        stripe->keys.erase(stripe_iter);
-      } else {
-        auto last_it = txns.end() - 1;
-        if (txn_it != last_it) {
-          *txn_it = *last_it;
-        }
-        txns.pop_back();
-      }
-
-      if (max_num_locks_ > 0) {
-        // Maintain lock count if there is a limit on the number of locks.
-        assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
-        lock_map->lock_cnt--;
-      }
-    }
-  } else {
-    // This key is either not locked or locked by someone else.  This should
-    // only happen if the unlocking transaction has expired.
-    assert(txn->GetExpirationTime() > 0 &&
-           txn->GetExpirationTime() < env->NowMicros());
-  }
-}
-
-void PointLockManager::UnLock(PessimisticTransaction* txn,
-                              ColumnFamilyId column_family_id,
-                              const std::string& key, Env* env) {
-  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
-  LockMap* lock_map = lock_map_ptr.get();
-  if (lock_map == nullptr) {
-    // Column Family must have been dropped.
-    return;
-  }
-
-  // Lock the mutex for the stripe that this key hashes to
-  size_t stripe_num = lock_map->GetStripe(key);
-  assert(lock_map->lock_map_stripes_.size() > stripe_num);
-  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
-
-  stripe->stripe_mutex->Lock().PermitUncheckedError();
+  stripe->stripe_mutex->Lock().AssertOK();
   UnLockKey(txn, key, stripe, lock_map, env);
   stripe->stripe_mutex->UnLock();
 
@@ -631,7 +989,7 @@ void PointLockManager::UnLock(PessimisticTransaction* txn,
       assert(lock_map->lock_map_stripes_.size() > stripe_num);
       LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
 
-      stripe->stripe_mutex->Lock().PermitUncheckedError();
+      stripe->stripe_mutex->Lock().AssertOK();
 
       for (const std::string* key : stripe_keys) {
         UnLockKey(txn, *key, stripe, lock_map, env);
@@ -662,7 +1020,7 @@ PointLockManager::PointLockStatus PointLockManager::GetPointLockStatus() {
     const auto& stripes = lock_maps_[i]->lock_map_stripes_;
     // Iterate and lock all stripes in ascending order.
     for (const auto& j : stripes) {
-      j->stripe_mutex->Lock().PermitUncheckedError();
+      j->stripe_mutex->Lock().AssertOK();
       for (const auto& it : j->keys) {
         struct KeyLockInfo info;
         info.exclusive = it.second.exclusive;
@@ -714,4 +1072,758 @@ void PointLockManager::UnLock(PessimisticTransaction* /* txn */,
   // no-op
 }
 
+// PerKeyPointLockManager implementation
+PerKeyPointLockManager::PerKeyPointLockManager(PessimisticTransactionDB* db,
+                                               const TransactionDBOptions& opt)
+    : PointLockManager(db, opt) {}
+
+void DebugLockStatus(TransactionID my_txn_id, const LockInfo& lock_info,
+                     const std::string& key,
+                     const KeyLockWaiterContext& key_lock_waiter_ctx) {
+  if (kDebugLog) {
+    char msg[512];
+    size_t offset = 0;
+
+    // print lock holders
+    offset += snprintf(msg + offset, sizeof(msg),
+                       "Txn %" PRIu64 ": LockStatus key %s: holder [",
+                       my_txn_id, key.c_str());
+    for (const auto& txn_id : lock_info.txn_ids) {
+      offset += snprintf(msg + offset, sizeof(msg), "%s%" PRIu64 ",",
+                         lock_info.exclusive ? "X" : "S", txn_id);
+    }
+
+    // print waiter queue
+    offset += snprintf(msg + offset, sizeof(msg), "], waiter_queue [");
+    for (auto it = key_lock_waiter_ctx.waiter_queue->begin();
+         it != key_lock_waiter_ctx.waiter_queue->end(); it++) {
+      offset += snprintf(msg + offset, sizeof(msg), "%s%" PRIu64 ",",
+                         (*it)->exclusive ? "X" : "S", (*it)->id);
+    }
+
+    offset += snprintf(msg + offset, sizeof(msg), "]\n");
+    fprintf(stderr, "%s", msg);
+    fflush(stderr);
+  }
+}
+
+int64_t PerKeyPointLockManager::CalculateWaitEndTime(int64_t expire_time_hint,
+                                                     int64_t end_time) {
+  int64_t cv_end_time = -1;
+  if (expire_time_hint > 0 && end_time > 0) {
+    cv_end_time = std::min(expire_time_hint, end_time);
+  } else if (expire_time_hint > 0) {
+    cv_end_time = expire_time_hint;
+  } else if (end_time > 0) {
+    cv_end_time = end_time;
+  }
+  return cv_end_time;
+}
+
+// Acquire lock within timeout.
+// This function is similar to PointLockManger::AcquireWithTimeout with
+// following differences.
+//
+// If deadlock_timeout_us is not 0, it first performs a wait without doing dead
+// lock detection. This wait duration is specified by deadlock_timeout_us.
+// If this wait times out and it is still not able to acquire the lock, perform
+// the deadlock detection before wait again.
+//
+// It uses a per key lock waiter queue to handle lock waiting and wake up
+// efficiently. When a transaction is waiting for acquiring a lock on a key, it
+// joins a wait queue that is dedicated for this key. It will either timeout, or
+// get woken up when it is its turn to take the lock. This is more efficient
+// than the PointLockManger implementation where all lock waiters wait on the
+// same lock stripe cond var.
+Status PerKeyPointLockManager::AcquireWithTimeout(
+    PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe,
+    ColumnFamilyId column_family_id, const std::string& key, Env* env,
+    int64_t timeout, int64_t deadlock_timeout_us,
+    const LockInfo& txn_lock_info) {
+  Status result;
+  uint64_t end_time = 0;
+  auto my_txn_id = txn_lock_info.txn_ids[0];
+
+  if (timeout > 0) {
+    uint64_t start_time = env->NowMicros();
+    end_time = start_time + timeout;
+  }
+
+  if (timeout < 0) {
+    // If timeout is negative, we wait indefinitely to acquire the lock
+    result = stripe->stripe_mutex->Lock();
+  } else {
+    result = stripe->stripe_mutex->TryLockFor(timeout);
+  }
+
+  if (!result.ok()) {
+    // failed to acquire mutex
+    return result;
+  }
+
+  // Acquire lock if we are able to
+  uint64_t expire_time_hint = 0;
+  autovector<TransactionID> wait_ids;
+  bool isUpgrade = false;
+
+  auto lock_info = stripe->GetLockInfo(key);
+
+  auto wait_before_deadlock_detection =
+      txn->IsDeadlockDetect() && (deadlock_timeout_us > 0);
+  result = AcquireLocked(
+      lock_map, stripe, key, env, txn_lock_info, &expire_time_hint,
+      // If wait before deadlock detection, it executes a fast path to save CPU
+      // cycles, wait ids are not collected.
+      wait_before_deadlock_detection ? nullptr : &wait_ids, &lock_info,
+      &isUpgrade, true);
+  if (!result.ok() && timeout != 0 &&
+      /* No need to retry after reach lock limit or aborted */
+      !result.IsLockLimit() && !result.IsAborted()) {
+    assert(lock_info);
+
+    PERF_TIMER_GUARD(key_lock_wait_time);
+    PERF_COUNTER_ADD(key_lock_wait_count, 1);
+    // If we weren't able to acquire the lock, we will keep retrying as long
+    // as the timeout allows.
+    bool timed_out = false;
+    bool cv_wait_fail = false;
+
+    KeyLockWaiterContext key_lock_waiter_ctx;
+
+    // Decide how long to wait
+    auto cv_end_time = CalculateWaitEndTime(expire_time_hint, end_time);
+
+    // We will try to wait a little bit before checking deadlock, as
+    // deadlock check is expensive.
+    if (wait_before_deadlock_detection) {
+      int64_t now = env->NowMicros();
+      if (cv_end_time < 0 || cv_end_time > now) {
+        if (kDebugLog) {
+          // print lock status before deadlock detection
+          fprintf(stderr,
+                  "Txn %" PRIu64
+                  " wait before deadlock detection %s, exclusive lock "
+                  "%d\n",
+                  my_txn_id, key.c_str(), txn_lock_info.exclusive);
+          fflush(stderr);
+        }
+        stripe->JoinWaitQueue(*lock_info, my_txn_id, txn_lock_info.exclusive,
+                              false, key_lock_waiter_ctx);
+        DebugLockStatus(my_txn_id, *lock_info, key, key_lock_waiter_ctx);
+
+        TEST_SYNC_POINT(
+            "PerKeyPointLockManager::AcquireWithTimeout:"
+            "WaitingTxnBeforeDeadLockDetection");
+        result = stripe->WaitOnLock(
+            key_lock_waiter_ctx.lock_waiter,
+            std::min(cv_end_time - now, (int64_t)deadlock_timeout_us));
+        assert(result.ok() || result.IsTimedOut());
+        // Refresh lock info pointer, as this pointer is not guaranteed to be
+        // stable in folly
+        lock_info = stripe->GetLockInfo(key);
+        // try to take a lock again to get wait ids after deadlock timeout
+        result = AcquireLocked(lock_map, stripe, key, env, txn_lock_info,
+                               &expire_time_hint, &wait_ids, &lock_info,
+                               &isUpgrade, !result.ok());
+      } else {
+        // Already timed out
+        timed_out = true;
+        result = Status::TimedOut(Status::SubCode::kLockTimeout);
+      }
+    }
+
+    while (!result.ok() && !timed_out && !result.IsAborted()) {
+      // Refresh wait end time
+      cv_end_time = CalculateWaitEndTime(expire_time_hint, end_time);
+
+      // We are dependent on a transaction to finish, so perform deadlock
+      // detection.
+      if (!wait_ids.empty()) {
+        if (txn->IsDeadlockDetect()) {
+          if (IncrementWaiters(txn, wait_ids, key, column_family_id,
+                               txn_lock_info.exclusive, env)) {
+            result = Status::Busy(Status::SubCode::kDeadlock);
+            break;
+          }
+        }
+        txn->SetWaitingTxn(wait_ids, column_family_id, &key);
+      }
+
+      TEST_SYNC_POINT("PointLockManager::AcquireWithTimeout:WaitingTxn");
+
+      if (kDebugLog) {
+        // print transaction lock status and wait ids
+        char msg[512];
+        size_t offset = 0;
+        offset += snprintf(msg + offset, sizeof(msg),
+                           "Txn %" PRIu64
+                           " wait after deadlock detection %s, exclusive lock "
+                           "%d, upgrade %d, wait_ids [",
+                           my_txn_id, key.c_str(), txn_lock_info.exclusive,
+                           isUpgrade);
+
+        for (auto it = wait_ids.begin(); it != wait_ids.end(); it++) {
+          offset += snprintf(msg + offset, sizeof(msg), "%" PRIu64 ",", *it);
+        }
+
+        offset += snprintf(msg + offset, sizeof(msg), "]\n");
+
+        fprintf(stderr, "%s", msg);
+        fflush(stderr);
+      }
+
+      // If it has not joined wait queue, join it now.
+      // If it is a lock upgrade, rejoin it.
+      if (isUpgrade || (key_lock_waiter_ctx.waiter_queue == nullptr)) {
+        stripe->JoinWaitQueue(*lock_info, my_txn_id, txn_lock_info.exclusive,
+                              isUpgrade, key_lock_waiter_ctx);
+
+        DebugLockStatus(my_txn_id, *lock_info, key, key_lock_waiter_ctx);
+      }
+
+      int64_t now = 0;
+      if (cv_end_time < 0) {
+        // Wait indefinitely
+        result = stripe->WaitOnLock(key_lock_waiter_ctx.lock_waiter);
+        cv_wait_fail = !result.ok();
+      } else {
+        now = env->NowMicros();
+        if (cv_end_time > now) {
+          result = stripe->WaitOnLock(key_lock_waiter_ctx.lock_waiter,
+                                      cv_end_time - now);
+
+          cv_wait_fail = !result.ok() && !result.IsTimedOut();
+        } else {
+          // now >= cv_end_time, we already timed out
+          result = Status::TimedOut(Status::SubCode::kLockTimeout);
+        }
+      }
+
+#ifndef NDEBUG
+      stripe->stripe_mutex->UnLock();
+      TEST_SYNC_POINT_CALLBACK(
+          "PerKeyPointLockManager::AcquireWithTimeout:AfterWokenUp",
+          &my_txn_id);
+      TEST_SYNC_POINT(
+          "PerKeyPointLockManager::AcquireWithTimeout:BeforeTakeLock");
+      auto lock_status = stripe->stripe_mutex->Lock();
+      assert(lock_status.ok());
+#endif
+
+      if (!wait_ids.empty()) {
+        txn->ClearWaitingTxn();
+        if (txn->IsDeadlockDetect()) {
+          DecrementWaiters(txn, wait_ids);
+        }
+      }
+
+      if (cv_wait_fail) {
+        break;
+      }
+
+      if (result.IsTimedOut()) {
+        timed_out = true;
+        // Even though we timed out, we will still make one more attempt to
+        // acquire lock below (it is possible the lock expired and we
+        // were never signaled).
+      }
+      assert(result.ok() || result.IsTimedOut());
+
+      // Refresh lock info pointer, as this pointer is not guaranteed to be
+      // stable in folly
+      lock_info = stripe->GetLockInfo(key);
+
+      // Try to get the lock again.
+      result = AcquireLocked(
+          lock_map, stripe, key, env, txn_lock_info, &expire_time_hint,
+          &wait_ids, &lock_info, &isUpgrade,
+          /* If wait is timed out, it means it is not its turn to take the lock.
+           * Therefore, it should still follow FIFO order. */
+          timed_out);
+      auto fail_to_take_lock_on_its_turn = !timed_out && !result.ok();
+      if (fail_to_take_lock_on_its_turn) {
+        // If it is its turn, but it failed to take lock, something is broken.
+        // Assert this should not happen in debug build during testing.
+        // In prod, it simply gives up the attempt.
+        assert(!fail_to_take_lock_on_its_turn);
+        break;
+      }
+
+      if (!result.ok() && cv_end_time >= 0) {
+        if (static_cast<int64_t>(end_time) <= now) {
+          // lock timeout timed out
+          result = Status::TimedOut(Status::SubCode::kLockTimeout);
+          timed_out = true;
+        }
+      }
+    }
+
+    // For any reason that the transaction failed to acquire the lock, it should
+    // try to wake up next waiters, if they are ready to proceed.
+    if (!result.ok()) {
+      key_lock_waiter_ctx.TryWakeUpNextWaiters(*lock_info, key);
+    }
+  }
+
+  stripe->stripe_mutex->UnLock();
+
+  // On timeout, persist the lock information so we can debug the contention
+  if (result.IsTimedOut()) {
+    txn->SetWaitingTxn(wait_ids, column_family_id, &key, true);
+  }
+
+  return result;
+}
+
+Status PerKeyPointLockManager::FillWaitIds(LockInfo& lock_info,
+                                           const LockInfo& txn_lock_info,
+                                           autovector<TransactionID>* wait_ids,
+                                           bool& isUpgrade,
+                                           TransactionID& my_txn_id,
+                                           const std::string& key) {
+  if (wait_ids != nullptr) {
+    for (auto id : lock_info.txn_ids) {
+      // A transaction is not blocked by itself
+      if (id != my_txn_id) {
+        wait_ids->push_back(id);
+      } else {
+        // Itself is already holding a lock, so it is either an upgrade or
+        // downgrade. Downgrade has already been handled above. Assert it
+        // is an upgrade here.
+        auto is_upgrade = !lock_info.exclusive && txn_lock_info.exclusive;
+        if (!is_upgrade) {
+          if (kDebugLog) {
+            fprintf(stderr,
+                    "txn id %" PRIu64 " assert failed on lock upgrade key %s\n",
+                    my_txn_id, key.c_str());
+            fflush(stderr);
+          }
+          assert(is_upgrade);
+          return Status::Aborted(Status::SubCode::kNotExpectedCodePath);
+        }
+        isUpgrade = true;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// This function is similar to PointLockManager::AcquireLocked with following
+// differences.
+//
+// It introduces a per key lock waiter queue. When it tries to take the lock, it
+// will first check whether there are other transactions already in the waiter
+// queue, if so it will return TimeOut. Caller will join the waiter queue, if
+// lock timeout is not reached yet. When it is its to take the lock, it will be
+// woken up and take the lock.
+//
+// It introduces a fast path check that will quickly check whether the lock
+// could be obtained without gathering waiter id information. This allows
+// transaction to sleep a short time before perform deadlock detection.
+//
+// @param lock_info_ptr: pointer to the LockInfo associated with the key. If the
+//    key is already locked, LockInfo will be not null. If not, LockInfo is
+//    null, and a new LockInfo is created and assigned to lock_info_ptr.
+//
+// @param wait_ids: When wait_ids is nullptr, it perform a fast path check to
+//    see whether it could take the lock, it does not fill waiter_ids. If
+//    wait_ids is not nullptr, it will fill the wait_ids with the lock holder.
+//
+// @param isUpgrade: isUpgrade is set to true, if the transaction tries to
+//    uprade a lock to exclusive, but it needs to wait for other lock holders to
+//    release the shared locks. Note that isUpgrade is not set on fast path
+//    check.
+//
+// @param fifo: fifo flag indicates whether it should follow fifo order to check
+//    whether there is already a waiter waiting for the lock or not. If fifo is
+//    true and there is already a lock waiter waiting in the queue and it is not
+//    itself, return TimedOut. If fifo is false, it means it is its turn to take
+//    the lock.
+Status PerKeyPointLockManager::AcquireLocked(
+    LockMap* lock_map, LockMapStripe* stripe, const std::string& key, Env* env,
+    const LockInfo& txn_lock_info, uint64_t* expire_time,
+    autovector<TransactionID>* wait_ids, LockInfo** lock_info_ptr,
+    bool* isUpgrade, bool fifo) {
+  assert(txn_lock_info.txn_ids.size() == 1);
+
+  if (wait_ids != nullptr) {
+    wait_ids->clear();
+  }
+
+  *isUpgrade = false;
+  auto my_txn_id = txn_lock_info.txn_ids[0];
+
+  if (!*lock_info_ptr) {
+    // No lock nor waiter on this key, so it can try to acquire the lock
+    // directly
+    if (max_num_locks_ > 0 &&
+        lock_map->locked_key_cnt.LoadRelaxed() >= max_num_locks_) {
+      return Status::LockLimit();
+    } else {
+      // acquire lock
+      auto ret = stripe->keys.try_emplace(key, my_txn_id,
+                                          txn_lock_info.expiration_time,
+                                          txn_lock_info.exclusive);
+      assert(ret.second);
+      *lock_info_ptr = &(ret.first->second);
+
+      // Maintain lock count if there is a limit on the number of locks
+      if (max_num_locks_ > 0) {
+        lock_map->locked_key_cnt.FetchAddRelaxed(1);
+      }
+
+      return Status::OK();
+    }
+  }
+
+  auto& lock_info = **lock_info_ptr;
+  auto locked = !lock_info.txn_ids.empty();
+  auto solo_lock_owner =
+      (lock_info.txn_ids.size() == 1) && (lock_info.txn_ids[0] == my_txn_id);
+
+  // Handle lock downgrade and reentrant first, it should always succeed
+  if (locked) {
+    if (solo_lock_owner) {
+      // Lock is already owned by itself.
+      if (lock_info.exclusive && !txn_lock_info.exclusive) {
+        // For downgrade, wake up all the shared lock waiters at the front of
+        // the waiter queue
+        if (lock_info.waiter_queue != nullptr) {
+          for (auto& waiter : *lock_info.waiter_queue) {
+            if (waiter->exclusive) {
+              break;
+            }
+            waiter->Notify();
+            DebugWakeUpWaiter(my_txn_id, waiter->id, key, "Lock Downgrade");
+          }
+        }
+      }
+
+      if (lock_info.exclusive || !txn_lock_info.exclusive) {
+        // If it is lock downgrade or re-entrant, grant it immediately
+        lock_info.exclusive = txn_lock_info.exclusive;
+        lock_info.expiration_time = txn_lock_info.expiration_time;
+        return Status::OK();
+      }
+    } else {
+      // handle read reentrant lock for non solo lock owner case
+      // Check whether the transaction already hold a shared lock and it is
+      // trying to acquire it again.
+      if (!txn_lock_info.exclusive && !lock_info.exclusive) {
+        auto lock_it = std::find(lock_info.txn_ids.begin(),
+                                 lock_info.txn_ids.end(), my_txn_id);
+        if (lock_it != lock_info.txn_ids.end()) {
+          lock_info.expiration_time = std::max(lock_info.expiration_time,
+                                               txn_lock_info.expiration_time);
+          return Status::OK();
+        }
+      }
+    }
+  }
+
+  auto has_waiter =
+      (lock_info.waiter_queue != nullptr) && !lock_info.waiter_queue->empty();
+
+  // Update solo lock owner for the rest of the cases
+  if (solo_lock_owner) {
+    // If there is a shared lock waiter that is ready to take the lock, the
+    // current transaction would not be the solo lock owner.
+    auto has_ready_shared_lock_waiter =
+        has_waiter && lock_info.waiter_queue->front()->IsReady() &&
+        (!lock_info.waiter_queue->front()->exclusive);
+    solo_lock_owner = !has_ready_shared_lock_waiter;
+  }
+
+  // If myself is the first waiter in the queue, skip checking waiter queue
+  auto is_first_waiter =
+      has_waiter && (lock_info.waiter_queue->front()->id == my_txn_id);
+
+  if (fifo && has_waiter && !is_first_waiter) {
+    // There are other waiters ahead of myself
+    {
+      // handle shared lock request on a shared lock with only shared lock
+      // waiters
+      if (!txn_lock_info.exclusive &&
+          (!locked || (locked && !lock_info.exclusive))) {
+        bool has_exclusive_waiter = false;
+        // check whether there is exclusive lock waiter
+        for (auto& waiter : *lock_info.waiter_queue) {
+          if (waiter->exclusive) {
+            has_exclusive_waiter = true;
+            break;
+          }
+        }
+        if (!has_exclusive_waiter) {
+          // no X waiter in the queue, so it can acquire the lock without
+          // waiting
+          lock_info.txn_ids.push_back(my_txn_id);
+          lock_info.exclusive = false;
+          lock_info.expiration_time = std::max(lock_info.expiration_time,
+                                               txn_lock_info.expiration_time);
+          return Status::OK();
+        }
+      }
+    }
+
+    // fast path check for lock upgrade
+    if (solo_lock_owner && !lock_info.exclusive && txn_lock_info.exclusive) {
+      // During lock upgrade, if it is the only transaction owns the lock and no
+      // other shared lock requesting transaction is ready to take the lock,
+      // prioritize the lock grade and grant it now.
+      lock_info.exclusive = txn_lock_info.exclusive;
+      lock_info.expiration_time = txn_lock_info.expiration_time;
+      return Status::OK();
+    }
+
+    if (wait_ids == nullptr) {
+      // If wait_ids is nullptr, it is a fast path check to see whether it is
+      // able to take the lock or not, skip filling the waiting txn ids for
+      // deadlock detection.
+      return Status::TimedOut(Status::SubCode::kLockTimeout);
+    }
+
+    // For other cases with fifo and lock waiter, try to wait in the queue
+    // and fill the waiting txn list
+    auto s = FillWaitIds(lock_info, txn_lock_info, wait_ids, *isUpgrade,
+                         my_txn_id, key);
+    if (!s.ok()) {
+      // propagate error up
+      return s;
+    }
+
+    // Add the waiter txn ids to the blocking txn id list
+    if (txn_lock_info.exclusive) {
+      // For exclusive lock, it traverse the queue from front to back to
+      // handle upgrade
+      for (auto& waiter : *lock_info.waiter_queue) {
+        // For upgrade locks, it will be placed at the beginning of
+        // the queue. However, for shared lock waiters that are at
+        // the beginning of the queue that got woken up but haven't
+        // taken the lock yet, they should still be added to the
+        // blocking txn id list.
+        if (*isUpgrade && waiter->exclusive) {
+          break;
+        }
+        if (waiter->id != my_txn_id) {
+          wait_ids->push_back(waiter->id);
+        }
+      }
+    } else {
+      // For shared lock, skip the S lock waiters at the end of the queue, as
+      // they will be waked up together. Therefore, it traverses the queue from
+      // from back to front.
+      bool skip_shared_lock_waiter = true;
+      for (auto it = lock_info.waiter_queue->rbegin();
+           it != lock_info.waiter_queue->rend(); ++it) {
+        if ((*it)->exclusive) {
+          skip_shared_lock_waiter = false;
+        } else {
+          if (skip_shared_lock_waiter) {
+            continue;
+          }
+        }
+        if ((*it)->id != my_txn_id) {
+          wait_ids->push_back((*it)->id);
+        }
+      }
+    }
+
+    return Status::TimedOut(Status::SubCode::kLockTimeout);
+  } else {
+    // there is no waiter or it is its turn to take the lock
+    if (!locked) {
+      // no lock on this key, acquire it directly
+      lock_info.txn_ids = txn_lock_info.txn_ids;
+      lock_info.exclusive = txn_lock_info.exclusive;
+      lock_info.expiration_time = txn_lock_info.expiration_time;
+      return Status::OK();
+    }
+
+    if (IsLockExpired(my_txn_id, lock_info, env, expire_time)) {
+      // current lock is expired, steal it.
+      lock_info.txn_ids = txn_lock_info.txn_ids;
+      lock_info.exclusive = txn_lock_info.exclusive;
+      lock_info.expiration_time = txn_lock_info.expiration_time;
+      return Status::OK();
+    }
+
+    // Check lock compatibility
+    if (txn_lock_info.exclusive) {
+      // handle lock upgrade
+      if (solo_lock_owner) {
+        // Lock re-entrant or downgrade has already been handled above.
+        // Assert it is an upgrade here. Acquire the lock directly.
+        assert(!lock_info.exclusive);
+        lock_info.exclusive = txn_lock_info.exclusive;
+        lock_info.expiration_time = txn_lock_info.expiration_time;
+        return Status::OK();
+      } else {
+        // lock is already owned by other transactions
+        auto s = FillWaitIds(lock_info, txn_lock_info, wait_ids, *isUpgrade,
+                             my_txn_id, key);
+        if (!s.ok()) {
+          // propagate error up
+          return s;
+        }
+        return Status::TimedOut(Status::SubCode::kLockTimeout);
+      }
+    } else {
+      // handle shared lock request
+      if (lock_info.exclusive) {
+        // lock is already owned by other exclusive lock
+        auto s = FillWaitIds(lock_info, txn_lock_info, wait_ids, *isUpgrade,
+                             my_txn_id, key);
+        if (!s.ok()) {
+          // propagate error up
+          return s;
+        }
+        return Status::TimedOut(Status::SubCode::kLockTimeout);
+      } else {
+        // lock is on shared lock state, acquire it
+        lock_info.txn_ids.push_back(my_txn_id);
+        // update the expiration time
+        lock_info.expiration_time =
+            std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
+        return Status::OK();
+      }
+    }
+  }
+}
+
+void PerKeyPointLockManager::UnLockKey(PessimisticTransaction* txn,
+                                       const std::string& key,
+                                       LockMapStripe* stripe, LockMap* lock_map,
+                                       Env* env) {
+#ifdef NDEBUG
+  (void)env;
+#endif
+  TransactionID txn_id = txn->GetID();
+
+  auto stripe_iter = stripe->keys.find(key);
+  if (stripe_iter != stripe->keys.end()) {
+    auto& lock_info = stripe_iter->second;
+    auto& txns = lock_info.txn_ids;
+    auto txn_it = std::find(txns.begin(), txns.end(), txn_id);
+
+    if (txn_it != txns.end()) {
+      // If the lock was held in exclusive mode, only one transaction should
+      // holding it.
+      if (lock_info.exclusive) {
+        assert(txns.size() == 1);
+        stripe->ReleaseLastLockHolder(lock_info, stripe_iter, lock_map, txn_id,
+                                      key, max_num_locks_, txns, txn_it);
+      } else {
+        // In shared mode, it is possible that another transaction is holding
+        // a shared lock and is waiting to upgrade the lock to exclusive.
+        assert(txns.size() >= 1);
+        if (txns.size() > 2) {
+          // Including the current transaction, if there are more than 2
+          // transactions holding the lock in shared mode, don't wake up any
+          // waiter, as the next waiter will not be able to acquire the lock
+          // anyway.
+          RemoveTransaction(txns, txn_it);
+        } else if (txns.size() == 2) {
+          // remove the current transaction first.
+          RemoveTransaction(txns, txn_it);
+          // Check whether the one remained is trying to upgrade the lock by
+          // checking whether its id matches.
+          auto& waiter_queue = lock_info.waiter_queue;
+          if (waiter_queue != nullptr && !waiter_queue->empty() &&
+              waiter_queue->front()->id == txns[0]) {
+            // There are waiters in the queue and the next one is same as the
+            // only one that is still holding the shared lock, wake the waiter
+            // up
+            waiter_queue->front()->Notify();
+            DebugWakeUpWaiter(txn_id, waiter_queue->front()->id, key,
+                              "Lock Upgrade");
+          }
+        } else {
+          // Current transaction is the only one holding the shared lock
+          stripe->ReleaseLastLockHolder(lock_info, stripe_iter, lock_map,
+                                        txn_id, key, max_num_locks_, txns,
+                                        txn_it);
+        }
+      }
+    }
+  } else {
+    // This key is either not locked or locked by someone else.  This should
+    // only happen if the unlocking transaction has expired.
+    assert(txn->GetExpirationTime() > 0 &&
+           txn->GetExpirationTime() < env->NowMicros());
+  }
+}
+
+void PerKeyPointLockManager::UnLock(PessimisticTransaction* txn,
+                                    ColumnFamilyId column_family_id,
+                                    const std::string& key, Env* env) {
+  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+  LockMap* lock_map = lock_map_ptr.get();
+  if (lock_map == nullptr) {
+    // Column Family must have been dropped.
+    return;
+  }
+
+  // Lock the mutex for the stripe that this key hashes to
+  size_t stripe_num = lock_map->GetStripe(key);
+  assert(lock_map->lock_map_stripes_.size() > stripe_num);
+  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+  stripe->stripe_mutex->Lock().AssertOK();
+  UnLockKey(txn, key, stripe, lock_map, env);
+  stripe->stripe_mutex->UnLock();
+}
+
+void PerKeyPointLockManager::UnLock(PessimisticTransaction* txn,
+                                    const LockTracker& tracker, Env* env) {
+  std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+      tracker.GetColumnFamilyIterator());
+  assert(cf_it != nullptr);
+  while (cf_it->HasNext()) {
+    ColumnFamilyId cf = cf_it->Next();
+    std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(cf);
+    LockMap* lock_map = lock_map_ptr.get();
+    if (!lock_map) {
+      // Column Family must have been dropped.
+      return;
+    }
+
+    // Bucket keys by lock_map_ stripe
+    UnorderedMap<size_t, std::vector<const std::string*>> keys_by_stripe(
+        lock_map->num_stripes_);
+    std::unique_ptr<LockTracker::KeyIterator> key_it(
+        tracker.GetKeyIterator(cf));
+    assert(key_it != nullptr);
+    while (key_it->HasNext()) {
+      const std::string& key = key_it->Next();
+      size_t stripe_num = lock_map->GetStripe(key);
+      keys_by_stripe[stripe_num].push_back(&key);
+    }
+
+    // For each stripe, grab the stripe mutex and unlock all keys in this
+    // stripe
+    for (auto& stripe_iter : keys_by_stripe) {
+      size_t stripe_num = stripe_iter.first;
+      auto& stripe_keys = stripe_iter.second;
+
+      assert(lock_map->lock_map_stripes_.size() > stripe_num);
+      LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+      stripe->stripe_mutex->Lock().AssertOK();
+
+      for (const std::string* key : stripe_keys) {
+        UnLockKey(txn, *key, stripe, lock_map, env);
+      }
+
+      stripe->stripe_mutex->UnLock();
+    }
+  }
+}
+
+void PerKeyPointLockManager::UnLock(PessimisticTransaction* /* txn */,
+                                    ColumnFamilyId /* cf_id */,
+                                    const Endpoint& /* start */,
+                                    const Endpoint& /* end */, Env* /* env */) {
+  // no-op
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/point/point_lock_manager.h b/utilities/transactions/lock/point/point_lock_manager.h
index 99183ca1cd2f..1fa8e7a78a3f 100644
--- a/utilities/transactions/lock/point/point_lock_manager.h
+++ b/utilities/transactions/lock/point/point_lock_manager.h
@@ -132,8 +132,12 @@ class PointLockManager : public LockManager {
   // this column family is no longer in use.
   void RemoveColumnFamily(const ColumnFamilyHandle* cf) override;
 
+  // Caller makes sure that a lock on the key is not requested again, unless it
+  // is an upgrade or downgrade.
   Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
                  const std::string& key, Env* env, bool exclusive) override;
+  // Caller makes sure that a lock on the key is not requested again, unless it
+  // is an upgrade or downgrade.
   Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
                  const Endpoint& start, const Endpoint& end, Env* env,
                  bool exclusive) override;
@@ -153,7 +157,7 @@ class PointLockManager : public LockManager {
 
   void Resize(uint32_t new_size) override;
 
- private:
+ protected:
   PessimisticTransactionDB* txn_db_impl_;
 
   // Default number of lock map stripes per column family
@@ -179,6 +183,11 @@ class PointLockManager : public LockManager {
   // to avoid acquiring a mutex in order to look up a LockMap
   std::unique_ptr<ThreadLocalPtr> lock_maps_cache_;
 
+  // Thread local variable for KeyLockWaiter. As one thread could only need one
+  // KeyLockWaiter.
+  // Lazy init on first time usage
+  ThreadLocalPtr key_lock_waiter_;
+
   // Must be held when modifying wait_txn_map_ and rev_wait_txn_map_.
   std::mutex wait_txn_map_mutex_;
 
@@ -196,19 +205,16 @@ class PointLockManager : public LockManager {
 
   std::shared_ptr<LockMap> GetLockMap(uint32_t column_family_id);
 
-  Status AcquireWithTimeout(PessimisticTransaction* txn, LockMap* lock_map,
-                            LockMapStripe* stripe, uint32_t column_family_id,
-                            const std::string& key, Env* env, int64_t timeout,
-                            const LockInfo& lock_info);
+  virtual Status AcquireWithTimeout(
+      PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe,
+      uint32_t column_family_id, const std::string& key, Env* env,
+      int64_t timeout, int64_t deadlock_timeout_us, const LockInfo& lock_info);
 
-  Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
-                       const std::string& key, Env* env,
-                       const LockInfo& lock_info, uint64_t* wait_time,
-                       autovector<TransactionID>* txn_ids);
-
-  void UnLockKey(PessimisticTransaction* txn, const std::string& key,
-                 LockMapStripe* stripe, LockMap* lock_map, Env* env);
+  virtual void UnLockKey(PessimisticTransaction* txn, const std::string& key,
+                         LockMapStripe* stripe, LockMap* lock_map, Env* env);
 
+  // Returns true if a deadlock is detected.
+  // Will DecrementWaiters() if a deadlock is detected.
   bool IncrementWaiters(const PessimisticTransaction* txn,
                         const autovector<TransactionID>& wait_ids,
                         const std::string& key, const uint32_t& cf_id,
@@ -217,6 +223,56 @@ class PointLockManager : public LockManager {
                         const autovector<TransactionID>& wait_ids);
   void DecrementWaitersImpl(const PessimisticTransaction* txn,
                             const autovector<TransactionID>& wait_ids);
+
+ private:
+  Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+                       const std::string& key, Env* env,
+                       const LockInfo& lock_info, uint64_t* wait_time,
+                       autovector<TransactionID>* txn_ids);
+};
+
+class PerKeyPointLockManager : public PointLockManager {
+ public:
+  PerKeyPointLockManager(PessimisticTransactionDB* db,
+                         const TransactionDBOptions& opt);
+  // No copying allowed
+  PerKeyPointLockManager(const PerKeyPointLockManager&) = delete;
+  PerKeyPointLockManager& operator=(const PerKeyPointLockManager&) = delete;
+  // No move allowed
+  PerKeyPointLockManager(PerKeyPointLockManager&&) = delete;
+  PerKeyPointLockManager& operator=(PerKeyPointLockManager&&) = delete;
+
+  ~PerKeyPointLockManager() override {}
+
+  void UnLock(PessimisticTransaction* txn, const LockTracker& tracker,
+              Env* env) override;
+  void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+              const std::string& key, Env* env) override;
+  void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+              const Endpoint& start, const Endpoint& end, Env* env) override;
+
+  void UnLockKey(PessimisticTransaction* txn, const std::string& key,
+                 LockMapStripe* stripe, LockMap* lock_map, Env* env) override;
+
+ protected:
+  Status AcquireWithTimeout(PessimisticTransaction* txn, LockMap* lock_map,
+                            LockMapStripe* stripe, uint32_t column_family_id,
+                            const std::string& key, Env* env, int64_t timeout,
+                            int64_t deadlock_timeout_us,
+                            const LockInfo& lock_info) override;
+
+ private:
+  Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+                       const std::string& key, Env* env,
+                       const LockInfo& txn_lock_info, uint64_t* wait_time,
+                       autovector<TransactionID>* txn_ids,
+                       LockInfo** lock_info_ptr, bool* isUpgrade, bool fifo);
+
+  int64_t CalculateWaitEndTime(int64_t expire_time_hint, int64_t end_time);
+
+  Status FillWaitIds(LockInfo& lock_info, const LockInfo& txn_lock_info,
+                     autovector<TransactionID>* wait_ids, bool& isUpgrade,
+                     TransactionID& my_txn_id, const std::string& key);
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/point/point_lock_manager_stress_test.cc b/utilities/transactions/lock/point/point_lock_manager_stress_test.cc
new file mode 100644
index 000000000000..c15a3c04c732
--- /dev/null
+++ b/utilities/transactions/lock/point/point_lock_manager_stress_test.cc
@@ -0,0 +1,103 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/transactions/lock/point/point_lock_manager_test.h"
+#include "utilities/transactions/lock/point/point_lock_validation_test_runner.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct PointLockCorrectnessCheckTestParam {
+  bool is_per_key_point_lock_manager;
+  uint32_t thread_count;
+  uint32_t key_count;
+  uint32_t max_num_keys_to_lock_per_txn;
+  uint32_t execution_time_sec;
+  LockTypeToTest lock_type;
+  int64_t lock_timeout_us;
+  int64_t lock_expiration_us;
+  bool allow_non_deadlock_error;
+  // to simulate some useful work
+  uint32_t max_sleep_after_lock_acquisition_ms;
+};
+
+class PointLockCorrectnessCheckTest
+    : public PointLockManagerTest,
+      public testing::WithParamInterface<PointLockCorrectnessCheckTestParam> {
+ public:
+  void SetUp() override {
+    init();
+    auto const& param = GetParam();
+    auto per_key_lock_manager = param.is_per_key_point_lock_manager;
+    if (per_key_lock_manager) {
+      locker_ = std::make_shared<PerKeyPointLockManager>(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_);
+    } else {
+      locker_ = std::make_shared<PointLockManager>(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_);
+    }
+
+    txn_opt_.deadlock_detect = true;
+    txn_opt_.lock_timeout = param.lock_timeout_us;
+    txn_opt_.expiration = param.lock_expiration_us;
+  }
+
+ protected:
+  TransactionOptions txn_opt_;
+};
+
+TEST_P(PointLockCorrectnessCheckTest, LockCorrectnessValidation) {
+  auto const& param = GetParam();
+  PointLockValidationTestRunner test_runner(
+      env_, txndb_opt_, locker_, db_, txn_opt_, param.thread_count,
+      param.key_count, param.max_num_keys_to_lock_per_txn,
+      param.execution_time_sec, static_cast<LockTypeToTest>(param.lock_type),
+      param.allow_non_deadlock_error,
+      param.max_sleep_after_lock_acquisition_ms);
+  test_runner.run();
+}
+
+constexpr auto X_S_LOCK = LockTypeToTest::EXCLUSIVE_AND_SHARED;
+constexpr auto X_LOCK = LockTypeToTest::EXCLUSIVE_ONLY;
+constexpr auto S_LOCK = LockTypeToTest::SHARED_ONLY;
+
+INSTANTIATE_TEST_CASE_P(
+    PointLockCorrectnessCheckTestSuite, PointLockCorrectnessCheckTest,
+    ::testing::ValuesIn(std::vector<PointLockCorrectnessCheckTestParam>{
+        // 2 second timeout and no expiration simulates myrocks default
+        // configuration
+        {true, 16, 16, 8, 10, X_S_LOCK, 2000, -1, true, 0},
+        {false, 16, 16, 8, 10, X_S_LOCK, 2000, -1, true, 0},
+        {true, 16, 16, 8, 10, X_LOCK, 2000, -1, true, 0},
+        {false, 16, 16, 8, 10, X_LOCK, 2000, -1, true, 0},
+        {true, 16, 16, 8, 10, S_LOCK, 2000, -1, true, 0},
+        {false, 16, 16, 8, 10, S_LOCK, 2000, -1, true, 0},
+        // short timeout and expiration to test lock stealing
+        {true, 16, 16, 8, 10, X_S_LOCK, 10, 10, true, 10},
+        {false, 16, 16, 8, 10, X_S_LOCK, 10, 10, true, 10},
+        {true, 16, 16, 8, 10, X_LOCK, 10, 10, true, 10},
+        {false, 16, 16, 8, 10, X_LOCK, 10, 10, true, 10},
+        {true, 16, 16, 8, 10, S_LOCK, 10, 10, true, 10},
+        {false, 16, 16, 8, 10, S_LOCK, 10, 10, true, 10},
+        // long timeout and expiration to test deadlock detection without
+        // timeout
+        {true, 16, 16, 8, 10, X_S_LOCK, 100000, 100000, false, 0},
+        {false, 16, 16, 8, 10, X_S_LOCK, 100000, 100000, false, 0},
+        {true, 16, 16, 8, 10, X_LOCK, 100000, 100000, false, 0},
+        {false, 16, 16, 8, 10, X_LOCK, 100000, 100000, false, 0},
+        {true, 16, 16, 8, 10, S_LOCK, 100000, 100000, false, 0},
+        {false, 16, 16, 8, 10, S_LOCK, 100000, 100000, false, 0},
+        // Low lock contention
+        {true, 4, 1024 * 1024, 2, 10, S_LOCK, 100000, 100000, false, 0},
+        {false, 4, 1024 * 1024, 2, 10, S_LOCK, 100000, 100000, false, 0},
+    }));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/utilities/transactions/lock/point/point_lock_manager_test.cc b/utilities/transactions/lock/point/point_lock_manager_test.cc
index 0ed8cd67fe2e..5f7c789282fa 100644
--- a/utilities/transactions/lock/point/point_lock_manager_test.cc
+++ b/utilities/transactions/lock/point/point_lock_manager_test.cc
@@ -5,11 +5,49 @@
 
 #include "utilities/transactions/lock/point/point_lock_manager_test.h"
 
+#include "utilities/transactions/lock/point/any_lock_manager_test.h"
+
 namespace ROCKSDB_NAMESPACE {
 
+struct SpotLockManagerTestParam {
+  bool use_per_key_point_lock_manager;
+  int deadlock_timeout_us;
+};
+
+// Define operator<< for SpotLockManagerTestParam to stop valgrind from
+// complaining uinitialized value when printing SpotLockManagerTestParam.
+std::ostream& operator<<(std::ostream& os,
+                         const SpotLockManagerTestParam& param) {
+  os << "use_per_key_point_lock_manager: "
+     << param.use_per_key_point_lock_manager
+     << ", deadlock_timeout_us: " << param.deadlock_timeout_us;
+  return os;
+}
+
+// including test for both PointLockManager and PerKeyPointLockManager
+class SpotLockManagerTest
+    : public PointLockManagerTest,
+      public testing::WithParamInterface<SpotLockManagerTestParam> {
+ public:
+  void SetUp() override {
+    init();
+    // If a custom setup function was provided, use it. Otherwise, use what we
+    // have inherited.
+    auto param = GetParam();
+    if (param.use_per_key_point_lock_manager) {
+      locker_.reset(new PerKeyPointLockManager(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_));
+    } else {
+      locker_.reset(new PointLockManager(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_));
+    }
+    deadlock_timeout_us = param.deadlock_timeout_us;
+  }
+};
+
 // This test is not applicable for Range Lock manager as Range Lock Manager
 // operates on Column Families, not their ids.
-TEST_F(PointLockManagerTest, LockNonExistingColumnFamily) {
+TEST_P(SpotLockManagerTest, LockNonExistingColumnFamily) {
   MockColumnFamilyHandle cf(1024);
   locker_->RemoveColumnFamily(&cf);
   auto txn = NewTxn();
@@ -19,7 +57,7 @@ TEST_F(PointLockManagerTest, LockNonExistingColumnFamily) {
   delete txn;
 }
 
-TEST_F(PointLockManagerTest, LockStatus) {
+TEST_P(SpotLockManagerTest, LockStatus) {
   MockColumnFamilyHandle cf1(1024), cf2(2048);
   locker_->AddColumnFamily(&cf1);
   locker_->AddColumnFamily(&cf2);
@@ -61,7 +99,7 @@ TEST_F(PointLockManagerTest, LockStatus) {
   delete txn2;
 }
 
-TEST_F(PointLockManagerTest, UnlockExclusive) {
+TEST_P(SpotLockManagerTest, UnlockExclusive) {
   MockColumnFamilyHandle cf(1);
   locker_->AddColumnFamily(&cf);
 
@@ -79,7 +117,7 @@ TEST_F(PointLockManagerTest, UnlockExclusive) {
   delete txn2;
 }
 
-TEST_F(PointLockManagerTest, UnlockShared) {
+TEST_P(SpotLockManagerTest, UnlockShared) {
   MockColumnFamilyHandle cf(1);
   locker_->AddColumnFamily(&cf);
 
@@ -100,7 +138,7 @@ TEST_F(PointLockManagerTest, UnlockShared) {
 // This test doesn't work with Range Lock Manager, because Range Lock Manager
 // doesn't support deadlock_detect_depth.
 
-TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
+TEST_P(SpotLockManagerTest, DeadlockDepthExceeded) {
   // Tests that when detecting deadlock, if the detection depth is exceeded,
   // it's also viewed as deadlock.
   MockColumnFamilyHandle cf(1);
@@ -108,7 +146,7 @@ TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
   TransactionOptions txn_opt;
   txn_opt.deadlock_detect = true;
   txn_opt.deadlock_detect_depth = 1;
-  txn_opt.lock_timeout = 1000000;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
   auto txn1 = NewTxn(txn_opt);
   auto txn2 = NewTxn(txn_opt);
   auto txn3 = NewTxn(txn_opt);
@@ -124,7 +162,8 @@ TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
   // it must have another txn waiting on it, which is txn4 in this case.
   ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
 
-  port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+  port::Thread t1;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t1, [&]() {
     ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true));
     // block because txn1 is holding a lock on k1.
     ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
@@ -132,7 +171,8 @@ TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
 
   ASSERT_OK(locker_->TryLock(txn3, 1, "k3", env_, true));
 
-  port::Thread t2 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+  port::Thread t2;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t2, [&]() {
     // block because txn3 is holding a lock on k1.
     ASSERT_OK(locker_->TryLock(txn4, 1, "k3", env_, true));
   });
@@ -150,15 +190,1242 @@ TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
   t1.join();
   t2.join();
 
+  locker_->UnLock(txn2, 1, "k2", env_);
+  locker_->UnLock(txn2, 1, "k1", env_);
+  locker_->UnLock(txn4, 1, "k3", env_);
+
   delete txn4;
   delete txn3;
   delete txn2;
   delete txn1;
 }
 
+TEST_P(SpotLockManagerTest, PrioritizedLockUpgradeWithExclusiveLock) {
+  // Tests that a lock upgrade request is prioritized over other lock requests.
+
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires exclusive lock on k1.
+  // txn1 acquires exclusive locks k1 successfully
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+
+  // txn2 tries to lock k1 exclusively, will be blocked.
+  port::Thread t;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t, [this, &txn2]() {
+    // block because txn1 is holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+  });
+
+  // verify lock upgrade successfully
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  // unlock txn1, so txn2 could proceed
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  // Cleanup
+  t.join();
+
+  // Cleanup
+  locker_->UnLock(txn2, 1, "k1", env_);
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest,
+       PrioritizedLockUpgradeWithExclusiveLockAndSharedLock) {
+  // Tests that lock upgrade is prioritized when mixed with shared and exclusive
+  // locks requests
+
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires shared lock on k1.
+  // txn3 acquires exclusive lock on k1.
+  // txn1 acquires exclusive locks k1 <- request granted after txn2 release the
+  // lock
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+  auto txn3 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+
+  // txn3 tries to lock k1 exclusively, will be blocked.
+  port::Thread txn3_thread;
+  BlockUntilWaitingTxn(wait_sync_point_name_, txn3_thread, [this, &txn3]() {
+    // block because txn1 and txn2 are holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, true));
+  });
+  // Verify txn3 is blocked
+  ASSERT_TRUE(txn3_thread.joinable());
+
+  // txn1 tries to lock k1 exclusively, will be blocked.
+  port::Thread txn1_thread;
+  BlockUntilWaitingTxn(wait_sync_point_name_, txn1_thread, [this, &txn1]() {
+    // block because txn1 and txn2 are holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  });
+  // Verify txn1 is blocked
+  ASSERT_TRUE(txn1_thread.joinable());
+
+  // Unlock txn2, so txn1 could proceed
+  locker_->UnLock(txn2, 1, "k1", env_);
+  txn1_thread.join();
+
+  // Unlock txn1, so txn3 could proceed
+  locker_->UnLock(txn1, 1, "k1", env_);
+  txn3_thread.join();
+
+  // Cleanup
+  locker_->UnLock(txn3, 1, "k1", env_);
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest, Deadlock_MultipleUpgrade) {
+  // Tests that deadlock can be detected for shared locks and exclusive locks
+  // mixed Deadlock scenario:
+
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires shared lock on k1.
+  // txn1 acquires exclusive locks k1
+  // txn2 acquires exclusive locks k1 <- dead lock detected
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+
+  // txn1 tries to lock k1 exclusively, will be blocked.
+  port::Thread t;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t, [this, &txn1]() {
+    // block because txn2 is holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  });
+
+  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+  ASSERT_TRUE(s.IsBusy());
+  ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
+
+  std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
+  ASSERT_EQ(deadlock_paths.size(), 1u);
+  ASSERT_FALSE(deadlock_paths[0].limit_exceeded);
+
+  std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path;
+  ASSERT_EQ(deadlocks.size(), 2u);
+
+  ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID());
+  ASSERT_EQ(deadlocks[0].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[0].m_exclusive);
+  ASSERT_EQ(deadlocks[0].m_waiting_key, "k1");
+
+  ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID());
+  ASSERT_EQ(deadlocks[1].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[1].m_exclusive);
+  ASSERT_EQ(deadlocks[1].m_waiting_key, "k1");
+
+  locker_->UnLock(txn2, 1, "k1", env_);
+  t.join();
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k1", env_);
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest, Deadlock_MultipleUpgradeInterleaveExclusive) {
+  // Tests that deadlock can be detected for shared locks and exclusive locks
+  // mixed Deadlock scenario:
+
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires shared lock on k1.
+  // txn3 acquires exclusive lock on k1.
+  // txn1 acquires exclusive locks k1 <- request granted after txn2 release the
+  // lock.
+  // txn2 acquires exclusive locks k1 <- dead lock detected
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+  auto txn3 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+
+  // txn3 tries to lock k1 exclusively, will be blocked.
+  port::Thread txn3_thread;
+  BlockUntilWaitingTxn(wait_sync_point_name_, txn3_thread, [this, &txn3]() {
+    // block because txn1 and txn2 are holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, true));
+  });
+  // Verify txn3 is blocked
+  ASSERT_TRUE(txn3_thread.joinable());
+
+  // txn1 tries to lock k1 exclusively, will be blocked.
+  port::Thread txn1_thread;
+  BlockUntilWaitingTxn(wait_sync_point_name_, txn1_thread, [this, &txn1]() {
+    // block because txn1 and txn2 are holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  });
+  // Verify txn1 is blocked
+  ASSERT_TRUE(txn1_thread.joinable());
+
+  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+  ASSERT_TRUE(s.IsBusy());
+  ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
+
+  std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
+  ASSERT_EQ(deadlock_paths.size(), 1u);
+  ASSERT_FALSE(deadlock_paths[0].limit_exceeded);
+
+  std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path;
+  ASSERT_EQ(deadlocks.size(), 2u);
+
+  ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID());
+  ASSERT_EQ(deadlocks[0].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[0].m_exclusive);
+  ASSERT_EQ(deadlocks[0].m_waiting_key, "k1");
+
+  ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID());
+  ASSERT_EQ(deadlocks[1].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[1].m_exclusive);
+  ASSERT_EQ(deadlocks[1].m_waiting_key, "k1");
+
+  // Unlock txn2, so txn1 could proceed
+  locker_->UnLock(txn2, 1, "k1", env_);
+  txn1_thread.join();
+
+  // Unlock txn1, so txn3 could proceed
+  locker_->UnLock(txn1, 1, "k1", env_);
+  txn3_thread.join();
+
+  // Cleanup
+  locker_->UnLock(txn3, 1, "k1", env_);
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+class PerKeyPointLockManagerTest : public PointLockManagerTest {
+ public:
+  void SetUp() override {
+    init();
+    cf_ = std::make_unique<MockColumnFamilyHandle>(1);
+    txn_opt_.deadlock_detect = true;
+    // by default use long timeout and disable expiration
+    txn_opt_.lock_timeout = kLongTxnTimeoutMs;
+    txn_opt_.expiration = -1;
+
+    // CAUTION: This test creates a separate lock manager object (right, NOT
+    // the one that the TransactionDB is using!), and runs tests on it.
+    locker_.reset(new PerKeyPointLockManager(
+        static_cast<PessimisticTransactionDB*>(db_), txndb_opt_));
+    locker_->AddColumnFamily(cf_.get());
+  }
+
+  TransactionOptions txn_opt_;
+  std::unique_ptr<MockColumnFamilyHandle> cf_;
+};
+
+TEST_F(PerKeyPointLockManagerTest, LockEfficiency) {
+  // Create multiple transactions, each acquire exclusive lock on the same key
+  std::vector<PessimisticTransaction*> txns;
+  std::vector<port::Thread> blockingThreads;
+
+  // Count the total number of wait sync point calls
+  std::atomic_int wait_sync_point_times = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_,
+      [&wait_sync_point_times](void* /*arg*/) { wait_sync_point_times++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr auto num_of_txn = 10;
+  // create 10 transactions, each of them try to acquire exclusive lock on the
+  // same key
+  for (int i = 0; i < num_of_txn; i++) {
+    auto txn = NewTxn(txn_opt_);
+    txns.push_back(txn);
+
+    if (i == 0) {
+      // txn0 acquires the lock, so the rest of the transactions could block
+      ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, true));
+    } else {
+      blockingThreads.emplace_back([this, txn]() {
+        // block because first txn is holding an exclusive lock on k1.
+        ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, true));
+      });
+    }
+
+    // wait for transaction i to be blocked
+    while (wait_sync_point_times.load() < i) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+  }
+
+  // unlock the key, so next transaction could take the lock.
+  locker_->UnLock(txns[0], 1, "k1", env_);
+
+  auto num_of_blocking_thread = num_of_txn - 1;
+
+  for (int i = 0; i < num_of_blocking_thread; i++) {
+    // validate the thread is finished
+    blockingThreads[i].join();
+    auto num_of_threads_completed = i + 1;
+    for (int j = 0; j < num_of_blocking_thread; j++) {
+      if (j < num_of_threads_completed) {
+        // validate the thread is no longer joinable
+        ASSERT_FALSE(blockingThreads[j].joinable());
+      } else {
+        // validate the rest of the threads are still joinable
+        ASSERT_TRUE(blockingThreads[j].joinable());
+      }
+    }
+    // unlock the key, so next transaction could take the lock.
+    locker_->UnLock(txns[i + 1], 1, "k1", env_);
+  }
+
+  ASSERT_EQ(wait_sync_point_times.load(), num_of_blocking_thread);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  for (int i = 0; i < num_of_txn; i++) {
+    delete txns[num_of_txn - i - 1];
+  }
+}
+
+TEST_F(PerKeyPointLockManagerTest, LockFairness) {
+  // Create multiple transactions requesting locks on the same key, validate
+  // that they are executed in FIFO order
+
+  // txn0 acquires exclusive lock on k1.
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires shared lock on k1.
+  // txn3 acquires exclusive lock on k1.
+  // txn4 acquires shared lock on k1.
+  // txn5 acquires exclusive lock on k1.
+  // txn6 acquires exclusive lock on k1.
+  // txn7 acquires shared lock on k1.
+  // txn8 acquires shared lock on k1.
+  // txn9 acquires exclusive lock on k1.
+
+  std::vector<PessimisticTransaction*> txns;
+  std::vector<port::Thread> blockingThreads;
+
+  // Count the total number of wait sync point calls
+  std::atomic_int wait_sync_point_times = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_,
+      [&wait_sync_point_times](void* /*arg*/) { wait_sync_point_times++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr auto num_of_txn = 10;
+  std::vector<bool> txn_lock_types = {true, false, false, true,  false,
+                                      true, true,  false, false, true};
+  // create 10 transactions, each of them try to acquire exclusive lock on the
+  // same key
+  for (int i = 0; i < num_of_txn; i++) {
+    auto txn = NewTxn(txn_opt_);
+    txns.push_back(txn);
+
+    if (i == 0) {
+      // txn0 acquires the lock, so the rest of the transactions would block
+      ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, txn_lock_types[0]));
+    } else {
+      blockingThreads.emplace_back([this, txn, type = txn_lock_types[i]]() {
+        ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, type));
+      });
+    }
+
+    // wait for transaction i to be blocked
+    while (wait_sync_point_times.load() < i) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+  }
+
+  auto num_of_blocking_thread = num_of_txn - 1;
+
+  auto thread_idx = 0;
+  auto txn_idx = 0;
+
+  auto unlockTxn = [&]() {
+    // unlock the key in transaction.
+    locker_->UnLock(txns[txn_idx++], 1, "k1", env_);
+  };
+
+  auto validateLockTakenByNextTxn = [&]() {
+    // validate the thread is finished
+    blockingThreads[thread_idx++].join();
+  };
+
+  auto stillWaitingForLock = [&]() {
+    // validate the thread is no longer joinable
+    ASSERT_TRUE(blockingThreads[thread_idx].joinable());
+  };
+
+  // unlock the key, so next group of transactions could take the lock.
+  unlockTxn();
+
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires shared lock on k1.
+  validateLockTakenByNextTxn();
+  validateLockTakenByNextTxn();
+
+  // txn3 acquires exclusive lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // txn4 acquires shared lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // txn5 acquires exclusive lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // txn6 acquires exclusive lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // txn7 acquires shared lock on k1.
+  // txn8 acquires shared lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+  validateLockTakenByNextTxn();
+
+  // txn9 acquires exclusive lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // clean up
+  unlockTxn();
+
+  ASSERT_EQ(wait_sync_point_times.load(), num_of_blocking_thread);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  for (int i = 0; i < num_of_txn; i++) {
+    delete txns[num_of_txn - i - 1];
+  }
+}
+
+TEST_F(PerKeyPointLockManagerTest, FIFO) {
+  // validate S, X, S lock order would be executed in FIFO order
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires exclusive lock on k1.
+  // txn3 acquires shared lock on k1.
+
+  std::vector<PessimisticTransaction*> txns;
+  std::vector<port::Thread> blockingThreads;
+
+  // Count the total number of wait sync point calls
+  std::atomic_int wait_sync_point_times = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_,
+      [&wait_sync_point_times](void* /*arg*/) { wait_sync_point_times++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr auto num_of_txn = 3;
+  std::vector<bool> txn_lock_types = {false, true, false};
+  // create 3 transactions, each of them try to acquire exclusive lock on the
+  // same key
+  for (int i = 0; i < num_of_txn; i++) {
+    auto txn = NewTxn(txn_opt_);
+    txns.push_back(txn);
+
+    if (i == 0) {
+      // txn0 acquires the lock, so the rest of the transactions would block
+      ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, txn_lock_types[0]));
+    } else {
+      blockingThreads.emplace_back([this, txn, type = txn_lock_types[i]]() {
+        ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, type));
+      });
+    }
+
+    // wait for transaction i to be blocked
+    while (wait_sync_point_times.load() < i) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+  }
+
+  auto num_of_blocking_thread = num_of_txn - 1;
+
+  auto thread_idx = 0;
+  auto txn_idx = 0;
+
+  auto unlockTxn = [&]() {
+    // unlock the key in transaction.
+    locker_->UnLock(txns[txn_idx++], 1, "k1", env_);
+  };
+
+  auto validateLockTakenByNextTxn = [&]() {
+    // validate the thread is finished
+    blockingThreads[thread_idx++].join();
+  };
+
+  auto stillWaitingForLock = [&]() {
+    // validate the thread is no longer joinable
+    ASSERT_TRUE(blockingThreads[thread_idx].joinable());
+  };
+
+  // unlock the key, so next group of transactions could take the lock.
+  stillWaitingForLock();
+  unlockTxn();
+
+  // txn1 acquires exclusive lock on k1.
+  validateLockTakenByNextTxn();
+
+  // txn2 acquires shared lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // clean up
+  unlockTxn();
+
+  ASSERT_EQ(wait_sync_point_times.load(), num_of_blocking_thread);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  for (int i = 0; i < num_of_txn; i++) {
+    delete txns[num_of_txn - i - 1];
+  }
+}
+
+TEST_P(SpotLockManagerTest, LockDownGradeWithOtherLockRequests) {
+  // Test lock down grade always succeeds, even if there are other lock requests
+  // waiting for the same lock.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  for (bool exclusive : {true, false}) {
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+    port::Thread t;
+    BlockUntilWaitingTxn(wait_sync_point_name_, t, [this, &txn2, exclusive]() {
+      // block because txn1 is holding a exclusive lock on k1.
+      ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, exclusive));
+    });
+
+    // txn1 downgrades the lock to shared lock, so txn2 could proceed
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+
+    locker_->UnLock(txn1, 1, "k1", env_);
+    t.join();
+    locker_->UnLock(txn2, 1, "k1", env_);
+  }
+
+  // clean up
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest, LockTimeout) {
+  // Test lock timeout
+  // txn1 acquires an exclusive lock on k1 successfully.
+  // txn2 try to acquire a lock on k1, but timedout.
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kShortTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  for (bool exclusive : {true, false}) {
+    auto ret = locker_->TryLock(txn2, 1, "k1", env_, exclusive);
+    ASSERT_TRUE(ret.IsTimedOut());
+  }
+
+  // clean up
+  locker_->UnLock(txn1, 1, "k1", env_);
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest, ExpiredLockStolenAfterTimeout) {
+  // validate an expired lock can be stolen by another transaction that timed
+  // out on the lock.
+  // txn1 acquires an exclusive lock on k1 successfully with a short expiration
+  // time.
+  // txn2 try to acquire a shared lock on k1 with timeout that is slightly
+  // longer than the txn1 expiration.
+  // Validate txn2 will take the lock.
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.expiration = 1000;
+  txn_opt.lock_timeout = 1000 * 2;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  port::Thread t1;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t1, [this, &txn2]() {
+    // block because txn1 is holding an exclusive lock on k1.
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+  });
+
+  t1.join();
+
+  // clean up
+  locker_->UnLock(txn2, 1, "k1", env_);
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  delete txn2;
+  delete txn1;
+}
+
+// Try to block until transaction enters waiting state.
+// However due to timing, it could fail, so return true if succeeded, false
+// otherwise.
+bool TryBlockUntilWaitingTxn(const char* sync_point_name, port::Thread& t,
+                             std::function<void()> function) {
+  std::atomic<bool> reached(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      sync_point_name, [&](void* /*arg*/) { reached.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // As the lifetime of the complete variable could go beyond the scope of this
+  // function, so we wrap it in a shared_ptr, and copy it into the lambda
+  std::shared_ptr<std::atomic<bool>> complete =
+      std::make_shared<std::atomic<bool>>(false);
+  t = port::Thread([complete, &function]() {
+    function();
+    complete->store(true);
+  });
+
+  auto ret = false;
+
+  while (true) {
+    if (complete->load()) {
+      // function completed, before sync point was reached, return false
+      t.join();
+      ret = false;
+      break;
+    }
+    if (reached.load()) {
+      // sync point was reached before function completed, return true
+      ret = true;
+      break;
+    }
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  return ret;
+}
+
+TEST_F(PerKeyPointLockManagerTest, LockStealAfterExpirationExclusive) {
+  // There are multiple transactions waiting for the same lock.
+  // txn1 acquires an exclusive lock on k1 successfully with a short expiration
+  // time.
+  // txn2 try to acquire an exclusive lock on k1, before expiration time,
+  // so it is blocked and waits for txn1 lock expired.
+  // txn3 try to acquire an exclusive lock on k1 after txn1 lock expires, FIFO
+  // order is respected.
+  // txn2 is woken up and takes the lock. unlock txn2, txn3 should proceed.
+
+  txn_opt_.expiration = 1000;
+  auto txn1 = NewTxn(txn_opt_);
+  txn_opt_.expiration = -1;
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+
+  port::Thread t1;
+  auto retry_times = 10;
+
+  // Use a loop to reduce test flakiness.
+  // that the test is flaky because the txn2 thread start could be delayed until
+  // txn1 lock expired. In that case, txn2 will not enter into wait state, which
+  // will defeat the test purpose. Use a loop to retry a few times, until it is
+  // able to enter into wait state.
+  while (retry_times--) {
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+    if (TryBlockUntilWaitingTxn(wait_sync_point_name_, t1, [this, &txn2]() {
+          // block because txn1 is holding a shared lock on k1.
+          ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+        })) {
+      break;
+    }
+    // failed, retry again
+    locker_->UnLock(txn1, 1, "k1", env_);
+    locker_->UnLock(txn2, 1, "k1", env_);
+  }
+  // make sure txn2 is able to reach the wait state before proceed
+  ASSERT_GT(retry_times, 0);
+
+  // txn3 try to acquire an exclusive lock on k1, FIFO order is respected.
+  port::Thread t2;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t2, [this, &txn3]() {
+    // block because txn1 is holding an exclusive lock on k1.
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, true));
+  });
+
+  // validate txn2 is woken up and takes the lock
+  t1.join();
+
+  // unlock txn2, txn3 should proceed
+  locker_->UnLock(txn2, 1, "k1", env_);
+  t2.join();
+
+  // clean up
+  locker_->UnLock(txn3, 1, "k1", env_);
+
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, LockStealAfterExpirationShared) {
+  // There are multiple transactions waiting for the same lock.
+  // txn1 acquires a shared lock on k1 successfully with a short expiration
+  // time.
+  // txn2 try to acquire an exclusive lock on k1, before expiration time,
+  // so it is blocked and waits for txn1 lock expired.
+  // txn3 try to acquire a shared lock on k1 after txn1 lock expires, FIFO
+  // order is respected.
+  // txn2 is woken up and takes the lock. unlock txn2, txn3 should proceed.
+
+  txn_opt_.expiration = 1000;
+  auto txn1 = NewTxn(txn_opt_);
+  txn_opt_.expiration = -1;
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+
+  port::Thread t1;
+  auto retry_times = 10;
+
+  // Use a loop to reduce test flakiness.
+  // that the test is flaky because the txn2 thread start could be delayed until
+  // txn1 lock expired. In that case, txn2 will not enter into wait state, which
+  // will defeat the test purpose. Use a loop to retry a few times, until it is
+  // able to enter into wait state.
+  while (retry_times--) {
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+    if (TryBlockUntilWaitingTxn(wait_sync_point_name_, t1, [this, &txn2]() {
+          // block because txn1 is holding an exclusive lock on k1.
+          ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+        })) {
+      break;
+    }
+    // failed, retry again
+    locker_->UnLock(txn1, 1, "k1", env_);
+    locker_->UnLock(txn2, 1, "k1", env_);
+  }
+  // make sure txn2 is able to reach the wait state before proceed
+  ASSERT_GT(retry_times, 0);
+
+  // txn3 try to acquire an exclusive lock on k1, FIFO order is respected.
+  port::Thread t2;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t2, [this, &txn3]() {
+    // block because txn1 is holding an exclusive lock on k1.
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, false));
+  });
+
+  // validate txn2 is woken up and takes the lock
+  t1.join();
+
+  // unlock txn2, txn3 should proceed
+  locker_->UnLock(txn2, 1, "k1", env_);
+  t2.join();
+
+  // clean up
+  locker_->UnLock(txn3, 1, "k1", env_);
+
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, DeadLockOnWaiter) {
+  // Txn1 acquires exclusive lock on k1
+  // Txn3 acquires shared lock on k2
+  // Txn2 tries to acquire exclusive lock on k1, waiting in the waiter queue.
+  // Txn3 tries to acquire exclusive lock on k1, waiting in the waiter queue.
+  // Txn3 depends on both Txn1 and Txn2. Txn1 unlocks k1.
+  // Txn2 takes the lock k1, and tries to acquire lock k2.
+  // Now Txn2 depends on Txn3.
+  // Deadlock is detected, and Txn2 is aborted.
+
+  auto txn1 = NewTxn(txn_opt_);
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  ASSERT_OK(locker_->TryLock(txn3, 1, "k2", env_, false));
+
+  port::Thread t1;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t1, [this, &txn2]() {
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+    auto s = locker_->TryLock(txn2, 1, "k2", env_, true);
+    ASSERT_TRUE(s.IsDeadlock());
+  });
+
+  port::Thread t2;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t2, [this, &txn3]() {
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, true));
+  });
+
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  t1.join();
+
+  locker_->UnLock(txn2, 1, "k1", env_);
+  t2.join();
+
+  // clean up
+  locker_->UnLock(txn3, 1, "k1", env_);
+  locker_->UnLock(txn3, 1, "k2", env_);
+
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, SharedLockRaceCondition) {
+  // Verify a shared lock race condition is handled properly.
+  // When there are waiters in the queue, and all of them are shared waiters,
+  // and no one has taken the lock and all of them just got woken up and not
+  // yet taken the lock yet. A new shared lock request should be granted
+  // directly, without wait in the queue. If it did, It would not be woken up
+  // until the last shared lock is released.
+
+  // Disable deadlock detection timeout to prevent test flakyness.
+  deadlock_timeout_us = 0;
+  auto txn1 = NewTxn(txn_opt_);
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"PerKeyPointLockManager::AcquireWithTimeout:AfterWokenUp",
+        "PerKeyPointLockManagerTest::SharedLockRaceCondition:"
+        "BeforeNewSharedLockRequest"},
+       {"PerKeyPointLockManagerTest::SharedLockRaceCondition:"
+        "AfterNewSharedLockRequest",
+        "PerKeyPointLockManager::AcquireWithTimeout:BeforeTakeLock"}});
+
+  std::atomic<bool> reached(false);
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_,
+      [&reached](void* /*arg*/) { reached.store(true); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // txn1 acquires an exclusive lock on k1, so that the following shared lock
+  // request would be blocked
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  // txn2 try to acquire a shared lock on k1, and get blocked
+  auto t1 = port::Thread([this, &txn2]() {
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+  });
+
+  while (!reached.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+
+  // unlock txn1, txn2 should be woken up, but txn2 stops on the sync point
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  // Use sync point to simulate the race condition.
+  // txn3 tries to take the lock right after txn2 is woken up, but before it
+  // takes the lock
+  TEST_SYNC_POINT(
+      "PerKeyPointLockManagerTest::SharedLockRaceCondition:"
+      "BeforeNewSharedLockRequest");
+
+  // txn3 try to acquire a shared lock on k1, and get granted immediately
+  ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, false));
+
+  TEST_SYNC_POINT(
+      "PerKeyPointLockManagerTest::SharedLockRaceCondition:"
+      "AfterNewSharedLockRequest");
+
+  // validate txn2 is woken up and takes the lock
+  t1.join();
+
+  // cleanup
+  locker_->UnLock(txn2, 1, "k1", env_);
+  locker_->UnLock(txn3, 1, "k1", env_);
+
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, UpgradeLockRaceCondition) {
+  // Verify an upgrade lock race condition is handled properly.
+  // When a key is locked in exlusive mode, shared lock waiters will be enqueued
+  // as waiters.
+  // When the exclusive lock holder release the lock. The shared lock waiters
+  // are woken up to take the lock. At this point, when a new shared lock
+  // requester comes in, it will take the lock directly without waiting or
+  // queueing. This requester then immediately upgrade the lock to exclusive
+  // lock. This request will be prioritized to the head of the queue.
+  // Meantime, it should also depend on the shared lock waiters which are still
+  // in the queue that are ready to take the lock. Later, when one of the reader
+  // lock want to also upgrade its lock, it will detect a dead lock and abort.
+
+  auto txn1 = NewTxn(txn_opt_);
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"PerKeyPointLockManager::AcquireWithTimeout:AfterWokenUp",
+        "PerKeyPointLockManagerTest::UpgradeLockRaceCondition:"
+        "BeforeNewSharedLockRequest"},
+       {"PerKeyPointLockManagerTest::UpgradeLockRaceCondition:"
+        "AfterNewSharedLockRequest",
+        "PerKeyPointLockManager::AcquireWithTimeout:BeforeTakeLock"}});
+
+  std::atomic<bool> reached(false);
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_,
+      [&reached](void* /*arg*/) { reached.store(true); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // txn1 acquires an exclusive lock on k1, so that the following shared lock
+  // request would be blocked
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  auto t1 = port::Thread([this, &txn2]() {
+    // txn2 try to acquire a shared lock on k1, and get blocked
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+  });
+
+  while (!reached.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+
+  // unlock txn1, txn2 should be woken up, but txn2 stops on the sync point
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  // Use sync point to simulate the race condition.
+  // txn3 tries to take the lock right after txn2 is woken up, but before it
+  // takes the lock
+  TEST_SYNC_POINT(
+      "PerKeyPointLockManagerTest::UpgradeLockRaceCondition:"
+      "BeforeNewSharedLockRequest");
+
+  // txn3 try to acquire a shared lock on k1, and get granted immediately
+  ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, false));
+
+  // txn3 try to upgrade its lock to exclusive lock and get blocked.
+  reached = false;
+  auto t2 = port::Thread([this, &txn3]() {
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, true));
+  });
+
+  while (!reached.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+
+  TEST_SYNC_POINT(
+      "PerKeyPointLockManagerTest::UpgradeLockRaceCondition:"
+      "AfterNewSharedLockRequest");
+
+  // validate txn2 is woken up and takes the shared lock
+  t1.join();
+
+  // validate txn2 would get deadlock when it try to upgrade its lock to
+  // exclusive
+  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+  ASSERT_TRUE(s.IsDeadlock());
+
+  // cleanup
+  locker_->UnLock(txn2, 1, "k1", env_);
+  t2.join();
+  locker_->UnLock(txn3, 1, "k1", env_);
+
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest, Catch22) {
+  // Benchmark the overhead of one transaction depends on another in a circle
+  // repeatedly
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  txn_opt.expiration = kLongTxnTimeoutMs;
+
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  // use a wait count to count the number of times the lock is waited inside
+  // transaction lock
+  std::atomic_int wait_count(0);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  if (GetParam().use_per_key_point_lock_manager &&
+      GetParam().deadlock_timeout_us != 0) {
+    // Use special sync point when deadlock timeout is enabled, so the test run
+    // faster
+    SyncPoint::GetInstance()->SetCallBack(
+        "PerKeyPointLockManager::AcquireWithTimeout:"
+        "WaitingTxnBeforeDeadLockDetection",
+        [&wait_count](void* /*arg*/) { wait_count++; });
+  } else {
+    // PointLockManager
+    SyncPoint::GetInstance()->SetCallBack(
+        wait_sync_point_name_, [&wait_count](void* /*arg*/) { wait_count++; });
+  }
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // txn1 X lock
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  std::mutex coordinator_mutex;
+  int iteration_count = 10000;
+
+  // txn1 try to lock X lock in a loop
+  auto t1 = port::Thread(
+      [this, &txn1, &wait_count, &coordinator_mutex, &iteration_count]() {
+        while (wait_count.load() < iteration_count) {
+          // spin wait until the other thread enters the lock waiter queue.
+          while (wait_count.load() % 2 == 0);
+          // unlock the lock, so that the other thread can acquire the lock
+          locker_->UnLock(txn1, 1, "k1", env_);
+          {
+            // Use the coordinator mutex to make sure the other thread has been
+            // waked up and acquired the lock, before this thread try to acquire
+            // the lock again.
+            std::scoped_lock<std::mutex> lock(coordinator_mutex);
+            ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+          }
+        }
+        locker_->UnLock(txn1, 1, "k1", env_);
+      });
+
+  // txn2 try to lock X lock in a loop
+  auto t2 = port::Thread(
+      [this, &txn2, &wait_count, &coordinator_mutex, &iteration_count]() {
+        while (wait_count.load() < iteration_count) {
+          {
+            // Use the coordinator mutex to make sure the other thread has been
+            // waked up and acquired the lock, before this thread try to acquire
+            // the lock again.
+            std::scoped_lock<std::mutex> lock(coordinator_mutex);
+            ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+          }
+          // spin wait until the other thread enters the lock waiter queue.
+          while (wait_count.load() % 2 == 1);
+          // unlock the lock, so that the other thread can acquire the lock
+          locker_->UnLock(txn2, 1, "k1", env_);
+        }
+      });
+
+  // clean up
+  t1.join();
+  t2.join();
+
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, LockUpgradeOrdering) {
+  // When lock is upgraded, verify that it will only upgrade its lock after all
+  // the shared lock that are before the first exclusive lock in the lock wait
+  // queue.
+
+  auto txn1 = NewTxn(txn_opt_);
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+  auto txn4 = NewTxn(txn_opt_);
+
+  std::mutex txn4_mutex;
+  std::unique_lock<std::mutex> txn4_lock(txn4_mutex);
+  std::atomic_bool txn4_waked_up(false);
+  std::atomic_int wait_count(0);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_, [&wait_count](void* /*arg*/) { wait_count++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "PerKeyPointLockManager::AcquireWithTimeout:AfterWokenUp",
+      [&txn4, &txn4_mutex, &txn4_waked_up](void* arg) {
+        auto transaction_id = *(static_cast<TransactionID*>(arg));
+        if (transaction_id == txn4->GetID()) {
+          txn4_waked_up.store(true);
+          {
+            // wait for txn4 mutex to be released, so that this thread will be
+            // blocked.
+            std::scoped_lock<std::mutex> lock(txn4_mutex);
+          }
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Txn1 X lock
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  // Txn2,3,4 try S lock
+  port::Thread t1([this, &txn2]() {
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+  });
+  port::Thread t2([this, &txn3]() {
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, false));
+  });
+  port::Thread t3([this, &txn4]() {
+    ASSERT_OK(locker_->TryLock(txn4, 1, "k1", env_, false));
+  });
+
+  // wait for all 3 transactions to enter wait state
+  while (wait_count.load() < 3) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
+
+  // Txn1 unlock
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  // Txn2,3 take S lock
+  t1.join();
+  t2.join();
+
+  // wait for txn4 to be woken up, otherwise txn2 will get deadlock
+  while (!txn4_waked_up.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
+
+  // Txn2 try X lock
+  std::atomic_bool txn2_exclusive_lock_acquired(false);
+  port::Thread t4([this, &txn2, &txn2_exclusive_lock_acquired]() {
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+    txn2_exclusive_lock_acquired.store(true);
+  });
+
+  // wait for txn2 to enter wait state
+  while (wait_count.load() < 4) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
+
+  // Txn3 release S lock
+  locker_->UnLock(txn3, 1, "k1", env_);
+
+  // Validate Txn2 has not acquired the lock yet
+  ASSERT_FALSE(txn2_exclusive_lock_acquired.load());
+
+  // Txn4 take S lock
+  txn4_lock.unlock();
+  t3.join();
+
+  // Txn4 release S lock Txn2 upgraded to X lock Txn2
+  locker_->UnLock(txn4, 1, "k1", env_);
+  t4.join();
+  ASSERT_TRUE(txn2_exclusive_lock_acquired.load());
+
+  // release lock clean up
+  locker_->UnLock(txn2, 1, "k1", env_);
+
+  delete txn4;
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, LockDownGradeRaceCondition) {
+  // When a lock is downgraded, it should notify all the shared waiters in the
+  // queue to take the lock.
+
+  auto txn1 = NewTxn(txn_opt_);
+  auto txn2 = NewTxn(txn_opt_);
+
+  // Txn1 X lock
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  // Txn2 try S lock
+  port::Thread t1;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t1, [this, &txn2]() {
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+  });
+
+  // Txn1 downgrade to S lock
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+
+  // Txn2 take S lock
+  t1.join();
+
+  // clean up
+  locker_->UnLock(txn1, 1, "k1", env_);
+  locker_->UnLock(txn2, 1, "k1", env_);
+
+  delete txn2;
+  delete txn1;
+}
+
+// Run AnyLockManagerTest with PointLockManager
 INSTANTIATE_TEST_CASE_P(PointLockManager, AnyLockManagerTest,
                         ::testing::Values(nullptr));
 
+// Run AnyLockManagerTest with PerKeyPointLockManager
+template <int64_t N>
+void PerKeyPointLockManagerTestSetup(PointLockManagerTest* self) {
+  self->init();
+  self->deadlock_timeout_us = N;
+  self->UsePerKeyPointLockManager();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    PerLockPointLockManager, AnyLockManagerTest,
+    ::testing::Values(PerKeyPointLockManagerTestSetup<0>,
+                      PerKeyPointLockManagerTestSetup<100>,
+                      PerKeyPointLockManagerTestSetup<1000>));
+
+// Run PointLockManagerTest with PerLockPointLockManager and PointLockManager
+INSTANTIATE_TEST_CASE_P(
+    PointLockCorrectnessCheckTestSuite, SpotLockManagerTest,
+    ::testing::ValuesIn(std::vector<SpotLockManagerTestParam>{
+        {true, 0}, {true, 100}, {true, 1000}, {false, 0}}));
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/utilities/transactions/lock/point/point_lock_manager_test.h b/utilities/transactions/lock/point/point_lock_manager_test.h
index 4f0054459c99..0261a7b3b07c 100644
--- a/utilities/transactions/lock/point/point_lock_manager_test.h
+++ b/utilities/transactions/lock/point/point_lock_manager_test.h
@@ -4,321 +4,99 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 
+#pragma once
+
 #include "file/file_util.h"
-#include "port/port.h"
-#include "port/stack_trace.h"
 #include "rocksdb/utilities/transaction_db.h"
 #include "test_util/testharness.h"
-#include "test_util/testutil.h"
 #include "utilities/transactions/lock/point/point_lock_manager.h"
+#include "utilities/transactions/lock/point/point_lock_manager_test_common.h"
 #include "utilities/transactions/pessimistic_transaction_db.h"
-#include "utilities/transactions/transaction_db_mutex_impl.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-class MockColumnFamilyHandle : public ColumnFamilyHandle {
- public:
-  explicit MockColumnFamilyHandle(ColumnFamilyId cf_id) : cf_id_(cf_id) {}
-
-  ~MockColumnFamilyHandle() override {}
-
-  const std::string& GetName() const override { return name_; }
-
-  ColumnFamilyId GetID() const override { return cf_id_; }
-
-  Status GetDescriptor(ColumnFamilyDescriptor*) override {
-    return Status::OK();
-  }
-
-  const Comparator* GetComparator() const override {
-    return BytewiseComparator();
-  }
-
- private:
-  ColumnFamilyId cf_id_;
-  std::string name_ = "MockCF";
-};
-
 class PointLockManagerTest : public testing::Test {
  public:
-  void SetUp() override {
+  void init() {
     env_ = Env::Default();
     db_dir_ = test::PerThreadDBPath("point_lock_manager_test");
     ASSERT_OK(env_->CreateDir(db_dir_));
 
     Options opt;
     opt.create_if_missing = true;
-    TransactionDBOptions txn_opt;
-    txn_opt.transaction_lock_timeout = 0;
+    // Reduce the number of stripes to 4 to increase contention in test
+    txndb_opt_.num_stripes = 4;
+    txndb_opt_.transaction_lock_timeout = 0;
 
-    ASSERT_OK(TransactionDB::Open(opt, txn_opt, db_dir_, &db_));
+    ASSERT_OK(TransactionDB::Open(opt, txndb_opt_, db_dir_, &db_));
+
+    wait_sync_point_name_ = "PointLockManager::AcquireWithTimeout:WaitingTxn";
+  }
 
+  void SetUp() override {
+    init();
     // CAUTION: This test creates a separate lock manager object (right, NOT
     // the one that the TransactionDB is using!), and runs tests on it.
     locker_.reset(new PointLockManager(
-        static_cast<PessimisticTransactionDB*>(db_), txn_opt));
-
-    wait_sync_point_name_ = "PointLockManager::AcquireWithTimeout:WaitingTxn";
+        static_cast<PessimisticTransactionDB*>(db_), txndb_opt_));
   }
 
   void TearDown() override {
+    std::string errmsg;
+    auto no_lock_held = verifyNoLocksHeld(locker_, errmsg);
+    ASSERT_TRUE(no_lock_held) << errmsg;
     delete db_;
     EXPECT_OK(DestroyDir(env_, db_dir_));
   }
 
   PessimisticTransaction* NewTxn(
       TransactionOptions txn_opt = TransactionOptions()) {
+    // override deadlock_timeout_us;
+    txn_opt.deadlock_timeout_us = deadlock_timeout_us;
     Transaction* txn = db_->BeginTransaction(WriteOptions(), txn_opt);
     return static_cast<PessimisticTransaction*>(txn);
   }
 
+  int64_t deadlock_timeout_us = 0;
+
+  void UsePerKeyPointLockManager() {
+    locker_.reset(new PerKeyPointLockManager(
+        static_cast<PessimisticTransactionDB*>(db_), txndb_opt_));
+  }
+
  protected:
   Env* env_;
+  TransactionDBOptions txndb_opt_;
   std::shared_ptr<LockManager> locker_;
   const char* wait_sync_point_name_;
   friend void PointLockManagerTestExternalSetup(PointLockManagerTest*);
 
- private:
   std::string db_dir_;
   TransactionDB* db_;
 };
 
-using init_func_t = void (*)(PointLockManagerTest*);
-
-class AnyLockManagerTest : public PointLockManagerTest,
-                           public testing::WithParamInterface<init_func_t> {
- public:
-  void SetUp() override {
-    // If a custom setup function was provided, use it. Otherwise, use what we
-    // have inherited.
-    auto init_func = GetParam();
-    if (init_func)
-      (*init_func)(this);
-    else
-      PointLockManagerTest::SetUp();
-  }
-};
-
-TEST_P(AnyLockManagerTest, ReentrantExclusiveLock) {
-  // Tests that a txn can acquire exclusive lock on the same key repeatedly.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
-
-  // Cleanup
-  locker_->UnLock(txn, 1, "k", env_);
-
-  delete txn;
-}
-
-TEST_P(AnyLockManagerTest, ReentrantSharedLock) {
-  // Tests that a txn can acquire shared lock on the same key repeatedly.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
-
-  // Cleanup
-  locker_->UnLock(txn, 1, "k", env_);
-
-  delete txn;
-}
-
-TEST_P(AnyLockManagerTest, LockUpgrade) {
-  // Tests that a txn can upgrade from a shared lock to an exclusive lock.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
-
-  // Cleanup
-  locker_->UnLock(txn, 1, "k", env_);
-  delete txn;
-}
-
-TEST_P(AnyLockManagerTest, LockDowngrade) {
-  // Tests that a txn can acquire a shared lock after acquiring an exclusive
-  // lock on the same key.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
-
-  // Cleanup
-  locker_->UnLock(txn, 1, "k", env_);
-  delete txn;
-}
-
-TEST_P(AnyLockManagerTest, LockConflict) {
-  // Tests that lock conflicts lead to lock timeout.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn1 = NewTxn();
-  auto txn2 = NewTxn();
-
-  {
-    // exclusive-exclusive conflict.
-    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
-    auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
-    ASSERT_TRUE(s.IsTimedOut());
-  }
-
-  {
-    // exclusive-shared conflict.
-    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true));
-    auto s = locker_->TryLock(txn2, 1, "k2", env_, false);
-    ASSERT_TRUE(s.IsTimedOut());
-  }
-
-  {
-    // shared-exclusive conflict.
-    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, false));
-    auto s = locker_->TryLock(txn2, 1, "k2", env_, true);
-    ASSERT_TRUE(s.IsTimedOut());
-  }
-
-  // Cleanup
-  locker_->UnLock(txn1, 1, "k1", env_);
-  locker_->UnLock(txn1, 1, "k2", env_);
-
-  delete txn1;
-  delete txn2;
-}
-
-port::Thread BlockUntilWaitingTxn(const char* sync_point_name,
-                                  std::function<void()> f) {
+void BlockUntilWaitingTxn(const char* sync_point_name, port::Thread& t,
+                          std::function<void()> f) {
   std::atomic<bool> reached(false);
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
       sync_point_name, [&](void* /*arg*/) { reached.store(true); });
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
 
-  port::Thread t(f);
+  t = port::Thread(f);
 
-  while (!reached.load()) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  // timeout after 30 seconds, so test does not hang forever
+  // 30 seconds should be enough for the test to reach the expected state
+  // without causing too much flakiness
+  for (int i = 0; i < 3000; i++) {
+    if (reached.load()) {
+      break;
+    }
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
   }
+
+  ASSERT_TRUE(reached.load());
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
-
-  return t;
-}
-
-TEST_P(AnyLockManagerTest, SharedLocks) {
-  // Tests that shared locks can be concurrently held by multiple transactions.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn1 = NewTxn();
-  auto txn2 = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
-  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
-
-  // Cleanup
-  locker_->UnLock(txn1, 1, "k", env_);
-  locker_->UnLock(txn2, 1, "k", env_);
-
-  delete txn1;
-  delete txn2;
-}
-
-TEST_P(AnyLockManagerTest, Deadlock) {
-  // Tests that deadlock can be detected.
-  // Deadlock scenario:
-  // txn1 exclusively locks k1, and wants to lock k2;
-  // txn2 exclusively locks k2, and wants to lock k1.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  TransactionOptions txn_opt;
-  txn_opt.deadlock_detect = true;
-  txn_opt.lock_timeout = 1000000;
-  auto txn1 = NewTxn(txn_opt);
-  auto txn2 = NewTxn(txn_opt);
-
-  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
-  ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true));
-
-  // txn1 tries to lock k2, will block forever.
-  port::Thread t = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
-    // block because txn2 is holding a lock on k2.
-    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true));
-  });
-
-  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
-  ASSERT_TRUE(s.IsBusy());
-  ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
-
-  std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
-  ASSERT_EQ(deadlock_paths.size(), 1u);
-  ASSERT_FALSE(deadlock_paths[0].limit_exceeded);
-
-  std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path;
-  ASSERT_EQ(deadlocks.size(), 2u);
-
-  ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID());
-  ASSERT_EQ(deadlocks[0].m_cf_id, 1u);
-  ASSERT_TRUE(deadlocks[0].m_exclusive);
-  ASSERT_EQ(deadlocks[0].m_waiting_key, "k2");
-
-  ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID());
-  ASSERT_EQ(deadlocks[1].m_cf_id, 1u);
-  ASSERT_TRUE(deadlocks[1].m_exclusive);
-  ASSERT_EQ(deadlocks[1].m_waiting_key, "k1");
-
-  locker_->UnLock(txn2, 1, "k2", env_);
-  t.join();
-
-  // Cleanup
-  locker_->UnLock(txn1, 1, "k1", env_);
-  locker_->UnLock(txn1, 1, "k2", env_);
-  delete txn2;
-  delete txn1;
-}
-
-TEST_P(AnyLockManagerTest, GetWaitingTxns_MultipleTxns) {
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-
-  auto txn1 = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
-
-  auto txn2 = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
-
-  auto txn3 = NewTxn();
-  txn3->SetLockTimeout(10000);
-  port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
-    ASSERT_OK(locker_->TryLock(txn3, 1, "k", env_, true));
-    locker_->UnLock(txn3, 1, "k", env_);
-  });
-
-  // Ok, now txn3 is waiting for lock on "k", which is owned by two
-  // transactions. Check that GetWaitingTxns reports this correctly
-  uint32_t wait_cf_id;
-  std::string wait_key;
-  auto waiters = txn3->GetWaitingTxns(&wait_cf_id, &wait_key);
-
-  ASSERT_EQ(wait_cf_id, 1u);
-  ASSERT_EQ(wait_key, "k");
-  ASSERT_EQ(waiters.size(), 2);
-  bool waits_correct =
-      (waiters[0] == txn1->GetID() && waiters[1] == txn2->GetID()) ||
-      (waiters[1] == txn1->GetID() && waiters[0] == txn2->GetID());
-  ASSERT_EQ(waits_correct, true);
-
-  // Release locks so txn3 can proceed with execution
-  locker_->UnLock(txn1, 1, "k", env_);
-  locker_->UnLock(txn2, 1, "k", env_);
-
-  // Wait until txn3 finishes
-  t1.join();
-
-  delete txn1;
-  delete txn2;
-  delete txn3;
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/point/point_lock_manager_test_common.h b/utilities/transactions/lock/point/point_lock_manager_test_common.h
new file mode 100644
index 000000000000..a4cc7dafc135
--- /dev/null
+++ b/utilities/transactions/lock/point/point_lock_manager_test_common.h
@@ -0,0 +1,78 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <sstream>
+
+#include "rocksdb/db.h"
+#include "utilities/transactions/lock/lock_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr auto kLongTxnTimeoutMs = 100000;
+constexpr auto kShortTxnTimeoutMs = 100;
+
+class MockColumnFamilyHandle : public ColumnFamilyHandle {
+ public:
+  explicit MockColumnFamilyHandle(ColumnFamilyId cf_id) : cf_id_(cf_id) {}
+
+  // disable copy and assignment
+  MockColumnFamilyHandle(const MockColumnFamilyHandle&) = delete;
+  MockColumnFamilyHandle& operator=(const MockColumnFamilyHandle&) = delete;
+  // disable move
+  MockColumnFamilyHandle(MockColumnFamilyHandle&&) = delete;
+  MockColumnFamilyHandle& operator=(MockColumnFamilyHandle&&) = delete;
+
+  ~MockColumnFamilyHandle() override {}
+
+  const std::string& GetName() const override { return name_; }
+
+  ColumnFamilyId GetID() const override { return cf_id_; }
+
+  Status GetDescriptor(ColumnFamilyDescriptor*) override {
+    return Status::OK();
+  }
+
+  const Comparator* GetComparator() const override {
+    return BytewiseComparator();
+  }
+
+ private:
+  ColumnFamilyId cf_id_;
+  std::string name_ = "MockCF";
+};
+
+// Verify no lock was held. Return true, if success. False, if there is. Set
+// error message on False.
+bool verifyNoLocksHeld(std::shared_ptr<LockManager>& locker,
+                       std::string& errmsg) {
+  // Validate no lock was held at the end of the test
+  auto lock_status = locker->GetPointLockStatus();
+  // print the lock status for debugging
+  std::stringstream ss;
+  for (auto& s : lock_status) {
+    ss << "id " << s.first;
+    ss << " key " << s.second.key;
+    ss << " type " << (s.second.exclusive ? "exclusive" : "shared");
+    ss << " txn ids [";
+    for (auto& t : s.second.ids) {
+      ss << t << ",";
+    }
+    ss << "]";
+    ss << std::endl;
+  }
+
+  if (!lock_status.empty()) {
+    errmsg = std::to_string(lock_status.size()) +
+             " locks were held at the end. " + ss.str();
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/point/point_lock_validation_test_runner.h b/utilities/transactions/lock/point/point_lock_validation_test_runner.h
new file mode 100644
index 000000000000..00ae526e9c0d
--- /dev/null
+++ b/utilities/transactions/lock/point/point_lock_validation_test_runner.h
@@ -0,0 +1,469 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <cstddef>
+#include <cstdio>
+#include <iostream>
+#include <memory>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/transactions/lock/lock_manager.h"
+#include "utilities/transactions/lock/point/point_lock_manager_test_common.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr bool kDebugLog = false;
+
+// Since this code is executed both with and without gtest, it supports assert
+// with different ways.
+#ifdef ASSERT_TRUE
+#define ASSERT_TRUE_WITH_MSG(expr, errmsg) ASSERT_TRUE(expr) << (errmsg)
+#else
+#define ASSERT_TRUE_WITH_MSG(expr, errmsg)                             \
+  if (!(expr)) {                                                       \
+    std::cerr << "Assert true failed with error message: " << (errmsg) \
+              << std::endl;                                            \
+    abort();                                                           \
+  }
+#endif
+
+#ifndef ASSERT_OK
+#define ASSERT_OK(s) \
+  ASSERT_TRUE_WITH_MSG(s.ok(), "Failed with " + s.ToString());
+#endif
+
+#define ASSERT_TRUE_WITH_INFO(X) \
+  ASSERT_TRUE_WITH_MSG(          \
+      (X), " Txn " + std::to_string(txn_id) + " key " + std::to_string(key))
+
+#define ASSERT_EQ_WITH_INFO(X, Y) ASSERT_TRUE_WITH_INFO((X) == (Y))
+
+#define DEBUG_LOG(...)            \
+  if (kDebugLog) {                \
+    fprintf(stderr, __VA_ARGS__); \
+    fflush(stderr);               \
+  }
+
+#define DEBUG_LOG_WITH_PREFIX(format, ...) \
+  DEBUG_LOG("Txn %" PRIu64 " " format, txn_id, ##__VA_ARGS__);
+
+enum class LockTypeToTest : int8_t {
+  EXCLUSIVE_ONLY = 0,
+  SHARED_ONLY = 1,
+  EXCLUSIVE_AND_SHARED = 2,
+};
+
+struct KeyStatus {
+  KeyStatus(uint32_t k, bool ex, int v) : key(k), exclusive(ex), value(v) {}
+  uint32_t key;
+  bool exclusive;
+  int value;
+};
+
+class PointLockValidationTestRunner {
+ public:
+  PointLockValidationTestRunner(
+      Env* env, TransactionDBOptions txndb_opt,
+      std::shared_ptr<LockManager> locker, TransactionDB* db,
+      TransactionOptions txn_opt, uint32_t thd_cnt, uint32_t key_cnt,
+      uint32_t max_num_keys_to_lock_per_txn, uint32_t execution_time_sec,
+      LockTypeToTest lock_type, bool allow_non_deadlock_error,
+      uint32_t max_sleep_after_lock_acquisition_ms,
+      bool enable_per_thread_lock_count_assertion = false)
+      : env_(env),
+        txndb_opt_(std::move(txndb_opt)),
+        locker_(std::move(locker)),
+        db_(db),
+        txn_opt_(std::move(txn_opt)),
+        thread_count_(thd_cnt),
+        key_count_(key_cnt),
+        max_num_keys_to_lock_per_txn_(max_num_keys_to_lock_per_txn),
+        execution_time_sec_(execution_time_sec),
+        lock_type_(lock_type),
+        allow_non_deadlock_error_(allow_non_deadlock_error),
+        max_sleep_after_lock_acquisition_ms_(
+            max_sleep_after_lock_acquisition_ms),
+        enable_per_thread_lock_count_assertion_(
+            enable_per_thread_lock_count_assertion),
+        shutdown_(false) {
+    // Only enable lock status validation when lock expiration/stealing isk
+    // disabled.
+    enable_lock_status_validation_ = txn_opt_.expiration == -1;
+    values_.resize(key_count_, 0);
+    exclusive_lock_status_.resize(key_count_, 0);
+
+    // init counters and values
+    for (size_t i = 0; i < key_count_; i++) {
+      counters_.emplace_back(std::make_unique<std::atomic_int>(0));
+      shared_lock_count_.emplace_back(std::make_unique<std::atomic_int>(0));
+    }
+
+    for (size_t i = 0; i < thread_count_; i++) {
+      num_of_locks_acquired_per_thread_.emplace_back(
+          std::make_unique<std::atomic_int64_t>(0));
+    }
+  }
+
+  // Decide which lock type to acquire
+  // If the key is already locked and only one type of locks to be tested,
+  // return false, so caller could try to lock a different key.
+  // Otherwise, return true.
+  bool DecideLockType(
+      bool& acquire_exclusive_lock, uint32_t key,
+      std::unordered_map<uint32_t, KeyStatus>& locked_key_status,
+      bool& isUpgrade, bool& isDowngrade) {
+    // Decide lock type
+    acquire_exclusive_lock = Random::GetTLSInstance()->OneIn(2);
+
+    // check whether a lock on the same key is already held
+    auto it = locked_key_status.find(key);
+    if (it != locked_key_status.end()) {
+      // a lock on the same key is already held.
+      if (lock_type_ == LockTypeToTest::EXCLUSIVE_AND_SHARED) {
+        // if test both shared and exclusive locks, switch their type
+        if (it->second.exclusive == false) {
+          // If it is a shared lock, upgrade to an exclusive lock
+          acquire_exclusive_lock = true;
+          isUpgrade = true;
+        } else {
+          // If it is an exclusive lock, downgrade to a shared lock
+          acquire_exclusive_lock = false;
+          isDowngrade = true;
+        }
+      } else {
+        // Only one type of lock to test, and the key is already locked,
+        return false;
+      }
+    }
+
+    // This is a new key to lock or the lock type is switched.
+    if (lock_type_ != LockTypeToTest::EXCLUSIVE_AND_SHARED) {
+      // if only one type of locks to be acquired, update its type
+      acquire_exclusive_lock = (lock_type_ == LockTypeToTest::EXCLUSIVE_ONLY);
+    }
+    return true;
+  }
+
+  void run() {
+    // Verify lock guarantee. Exclusive lock provide unique access guarantee.
+    // Shared lock provide shared access guarantee.
+    // Create multiple threads. Each try to grab a lock with random type on
+    // random key.
+
+    // To validate lock exclusive guarantee, each key has a value and a counter
+    // used for tracking the number of exclusive locks have been acquired on it
+    // in each test run across all threads.
+
+    // Every time an exclusive lock is acquired, both the counter and the value
+    // are bumped by 1. The difference between the counter and the value is that
+    // counter is atomic, so it is guaranteed that it would not lose update,
+    // while value is not atomic. Its correctness is only guaranteed by the
+    // exclusiveness provided by the lock manager which is being tested. If the
+    // lock manager does not guarantee exclusiveness, the value would lose
+    // update, and the counter would mismatch with the value, which fails the
+    // test.
+
+    // To validate lock shared guarantee, after a shared lock is acquired, the
+    // counter and value are read and stored in a local variable inside the
+    // thread. Before the lock is released, the local copy is compared against
+    // the counter and value. If they mismatch, it means the shared lock
+    // guaranteed is violated.
+
+    MockColumnFamilyHandle cf(1);
+    locker_->AddColumnFamily(&cf);
+
+    for (uint32_t thd_idx = 0; thd_idx < thread_count_; thd_idx++) {
+      threads_.emplace_back([this, thd_idx]() {
+        auto txn = static_cast<PessimisticTransaction*>(
+            db_->BeginTransaction(WriteOptions(), txn_opt_));
+        auto txn_id = txn->GetID();
+        DEBUG_LOG_WITH_PREFIX("Thd %" PRIu32 " new txn\n", thd_idx);
+        while (!shutdown_) {
+          std::unordered_map<uint32_t, KeyStatus> locked_key_status;
+          auto num_key_to_lock = max_num_keys_to_lock_per_txn_;
+          Status s;
+
+          for (uint32_t j = 0; j < num_key_to_lock; j++) {
+            uint32_t key = 0;
+            key = Random::GetTLSInstance()->Uniform(key_count_);
+            auto key_str = std::to_string(key);
+            bool isUpgrade = false;
+            bool isDowngrade = false;
+            bool exclusive_lock_type;
+
+            if (!DecideLockType(exclusive_lock_type, key, locked_key_status,
+                                isUpgrade, isDowngrade)) {
+              // try a different key
+              j--;
+              continue;
+            }
+
+            if (enable_lock_status_validation_) {
+              if (isDowngrade) {
+                // Before downgrade, validate the lock is in exlusive status
+                // This could not be done after downgrade, as another thread
+                // could take a shared lock and update lock status
+                ASSERT_TRUE_WITH_INFO(exclusive_lock_status_[key]);
+                ASSERT_EQ_WITH_INFO(*shared_lock_count_[key], 0);
+                // for downgrade, update the lock status before acquiring the
+                // lock, as afterwards, it will not have exclusive access to it
+                exclusive_lock_status_[key] = 0;
+              }
+            }
+
+            // try to acquire the lock
+            DEBUG_LOG_WITH_PREFIX("try to acquire lock %" PRIu32 " type %s\n",
+                                  key,
+                                  exclusive_lock_type ? "exclusive" : "shared");
+            s = locker_->TryLock(txn, 1, key_str, env_, exclusive_lock_type);
+
+            if (s.ok()) {
+              DEBUG_LOG_WITH_PREFIX(
+                  "acquired lock %" PRIu32 " type %s\n", key,
+                  exclusive_lock_type ? "exclusive" : "shared");
+
+              auto it = locked_key_status.find(key);
+              if (isUpgrade || isDowngrade) {
+                // If it is either upgrade or downgrade, the key should exist
+                // already.
+                ASSERT_TRUE_WITH_INFO(it != locked_key_status.end());
+              } else {
+                locked_key_status.emplace(
+                    std::piecewise_construct, std::forward_as_tuple(key),
+                    std::forward_as_tuple(key, exclusive_lock_type,
+                                          values_[key]));
+              }
+              // update local lock status
+              if (exclusive_lock_type) {
+                if (isUpgrade) {
+                  it->second.exclusive = true;
+                }
+                num_of_exclusive_locks_acquired_++;
+              } else {
+                if (isDowngrade) {
+                  it->second.exclusive = false;
+                }
+                num_of_shared_locks_acquired_++;
+              }
+              num_of_locks_acquired_++;
+              (*num_of_locks_acquired_per_thread_[thd_idx])++;
+
+              if (enable_lock_status_validation_) {
+                if (exclusive_lock_type) {
+                  // validate the lock is not in exclusive status
+                  ASSERT_TRUE_WITH_INFO(!exclusive_lock_status_[key]);
+                  if (isUpgrade) {
+                    // validate the lock is in shared status and only had one
+                    // shared lock
+                    ASSERT_EQ_WITH_INFO(*shared_lock_count_[key], 1);
+                    shared_lock_count_[key]->fetch_sub(1);
+                  } else {
+                    ASSERT_EQ_WITH_INFO(*shared_lock_count_[key], 0);
+                  }
+                  // update the lock status
+                  exclusive_lock_status_[key] = 1;
+                } else {
+                  shared_lock_count_[key]->fetch_add(1);
+                  ASSERT_TRUE_WITH_INFO(!exclusive_lock_status_[key]);
+                }
+              }
+            } else {
+              if (!allow_non_deadlock_error_) {
+                ASSERT_TRUE_WITH_INFO(s.IsDeadlock());
+              }
+              if (s.IsDeadlock()) {
+                DEBUG_LOG_WITH_PREFIX(
+                    "detected deadlock on key %" PRIu32 ", abort\n", key);
+                num_of_deadlock_detected_++;
+                // for deadlock, release all locks acquired
+                break;
+              } else {
+                // for other errors, try again
+                DEBUG_LOG_WITH_PREFIX("failed to acquire lock on key %" PRIu32
+                                      ", due to "
+                                      "%s, "
+                                      "abort\n",
+                                      key, s.ToString().c_str());
+              }
+            }
+          }
+
+          // After all of the locks are acquired, try to sleep a bit to simulate
+          // some useful work to be done
+          if (max_sleep_after_lock_acquisition_ms_ != 0 && s.ok()) {
+            auto sleep_time_us = Random::GetTLSInstance()->Uniform(
+                static_cast<uint32_t>(max_sleep_after_lock_acquisition_ms_));
+            std::this_thread::sleep_for(
+                std::chrono::milliseconds(sleep_time_us));
+          }
+
+          // release all locks
+          for (const auto& pair : locked_key_status) {
+            auto key_status = pair.second;
+            auto key = key_status.key;
+            ASSERT_TRUE_WITH_INFO(key < key_count_);
+            if (enable_lock_status_validation_) {
+              ASSERT_EQ_WITH_INFO(counters_[key]->load(), values_[key]);
+              auto exclusive = key_status.exclusive;
+              if (exclusive) {
+                // for exclusive lock, bump the value by 1
+                (*counters_[key])++;
+                values_[key]++;
+                DEBUG_LOG_WITH_PREFIX("bump key %" PRIu32 " by 1 to %d\n", key,
+                                      values_[key]);
+                ASSERT_EQ_WITH_INFO(counters_[key]->load(), values_[key]);
+              } else {
+                // shared lock, validate the value has not changed since it was
+                // read
+                ASSERT_EQ_WITH_INFO(counters_[key]->load(), key_status.value);
+                ASSERT_EQ_WITH_INFO(values_[key], key_status.value);
+              }
+              if (exclusive) {
+                ASSERT_TRUE_WITH_INFO(exclusive_lock_status_[key]);
+                ASSERT_EQ_WITH_INFO(*shared_lock_count_[key], 0);
+                exclusive_lock_status_[key] = 0;
+              } else {
+                ASSERT_TRUE_WITH_INFO(!exclusive_lock_status_[key]);
+                ASSERT_TRUE_WITH_INFO(shared_lock_count_[key]->fetch_sub(1) >=
+                                      1);
+              }
+            }
+            DEBUG_LOG_WITH_PREFIX("release lock %" PRIu32 "\n", key);
+            locker_->UnLock(txn, 1, std::to_string(key), env_);
+          }
+        }
+        delete txn;
+      });
+    }
+
+    // run test for a few seconds
+    // print progress
+    auto prev_num_of_locks_acquired = num_of_locks_acquired_.load();
+    std::vector<int64_t> prev_num_of_locks_acquired_per_thread(thread_count_,
+                                                               0);
+    int64_t measured_locks_acquired = 0;
+    for (uint32_t i = 0; i < execution_time_sec_; i++) {
+      std::this_thread::sleep_for(std::chrono::seconds(1));
+      auto num_of_locks_acquired = num_of_locks_acquired_.load();
+      DEBUG_LOG("num_of_locks_acquired: %" PRId64 "\n", num_of_locks_acquired);
+      DEBUG_LOG("num_of_exclusive_locks_acquired: %" PRId64 "\n",
+                num_of_exclusive_locks_acquired_.load());
+      DEBUG_LOG("num_of_shared_locks_acquired: %" PRId64 "\n",
+                num_of_shared_locks_acquired_.load());
+      DEBUG_LOG("num_of_deadlock_detected: %" PRId64 "\n",
+                num_of_deadlock_detected_.load());
+      ASSERT_TRUE_WITH_MSG(num_of_locks_acquired > prev_num_of_locks_acquired,
+                           "No locks were acquired in the last 1 second");
+      for (uint32_t thd_idx = 0; thd_idx < thread_count_; thd_idx++) {
+        auto num_of_locks_acquired_per_thread =
+            num_of_locks_acquired_per_thread_[thd_idx]->load();
+        DEBUG_LOG("thread: %" PRIu32 " acquired %" PRId64 " locks\n", thd_idx,
+                  num_of_locks_acquired_per_thread);
+        if (enable_per_thread_lock_count_assertion_) {
+          ASSERT_TRUE_WITH_MSG(
+              num_of_locks_acquired_per_thread >
+                  prev_num_of_locks_acquired_per_thread[thd_idx],
+              "No locks were acquired in the last 1 second on thread " +
+                  std::to_string(thd_idx));
+        }
+        prev_num_of_locks_acquired_per_thread[thd_idx] =
+            num_of_locks_acquired_per_thread;
+      }
+      prev_num_of_locks_acquired = num_of_locks_acquired;
+      if (i == 0) {
+        measured_locks_acquired = num_of_locks_acquired;
+      }
+      if (i == execution_time_sec_ - 1) {
+        measured_locks_acquired =
+            num_of_locks_acquired - measured_locks_acquired;
+        // Skip the first second, as threads are warming up
+        auto measured_execution_time_sec = execution_time_sec_ - 1;
+        if (measured_execution_time_sec > 0) {
+          printf("measured_num_of_locks_acquired: %" PRId64 "\n",
+                 measured_locks_acquired / (measured_execution_time_sec));
+        }
+      }
+    }
+
+    shutdown_ = true;
+    for (auto& t : threads_) {
+      t.join();
+    }
+
+    // validate values against counters
+    for (uint32_t i = 0; i < key_count_; i++) {
+      ASSERT_TRUE_WITH_MSG(counters_[i]->load() == values_[i],
+                           "Exclusive lock guarantee is violated.");
+    }
+
+    ASSERT_TRUE_WITH_MSG(num_of_locks_acquired_.load() >= 0,
+                         "No lock were acquired at all");
+    printf("num_of_locks_acquired: %" PRId64 "\n",
+           num_of_locks_acquired_.load());
+
+    std::string errmsg;
+    auto no_lock_held = verifyNoLocksHeld(locker_, errmsg);
+    ASSERT_TRUE_WITH_MSG(no_lock_held, errmsg);
+  }
+
+ private:
+  // test configuration
+  Env* env_;
+  TransactionDBOptions txndb_opt_;
+  std::shared_ptr<LockManager> locker_;
+
+  TransactionDB* db_;
+  TransactionOptions txn_opt_;
+
+  uint32_t thread_count_;
+  uint32_t key_count_;
+  uint32_t max_num_keys_to_lock_per_txn_;
+  uint32_t execution_time_sec_;
+  LockTypeToTest lock_type_;
+  bool allow_non_deadlock_error_;
+  uint32_t max_sleep_after_lock_acquisition_ms_;
+
+  // In some of the test run, due to debug or ASAN build and short lock timeout,
+  // a thread may not be able to acquire any lock within a second. So skip this
+  // assertion by default. However, this could be useful for quickly detecting
+  // stuck thread, when running locally with longer timeout.
+  bool enable_per_thread_lock_count_assertion_;
+
+  // Internal test variables
+
+  bool enable_lock_status_validation_;
+  std::vector<std::thread> threads_;
+  std::vector<std::unique_ptr<std::atomic_int>> counters_;
+  std::vector<int> values_;
+
+  // track whether the lock is in exclusive status or
+  // not. vector<bool> does something special underneath, causing consistency
+  // issue. Therefore int64_t is used.
+  std::vector<int64_t> exclusive_lock_status_;
+
+  // A counter to track number of shared locks for tracking shared lock status
+  std::vector<std::unique_ptr<std::atomic_int>> shared_lock_count_;
+
+  // shutdown flag to signal threads to exit
+  std::atomic_bool shutdown_ = false;
+
+  // test statistics
+  std::atomic_int64_t num_of_locks_acquired_ = 0;
+  std::atomic_int64_t num_of_shared_locks_acquired_ = 0;
+  std::atomic_int64_t num_of_exclusive_locks_acquired_ = 0;
+  std::atomic_int64_t num_of_deadlock_detected_ = 0;
+  std::vector<std::unique_ptr<std::atomic_int64_t>>
+      num_of_locks_acquired_per_thread_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/range/range_locking_test.cc b/utilities/transactions/lock/range/range_locking_test.cc
index 961a5a11ae9c..0e37badbbcad 100644
--- a/utilities/transactions/lock/range/range_locking_test.cc
+++ b/utilities/transactions/lock/range/range_locking_test.cc
@@ -5,21 +5,18 @@
 
 #ifndef OS_WIN
 
-#include <algorithm>
 #include <functional>
+#include <iomanip>
 #include <string>
 #include <thread>
 
 #include "db/db_impl/db_impl.h"
-#include "port/port.h"
 #include "rocksdb/db.h"
 #include "rocksdb/options.h"
-#include "rocksdb/perf_context.h"
 #include "rocksdb/utilities/transaction.h"
 #include "rocksdb/utilities/transaction_db.h"
-#include "utilities/transactions/lock/point/point_lock_manager_test.h"
-#include "utilities/transactions/pessimistic_transaction_db.h"
-#include "utilities/transactions/transaction_test.h"
+#include "utilities/transactions/lock/point/any_lock_manager_test.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
 
 using std::string;
 
diff --git a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
index 584d9ebc2765..7674dab03f3e 100644
--- a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
+++ b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
@@ -130,7 +130,7 @@ Status RangeTreeLockManager::TryLock(PessimisticTransaction* txn,
     case DB_LOCK_NOTGRANTED:
       return Status::TimedOut(Status::SubCode::kLockTimeout);
     case TOKUDB_OUT_OF_LOCKS:
-      return Status::Busy(Status::SubCode::kLockLimit);
+      return Status::LockLimit();
     case DB_LOCK_DEADLOCK: {
       std::reverse(di_path.begin(), di_path.end());
       dlock_buffer_.AddNewPath(
@@ -139,7 +139,7 @@ Status RangeTreeLockManager::TryLock(PessimisticTransaction* txn,
     }
     default:
       assert(0);
-      return Status::Busy(Status::SubCode::kLockLimit);
+      return Status::LockLimit();
   }
 
   return Status::OK();
diff --git a/utilities/transactions/optimistic_transaction_db_impl.cc b/utilities/transactions/optimistic_transaction_db_impl.cc
index 3ad9d517739d..42ddddc82774 100644
--- a/utilities/transactions/optimistic_transaction_db_impl.cc
+++ b/utilities/transactions/optimistic_transaction_db_impl.cc
@@ -73,7 +73,7 @@ Status OptimisticTransactionDB::Open(
     std::vector<ColumnFamilyHandle*>* handles,
     OptimisticTransactionDB** dbptr) {
   Status s;
-  DB* db;
+  std::unique_ptr<DB> db;
 
   std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
 
@@ -81,8 +81,7 @@ Status OptimisticTransactionDB::Open(
   for (auto& column_family : column_families_copy) {
     ColumnFamilyOptions* options = &column_family.options;
 
-    if (options->max_write_buffer_size_to_maintain == 0 &&
-        options->max_write_buffer_number_to_maintain == 0) {
+    if (options->max_write_buffer_size_to_maintain == 0) {
       // Setting to -1 will set the History size to
       // max_write_buffer_number * write_buffer_size.
       options->max_write_buffer_size_to_maintain = -1;
@@ -92,7 +91,7 @@ Status OptimisticTransactionDB::Open(
   s = DB::Open(db_options, dbname, column_families_copy, handles, &db);
 
   if (s.ok()) {
-    *dbptr = new OptimisticTransactionDBImpl(db, occ_options);
+    *dbptr = new OptimisticTransactionDBImpl(std::move(db), occ_options);
   }
 
   return s;
diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h
index 86213832dde1..be6e7b0b6941 100644
--- a/utilities/transactions/optimistic_transaction_db_impl.h
+++ b/utilities/transactions/optimistic_transaction_db_impl.h
@@ -44,10 +44,9 @@ class OccLockBucketsImpl : public OccLockBucketsImplBase {
 class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
  public:
   explicit OptimisticTransactionDBImpl(
-      DB* db, const OptimisticTransactionDBOptions& occ_options,
-      bool take_ownership = true)
-      : OptimisticTransactionDB(db),
-        db_owner_(take_ownership),
+      std::unique_ptr<DB>&& db,
+      const OptimisticTransactionDBOptions& occ_options)
+      : OptimisticTransactionDB(std::move(db)),
         validate_policy_(occ_options.validate_policy) {
     if (validate_policy_ == OccValidationPolicy::kValidateParallel) {
       auto bucketed_locks = occ_options.shared_lock_buckets;
@@ -60,13 +59,7 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
     }
   }
 
-  ~OptimisticTransactionDBImpl() {
-    // Prevent this stackable from destroying
-    // base db
-    if (!db_owner_) {
-      db_ = nullptr;
-    }
-  }
+  ~OptimisticTransactionDBImpl() override = default;
 
   Transaction* BeginTransaction(const WriteOptions& write_options,
                                 const OptimisticTransactionOptions& txn_options,
@@ -97,8 +90,6 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
  private:
   std::shared_ptr<OccLockBucketsImplBase> bucketed_locks_;
 
-  bool db_owner_;
-
   const OccValidationPolicy validate_policy_;
 
   void ReinitializeTransaction(Transaction* txn,
diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc
index e978ad863135..79b26f201d8e 100644
--- a/utilities/transactions/pessimistic_transaction.cc
+++ b/utilities/transactions/pessimistic_transaction.cc
@@ -84,6 +84,10 @@ void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
         txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout * 1000;
   }
 
+  // deadlock timeout should be lower than lock timeout
+  deadlock_timeout_us_ =
+      std::min(txn_options.deadlock_timeout_us, lock_timeout_);
+
   if (txn_options.expiration >= 0) {
     expiration_time_ = start_time_ + txn_options.expiration * 1000;
   } else {
@@ -105,13 +109,15 @@ void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
   commit_timestamp_ = kMaxTxnTimestamp;
 
   if (txn_options.commit_bypass_memtable) {
-    commit_bypass_memtable_threshold_ = 0;
+    // No need to optimize for empty transction
+    commit_bypass_memtable_threshold_ = 1;
   } else {
     commit_bypass_memtable_threshold_ =
-        db_options.txn_commit_bypass_memtable_threshold;
+        txn_options.large_txn_commit_optimize_threshold;
   }
-  write_batch_.SetTrackPerCFStat(commit_bypass_memtable_threshold_ <
-                                 std::numeric_limits<uint32_t>::max());
+
+  commit_bypass_memtable_byte_threshold_ =
+      txn_options.large_txn_commit_optimize_byte_threshold;
 }
 
 PessimisticTransaction::~PessimisticTransaction() {
@@ -811,7 +817,7 @@ Status WriteCommittedTxn::CommitWithoutPrepareInternal() {
   }
   auto s = db_impl_->WriteImpl(
       write_options_, wb,
-      /*callback*/ nullptr, /*user_write_cb=*/nullptr, /*log_used*/ nullptr,
+      /*callback*/ nullptr, /*user_write_cb=*/nullptr, /*wal_used*/ nullptr,
       /*log_ref*/ 0, /*disable_memtable*/ false, &seq_used, /*batch_cnt=*/0,
       /*pre_release_callback=*/nullptr, post_mem_cb);
   assert(!s.ok() || seq_used != kMaxSequenceNumber);
@@ -825,7 +831,7 @@ Status WriteCommittedTxn::CommitBatchInternal(WriteBatch* batch, size_t) {
   uint64_t seq_used = kMaxSequenceNumber;
   auto s = db_impl_->WriteImpl(write_options_, batch, /*callback*/ nullptr,
                                /*user_write_cb=*/nullptr,
-                               /*log_used*/ nullptr, /*log_ref*/ 0,
+                               /*wal_used*/ nullptr, /*log_ref*/ 0,
                                /*disable_memtable*/ false, &seq_used);
   assert(!s.ok() || seq_used != kMaxSequenceNumber);
   if (s.ok()) {
@@ -852,8 +858,8 @@ Status WriteCommittedTxn::CommitInternal() {
   if (!needs_ts) {
     s = WriteBatchInternal::MarkCommit(working_batch, name_);
   } else {
-    assert(commit_bypass_memtable_threshold_ ==
-           std::numeric_limits<uint32_t>::max());
+    assert(!commit_bypass_memtable_threshold_);
+    assert(!commit_bypass_memtable_byte_threshold_);
     assert(commit_timestamp_ != kMaxTxnTimestamp);
     char commit_ts_buf[sizeof(kMaxTxnTimestamp)];
     EncodeFixed64(commit_ts_buf, commit_timestamp_);
@@ -889,7 +895,39 @@ Status WriteCommittedTxn::CommitInternal() {
   // any operations appended to this working_batch will be ignored from WAL
   working_batch->MarkWalTerminationPoint();
 
-  bool bypass_memtable = wb->Count() > commit_bypass_memtable_threshold_;
+  uint32_t wb_count = wb->Count();
+  RecordInHistogram(db_impl_->immutable_db_options_.stats,
+                    NUM_OP_PER_TRANSACTION, wb_count);
+  bool bypass_memtable = false;
+  if (!needs_ts) {
+    if (commit_bypass_memtable_threshold_ &&
+        wb_count >= commit_bypass_memtable_threshold_) {
+      if (wbwi->GetWBWIOpCount() != wb_count) {
+        ROCKS_LOG_WARN(
+            db_impl_->immutable_db_options().info_log,
+            "Transaction %s qualifies for commit optimization due to update "
+            "count. However, it will commit normally due to wbwi and wb record "
+            "count mismatch. Some updates were added directly to the "
+            "transaction's underlying write batch.",
+            GetName().c_str());
+      } else {
+        bypass_memtable = true;
+      }
+    } else if (commit_bypass_memtable_byte_threshold_ &&
+               wb->GetDataSize() >= commit_bypass_memtable_byte_threshold_) {
+      if (wbwi->GetWBWIOpCount() != wb_count) {
+        ROCKS_LOG_WARN(
+            db_impl_->immutable_db_options().info_log,
+            "Transaction %s qualifies for commit optimization due to write "
+            "batch size. However, it will commit normally due to wbwi and wb "
+            "record count mismatch. Some updates were added directly to the "
+            "transaction's underlying write batch.",
+            GetName().c_str());
+      } else {
+        bypass_memtable = true;
+      }
+    }
+  }
   if (!bypass_memtable) {
     // insert prepared batch into Memtable only skipping WAL.
     // Memtable will ignore BeginPrepare/EndPrepare markers
@@ -914,14 +952,17 @@ Status WriteCommittedTxn::CommitInternal() {
   TEST_SYNC_POINT_CALLBACK("WriteCommittedTxn::CommitInternal:bypass_memtable",
                            static_cast<void*>(&bypass_memtable));
   if (bypass_memtable) {
+    // Used for differentiating commiting WBWI vs directly ingesting WBWI
+    // see (IngestWriteBatchWithIndex())
+    assert(working_batch->HasCommit());
     s = db_impl_->WriteImpl(
         write_options_, working_batch, /*callback*/ nullptr,
         /*user_write_cb=*/nullptr,
-        /*log_used*/ nullptr, /*log_ref*/ log_number_,
+        /*wal_used*/ nullptr, /*log_ref*/ log_number_,
         /*disable_memtable*/ false, &seq_used,
         /*batch_cnt=*/0, /*pre_release_callback=*/nullptr, post_mem_cb,
-        /*wbwi=*/std::make_shared<WriteBatchWithIndex>(std::move(write_batch_)),
-        /*min_prep_log=*/log_number_);
+        /*wbwi=*/
+        std::make_shared<WriteBatchWithIndex>(std::move(write_batch_)));
     // Reset write_batch_ since it's accessed in transaction clean up and
     // might be used for transaction reuse.
     write_batch_ = WriteBatchWithIndex(cmp_, 0, true, 0,
@@ -929,7 +970,7 @@ Status WriteCommittedTxn::CommitInternal() {
   } else {
     s = db_impl_->WriteImpl(write_options_, working_batch, /*callback*/ nullptr,
                             /*user_write_cb=*/nullptr,
-                            /*log_used*/ nullptr, /*log_ref*/ log_number_,
+                            /*wal_used*/ nullptr, /*log_ref*/ log_number_,
                             /*disable_memtable*/ false, &seq_used,
                             /*batch_cnt=*/0, /*pre_release_callback=*/nullptr,
                             post_mem_cb);
diff --git a/utilities/transactions/pessimistic_transaction.h b/utilities/transactions/pessimistic_transaction.h
index a85071ad187a..71ec74f0efa4 100644
--- a/utilities/transactions/pessimistic_transaction.h
+++ b/utilities/transactions/pessimistic_transaction.h
@@ -71,18 +71,26 @@ class PessimisticTransaction : public TransactionBaseImpl {
                                             std::string* key) const override {
     std::lock_guard<std::mutex> lock(wait_mutex_);
     std::vector<TransactionID> ids(waiting_txn_ids_.size());
-    if (key) *key = waiting_key_ ? *waiting_key_ : "";
+    if (timed_out_key_.has_value()) {
+      if (key) *key = timed_out_key_.value();
+    } else {
+      if (key) *key = waiting_key_ ? *waiting_key_ : "";
+    }
     if (column_family_id) *column_family_id = waiting_cf_id_;
     std::copy(waiting_txn_ids_.begin(), waiting_txn_ids_.end(), ids.begin());
     return ids;
   }
 
-  void SetWaitingTxn(autovector<TransactionID> ids, uint32_t column_family_id,
-                     const std::string* key) {
+  void SetWaitingTxn(autovector<TransactionID>& ids, uint32_t column_family_id,
+                     const std::string* key, bool is_timed_out = false) {
     std::lock_guard<std::mutex> lock(wait_mutex_);
     waiting_txn_ids_ = ids;
     waiting_cf_id_ = column_family_id;
-    waiting_key_ = key;
+    if (is_timed_out) {
+      timed_out_key_ = key ? *key : "";
+    } else {
+      waiting_key_ = key;
+    }
   }
 
   void ClearWaitingTxn() {
@@ -106,6 +114,10 @@ class PessimisticTransaction : public TransactionBaseImpl {
   void SetLockTimeout(int64_t timeout) override {
     lock_timeout_ = timeout * 1000;
   }
+  int64_t GetDeadlockTimeout() const { return deadlock_timeout_us_; }
+  void SetDeadlockTimeout(int64_t timeout_ms) override {
+    deadlock_timeout_us_ = timeout_ms * 1000;
+  }
 
   // Returns true if locks were stolen successfully, false otherwise.
   bool TryStealingLocks();
@@ -166,10 +178,11 @@ class PessimisticTransaction : public TransactionBaseImpl {
   // Refer to
   // TransactionOptions::skip_prepare
   bool skip_prepare_ = false;
-  // Refer to
-  // TransactionOptions::commit_bypass_memtable
+  // Refer to TransactionOptions::commit_bypass_memtable
   uint32_t commit_bypass_memtable_threshold_ =
       std::numeric_limits<uint32_t>::max();
+  uint64_t commit_bypass_memtable_byte_threshold_ =
+      std::numeric_limits<uint64_t>::max();
 
  private:
   friend class TransactionTest_ValidateSnapshotTest_Test;
@@ -181,7 +194,7 @@ class PessimisticTransaction : public TransactionBaseImpl {
 
   // IDs for the transactions that are blocking the current transaction.
   //
-  // empty if current transaction is not waiting.
+  // empty if current transaction is not waiting or has timed out
   autovector<TransactionID> waiting_txn_ids_;
 
   // The following two represents the (cf, key) that a transaction is waiting
@@ -195,12 +208,19 @@ class PessimisticTransaction : public TransactionBaseImpl {
   uint32_t waiting_cf_id_;
   const std::string* waiting_key_;
 
+  // Waiting key with lifetime of the txn so it can be accessed after timeouts
+  std::optional<std::string> timed_out_key_;
+
   // Mutex protecting waiting_txn_ids_, waiting_cf_id_ and waiting_key_.
   mutable std::mutex wait_mutex_;
 
   // Timeout in microseconds when locking a key or -1 if there is no timeout.
   int64_t lock_timeout_;
 
+  // Timeout in microseconds before perform dead lock detection.
+  // If 0, deadlock detection will be performed immediately.
+  int64_t deadlock_timeout_us_;
+
   // Whether to perform deadlock detection or not.
   bool deadlock_detect_;
 
diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc
index 37fd80e86259..823b474e2ffa 100644
--- a/utilities/transactions/pessimistic_transaction_db.cc
+++ b/utilities/transactions/pessimistic_transaction_db.cc
@@ -284,8 +284,7 @@ void TransactionDB::PrepareWrap(
   for (size_t i = 0; i < column_families->size(); i++) {
     ColumnFamilyOptions* cf_options = &(*column_families)[i].options;
 
-    if (cf_options->max_write_buffer_size_to_maintain == 0 &&
-        cf_options->max_write_buffer_number_to_maintain == 0) {
+    if (cf_options->max_write_buffer_size_to_maintain == 0) {
       // Setting to -1 will set the History size to
       // max_write_buffer_number * write_buffer_size.
       cf_options->max_write_buffer_size_to_maintain = -1;
diff --git a/utilities/transactions/timestamped_snapshot_test.cc b/utilities/transactions/timestamped_snapshot_test.cc
index 1ca265aa153a..8bd72eea01b1 100644
--- a/utilities/transactions/timestamped_snapshot_test.cc
+++ b/utilities/transactions/timestamped_snapshot_test.cc
@@ -9,17 +9,26 @@
 #include "utilities/transactions/transaction_test.h"
 
 namespace ROCKSDB_NAMESPACE {
+
+constexpr std::array TimestampedSnapshotWithTsSanityCheck_Params = {
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+    std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite)};
+
 INSTANTIATE_TEST_CASE_P(
     Unsupported, TimestampedSnapshotWithTsSanityCheck,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
-        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite)));
-
-INSTANTIATE_TEST_CASE_P(WriteCommitted, TransactionTest,
-                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
-                                           ::testing::Values(WRITE_COMMITTED),
-                                           ::testing::Values(kOrderedWrite)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering),
+        TimestampedSnapshotWithTsSanityCheck_Params)));
+
+INSTANTIATE_TEST_CASE_P(
+    WriteCommitted, TransactionTest,
+    ::testing::Combine(/*use_stackable_db=*/::testing::Bool(),
+                       /*two_write_queue=*/::testing::Bool(),
+                       ::testing::Values(WRITE_COMMITTED),
+                       ::testing::Values(kOrderedWrite),
+                       /*use_per_key_point_lock_mgr=*/::testing::Bool(),
+                       /*deadlock_timeout_us=*/::testing::Values(0, 1000)));
 
 namespace {
 // Not thread-safe. Caller needs to provide external synchronization.
diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h
index 859518ceceea..49366e59d56b 100644
--- a/utilities/transactions/transaction_base.h
+++ b/utilities/transactions/transaction_base.h
@@ -250,6 +250,8 @@ class TransactionBaseImpl : public Transaction {
 
   void SetLockTimeout(int64_t /*timeout*/) override { /* Do nothing */ }
 
+  void SetDeadlockTimeout(int64_t /*timeout*/) override { /* Do nothing */ }
+
   const Snapshot* GetSnapshot() const override {
     // will return nullptr when there is no snapshot
     return snapshot_.get();
diff --git a/utilities/transactions/transaction_db_mutex_impl.cc b/utilities/transactions/transaction_db_mutex_impl.cc
index 7e10feccbd0f..9f549eae952d 100644
--- a/utilities/transactions/transaction_db_mutex_impl.cc
+++ b/utilities/transactions/transaction_db_mutex_impl.cc
@@ -7,8 +7,9 @@
 
 #include <chrono>
 #include <condition_variable>
-#include <functional>
 #include <mutex>
+#include <sstream>
+#include <thread>
 
 #include "rocksdb/utilities/transaction_db_mutex.h"
 
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 97b74f9bce68..3c7a7747af32 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -35,51 +35,71 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+constexpr std::array DBAsBaseDB_TransactionTest_Params = {
+    std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+    std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)};
+
 INSTANTIATE_TEST_CASE_P(
     DBAsBaseDB, TransactionTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
-        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering),
+        DBAsBaseDB_TransactionTest_Params)));
+
+constexpr std::array DBAsBaseDB_TransactionStressTest_Params = {
+    std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+    std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)};
+
 INSTANTIATE_TEST_CASE_P(
     DBAsBaseDB, TransactionStressTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
-        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering),
+        DBAsBaseDB_TransactionStressTest_Params)));
+
+constexpr std::array StackableDBAsBaseDB_TransactionTest_Params = {
+    std::make_tuple(true, true, WRITE_COMMITTED, kOrderedWrite),
+    std::make_tuple(true, true, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(true, true, WRITE_UNPREPARED, kOrderedWrite)};
+
 INSTANTIATE_TEST_CASE_P(
     StackableDBAsBaseDB, TransactionTest,
-    ::testing::Values(
-        std::make_tuple(true, true, WRITE_COMMITTED, kOrderedWrite),
-        std::make_tuple(true, true, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(true, true, WRITE_UNPREPARED, kOrderedWrite)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering),
+        StackableDBAsBaseDB_TransactionTest_Params)));
 
 // MySQLStyleTransactionTest takes far too long for valgrind to run. Only do it
 // in full mode (`ROCKSDB_FULL_VALGRIND_RUN` compiler flag is set).
 #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+constexpr std::array MySQLStyleTransactionTest_Params = {
+    std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite, false),
+    std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite, false),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true),
+    std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false),
+    std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true),
+    std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false),
+    std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true)};
+
 INSTANTIATE_TEST_CASE_P(
     MySQLStyleTransactionTest, MySQLStyleTransactionTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite, false),
-        std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite, false),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true),
-        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false),
-        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true),
-        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false),
-        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering, bool),
+        MySQLStyleTransactionTest_Params)));
+
 #endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
 
 TEST_P(TransactionTest, TestUpperBoundUponDeletion) {
@@ -561,6 +581,16 @@ TEST_P(TransactionTest, WaitingTxn) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 
+  // We expect GetWaitingTxns still returns the waiting values as it would
+  // normally before timeout
+  std::string key;
+  uint32_t cf_id;
+  std::vector<TransactionID> wait = txn2->GetWaitingTxns(&cf_id, &key);
+  ASSERT_EQ(key, "foo");
+  ASSERT_EQ(wait.size(), 1);
+  ASSERT_EQ(wait[0], id1);
+  ASSERT_EQ(cf_id, 0U);
+
   delete cfa;
   delete txn1;
   delete txn2;
@@ -3912,16 +3942,16 @@ TEST_P(TransactionTest, LockLimitTest) {
 
   // lock limit reached
   s = txn->Put("W", "w");
-  ASSERT_TRUE(s.IsBusy());
+  ASSERT_TRUE(s.IsLockLimit());
 
   // re-locking same key shouldn't put us over the limit
   s = txn->Put("X", "xx");
   ASSERT_OK(s);
 
   s = txn->GetForUpdate(read_options, "W", &value);
-  ASSERT_TRUE(s.IsBusy());
+  ASSERT_TRUE(s.IsLockLimit());
   s = txn->GetForUpdate(read_options, "V", &value);
-  ASSERT_TRUE(s.IsBusy());
+  ASSERT_TRUE(s.IsLockLimit());
 
   // re-locking same key shouldn't put us over the limit
   s = txn->GetForUpdate(read_options, "Y", &value);
@@ -3940,7 +3970,7 @@ TEST_P(TransactionTest, LockLimitTest) {
 
   // lock limit reached
   s = txn2->Put("M", "m");
-  ASSERT_TRUE(s.IsBusy());
+  ASSERT_TRUE(s.IsLockLimit());
 
   s = txn->Commit();
   ASSERT_OK(s);
@@ -3967,7 +3997,7 @@ TEST_P(TransactionTest, LockLimitTest) {
 
   // lock limit reached
   s = txn2->Delete("Y");
-  ASSERT_TRUE(s.IsBusy());
+  ASSERT_TRUE(s.IsLockLimit());
 
   s = txn2->Commit();
   ASSERT_OK(s);
@@ -3987,6 +4017,44 @@ TEST_P(TransactionTest, LockLimitTest) {
   delete txn2;
 }
 
+TEST_P(TransactionTest, LockLimitWithTimeoutHangTest) {
+  // Tests a bug where transaction can infinite-loop during lock acquiry.
+  // This happens when lock limit is reached and user specifies a positive
+  // timeout which is reached before the transaction start waiting for it.
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+
+  txn_db_options.max_num_locks = 3;
+  txn_db_options.transaction_lock_timeout = 10;  // 10ms
+  ASSERT_OK(ReOpen());
+
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  ASSERT_OK(txn->Put("X", "x"));
+  ASSERT_OK(txn->Put("Y", "y"));
+  ASSERT_OK(txn->Put("Z", "z"));
+
+  TransactionOptions txn2_options;
+  txn2_options.lock_timeout = 1;  // 1ms short timeout
+  Transaction* txn2 = db->BeginTransaction(write_options, txn2_options);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "PointLockManager::AcquireWithTimeout:WaitingTxn", [&](void*) {
+        // Sleep for 2ms, so timeout is already passed for txn2 before waiting.
+        // txn2 should fail instead of waiting forever.
+        env->SleepForMicroseconds(2 * 1000);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // This lock attempt should fail and return
+  ASSERT_TRUE(txn2->Put("W", "w").IsLockLimit());
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  delete txn;
+  delete txn2;
+}
+
 TEST_P(TransactionTest, IteratorTest) {
   // This test does writes without snapshot validation, and then tries to create
   // iterator later, which is unsupported in write unprepared.
@@ -5729,8 +5797,8 @@ Status TransactionStressTestInserter(
   TransactionOptions txn_options;
   txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
 
-  // Inside the inserter we might also retake the snapshot. We do both since two
-  // separte functions are engaged for each.
+  // Inside the inserter we might also retake the snapshot. We do both since
+  // two separte functions are engaged for each.
   txn_options.set_snapshot = rand->OneIn(2);
 
   RandomTransactionInserter inserter(
@@ -8814,7 +8882,7 @@ TEST_P(TransactionTest, SecondaryIndexOnKey) {
   }
 }
 
-TEST_F(TransactionDBTest, CollapseKey) {
+TEST_P(TransactionDBTest, CollapseKey) {
   ASSERT_OK(ReOpen());
   ASSERT_OK(db->Put({}, "hello", "world"));
   ASSERT_OK(db->Flush({}));
@@ -8863,7 +8931,7 @@ TEST_F(TransactionDBTest, CollapseKey) {
   }
 }
 
-TEST_F(TransactionDBTest, FlushedLogWithPendingPrepareIsSynced) {
+TEST_P(TransactionDBTest, FlushedLogWithPendingPrepareIsSynced) {
   // Repro for a bug where we missed a necessary sync of the old WAL during
   // memtable flush. It happened due to applying an optimization to skip syncing
   // the old WAL in too many scenarios (all memtable flushes on single CF
@@ -8908,8 +8976,9 @@ TEST_F(TransactionDBTest, FlushedLogWithPendingPrepareIsSynced) {
   }
 }
 
-class CommitBypassMemtableTest : public DBTestBase,
-                                 public ::testing::WithParamInterface<bool> {
+class CommitBypassMemtableTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
  public:
   CommitBypassMemtableTest() : DBTestBase("commit_bypass_memtable_test", true) {
     SetUpTransactionDB();
@@ -8920,27 +8989,29 @@ class CommitBypassMemtableTest : public DBTestBase,
   Options options;
   TransactionDBOptions txn_db_opts;
 
-  void SetUpTransactionDB(
-      uint32_t threshold = std::numeric_limits<uint32_t>::max()) {
+  void SetUpTransactionDB(bool atomic_flush = false) {
     options = CurrentOptions();
     options.create_if_missing = true;
     options.allow_2pc = true;
-    options.two_write_queues = GetParam();
+    options.two_write_queues = std::get<0>(GetParam());
     // Avoid write stall
     options.max_write_buffer_number = 8;
+    options.atomic_flush = atomic_flush;
     // Destroy the DB to recreate as a TransactionDB.
     Close();
     Destroy(options, true);
 
     txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
-    txn_db_opts.txn_commit_bypass_memtable_threshold = threshold;
+    txn_db_opts.use_per_key_point_lock_mgr = std::get<1>(GetParam());
     ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
     ASSERT_NE(txn_db, nullptr);
-    db_ = txn_db;
+    db_.reset(txn_db);
   }
 };
 
-INSTANTIATE_TEST_CASE_P(, CommitBypassMemtableTest, testing::Bool());
+INSTANTIATE_TEST_CASE_P(, CommitBypassMemtableTest,
+                        ::testing::Combine(::testing::Bool(),
+                                           ::testing::Bool()));
 
 // TODO: parameterize other tests in the file with commit_bypass_memtable
 TEST_P(CommitBypassMemtableTest, SingleCFUpdate) {
@@ -9382,17 +9453,17 @@ TEST_P(CommitBypassMemtableTest, Recovery) {
   VerifyDBFromMap(expected);
 
   ASSERT_OK(txn_db->Close());
-  delete txn_db;
+  db_.reset();  // destroys txn_db (owned by db_)
   ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
-  db_ = txn_db;
+  db_.reset(txn_db);
 
   VerifyDBFromMap(expected);
 }
 
-TEST_P(CommitBypassMemtableTest, ThresholdTxnDBOption) {
-  // Tests TransactionDBOptions::txn_commit_bypass_memtable_threshold
+TEST_P(CommitBypassMemtableTest, OptimizeLargeTxnCommitThreshold) {
+  // Tests TransactionOptions::large_txn_commit_optimize_threshold
   const uint32_t threshold = 10;
-  SetUpTransactionDB(/*threshold=*/threshold);
+  SetUpTransactionDB();
   bool commit_bypass_memtable = false;
   // TODO: add and use stats for this
   SyncPoint::GetInstance()->SetCallBack(
@@ -9400,53 +9471,124 @@ TEST_P(CommitBypassMemtableTest, ThresholdTxnDBOption) {
       [&](void* arg) { commit_bypass_memtable = *(static_cast<bool*>(arg)); });
   SyncPoint::GetInstance()->EnableProcessing();
 
-  // TransactionOptions::commit_bypass_memtable takes precedence
   WriteOptions wopts;
+  // Test default (disabled)
   TransactionOptions txn_opts;
-  txn_opts.commit_bypass_memtable = true;
-  Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  auto txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn1->SetName("xid0"));
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(
+        txn1->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+  }
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn1->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+  delete txn1;
+
+  // Test with transaction option only
+  txn_opts.large_txn_commit_optimize_threshold = threshold;
+
+  // Test with transaction below threshold
+  txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
   ASSERT_OK(txn1->SetName("xid1"));
-  ASSERT_OK(txn1->Put("k2", "v2"));
   ASSERT_OK(txn1->Put("k1", "v1"));
   ASSERT_OK(txn1->Prepare());
   ASSERT_OK(txn1->Commit());
-  ASSERT_TRUE(commit_bypass_memtable);
+  ASSERT_FALSE(commit_bypass_memtable);
+  delete txn1;
 
-  // Below threshold
-  for (auto num_ops : {threshold, threshold + 1}) {
-    commit_bypass_memtable = false;
-    txn_opts.commit_bypass_memtable = false;
-    auto txn = txn_db->BeginTransaction(wopts, txn_opts, txn1);
-    txn1 = nullptr;
-    ASSERT_OK(txn->SetName("xid" + std::to_string(num_ops)));
-    for (uint32_t i = 0; i < num_ops; ++i) {
-      ASSERT_OK(
-          txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
-    }
-    ASSERT_OK(txn->Prepare());
-    ASSERT_OK(txn->Commit());
-    ASSERT_EQ(commit_bypass_memtable, num_ops > threshold);
-    delete txn;
+  // Test with transaction at threshold
+  txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn1->SetName("xid2"));
+  for (uint32_t i = 0; i < threshold; ++i) {
+    ASSERT_OK(
+        txn1->Put("key" + std::to_string(i), "value" + std::to_string(i)));
   }
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn1->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+  delete txn1;
 
-  // Repeat the same test with updates to two CFs
+  SetUpTransactionDB();
+  // Test with multiple column families
   std::vector<std::string> cfs = {"pk", "sk"};
   CreateColumnFamilies(cfs, options);
 
+  txn_opts.large_txn_commit_optimize_threshold = threshold;
+
   // Below threshold
-  for (auto num_ops : {threshold, threshold + 1}) {
-    commit_bypass_memtable = false;
-    txn_opts.commit_bypass_memtable = false;
-    auto txn_cf = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
-    ASSERT_OK(txn_cf->SetName("xid_cf" + std::to_string(num_ops)));
-    for (uint32_t i = 0; i < num_ops; ++i) {
-      ASSERT_OK(txn_cf->Put(handles_[i % 2], "key" + std::to_string(i),
-                            "value" + std::to_string(i)));
-    }
-    ASSERT_OK(txn_cf->Prepare());
-    ASSERT_OK(txn_cf->Commit());
-    ASSERT_EQ(commit_bypass_memtable, num_ops > threshold);
-    delete txn_cf;
+  auto txn_cf = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn_cf->SetName("xid_cf_below"));
+  for (uint32_t i = 0; i < threshold - 1; ++i) {
+    ASSERT_OK(txn_cf->Put(handles_[i % 2], "key" + std::to_string(i),
+                          "value" + std::to_string(i)));
+  }
+  ASSERT_OK(txn_cf->Prepare());
+  commit_bypass_memtable = false;
+  ASSERT_OK(txn_cf->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+  delete txn_cf;
+
+  // At threshold
+  txn_cf = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn_cf->SetName("xid_cf_at_threshold"));
+  for (uint32_t i = 0; i < threshold; ++i) {
+    ASSERT_OK(txn_cf->Put(handles_[i % 2], "key" + std::to_string(i),
+                          "value" + std::to_string(i)));
+  }
+  ASSERT_OK(txn_cf->Prepare());
+  commit_bypass_memtable = false;
+  ASSERT_OK(txn_cf->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+  delete txn_cf;
+
+  // Test that commit_bypass_memtable takes precedence over
+  // large_txn_commit_optimize_threshold
+  txn_opts.large_txn_commit_optimize_threshold =
+      threshold * 10;                      // High threshold
+  txn_opts.commit_bypass_memtable = true;  // Should override threshold
+
+  txn_cf = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn_cf->SetName("xid_cf_precedence"));
+  ASSERT_OK(txn_cf->Put(handles_[0], "key1", "value1"));  // Just one operation
+  ASSERT_OK(txn_cf->Prepare());
+  commit_bypass_memtable = false;
+  ASSERT_OK(txn_cf->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);  // Should be true because of
+                                        // commit_bypass_memtable
+  delete txn_cf;
+}
+
+TEST_P(CommitBypassMemtableTest, AtomicFlushTest) {
+  const uint32_t threshold = 10;
+  SetUpTransactionDB(/*atomic_flush=*/true);
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<std::string> cfs = {"cf0", "cf1", "cf2"};
+  CreateColumnFamilies(cfs, options);
+
+  // Seed data in CF1 and 2 as atomic flush picks CFs with non-empty memtable
+  ASSERT_OK(db_->Put({}, handles_[1], "key1", "val1"));
+  ASSERT_OK(db_->Put({}, handles_[2], "key2", "val2"));
+
+  // Write to cf 0, should see cf1 and cf2 flushed too
+  TransactionOptions txn_opts;
+  txn_opts.large_txn_commit_optimize_threshold = threshold;
+  auto txn = txn_db->BeginTransaction({}, txn_opts, nullptr);
+  for (uint32_t i = 0; i <= threshold; ++i) {
+    ASSERT_OK(txn->Put(handles_[0], "key" + std::to_string(i),
+                       "cf0" + std::to_string(i)));
+  }
+  ASSERT_OK(txn->SetName("cf0"));
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  delete txn;
+
+  ASSERT_OK(db_->WaitForCompact({}));
+  for (size_t i = 0; i < 3; ++i) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
   }
 }
 
@@ -9656,6 +9798,226 @@ TEST_P(CommitBypassMemtableTest, MergeMiniStress) {
     VerifyDBFromMap(expected_cf, nullptr, false, nullptr, handles_[0]);
   }
 }
+
+TEST_P(TransactionDBTest, SelfDeadlockBug) {
+  ASSERT_OK(ReOpen());
+
+  // Create two transactions
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  txn_options.lock_timeout = 50;  // 50ms
+  txn_options.deadlock_detect = true;
+
+  ASSERT_OK(db->Put({}, "shared_key", "shared_value"));
+
+  // First transaction
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+  ASSERT_OK(txn1->SetName("txn1"));
+
+  // Second transaction
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+  ASSERT_OK(txn2->SetName("txn2"));
+
+  // Both transactions acquire shared lock on the same key.
+  std::string value;
+  ASSERT_OK(txn1->GetForUpdate(ReadOptions(), "shared_key", &value,
+                               /*exclusive=*/false));
+  ASSERT_OK(txn2->GetForUpdate(ReadOptions(), "shared_key", &value,
+                               /*exclusive=*/false));
+
+  // Second transaction tries to upgrade to exclusive lock, which should
+  // timeout.
+  Status s = txn1->Put({}, "shared_key", "val");
+  // Print out the deadlock info buffer
+  ASSERT_TRUE(db->GetDeadlockInfoBuffer().empty());
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+  // After release lock from txn2, txn1 should be able to proceed.
+  ASSERT_OK(txn2->Rollback());
+  ASSERT_OK(txn1->Put({}, "shared_key", "val"));
+  ASSERT_OK(txn1->Rollback());
+  delete txn1;
+  delete txn2;
+}
+
+INSTANTIATE_TEST_CASE_P(
+    TransactionDBBasicTest, TransactionDBTest,
+    ::testing::Combine(/*user_per_key_point_lock_manager=*/::testing::Bool(),
+                       /*deadlock_timeout_us=*/::testing::Values(0, 1000)));
+
+TEST_P(CommitBypassMemtableTest,
+       OptimizeLargeTxnCommitWriteBatchSizeThreshold) {
+  // Tests TransactionOptions::large_txn_commit_optimize_byte_threshold
+  const uint64_t threshold = 100;
+  SetUpTransactionDB();
+  bool commit_bypass_memtable = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "WriteCommittedTxn::CommitInternal:bypass_memtable",
+      [&](void* arg) { commit_bypass_memtable = *(static_cast<bool*>(arg)); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+
+  WriteOptions wopts;
+  TransactionOptions txn_opts;
+  // Test default
+  auto txn = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn->SetName("xid0"));
+  ASSERT_OK(txn->Put("k1", rnd.RandomString(1000)));
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+
+  // Test with transaction option only
+  txn_opts.large_txn_commit_optimize_byte_threshold = threshold;
+  // Above threshold
+  txn = txn_db->BeginTransaction(wopts, txn_opts, txn);
+  ASSERT_OK(txn->SetName("xid1"));
+  ASSERT_OK(txn->Put("k1", rnd.RandomString(threshold)));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() >= threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+
+  // Below threshold
+  txn = txn_db->BeginTransaction(wopts, txn_opts, txn);
+  ASSERT_OK(txn->SetName("xid2"));
+  ASSERT_OK(txn->Put("k2", "v2"));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+  delete txn;
+
+  // With commit_bypass_memtbale
+  TransactionOptions txn_opts2;
+  txn_opts2.commit_bypass_memtable = true;
+  txn_opts2.large_txn_commit_optimize_byte_threshold = threshold;
+  txn = txn_db->BeginTransaction(wopts, txn_opts2, nullptr);
+  ASSERT_OK(txn->SetName("xid3"));
+  ASSERT_OK(txn->Put("k3", "v3"));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+  delete txn;
+
+  // With count based threshold `large_txn_commit_optimize_threshold`
+  TransactionOptions txn_opts3;
+  txn_opts3.commit_bypass_memtable = false;
+  txn_opts3.large_txn_commit_optimize_byte_threshold = threshold;
+  txn_opts3.large_txn_commit_optimize_threshold = 3;
+  txn = txn_db->BeginTransaction(wopts, txn_opts3, nullptr);
+  ASSERT_OK(txn->SetName("xid4"));
+  ASSERT_OK(txn->Put("k3", "v3"));
+  ASSERT_OK(txn->Delete("k2"));
+  ASSERT_OK(txn->Delete("k1"));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+
+  txn = txn_db->BeginTransaction(wopts, txn_opts3, txn);
+  ASSERT_OK(txn->SetName("xid4"));
+  ASSERT_OK(txn->Put("k3", "v3"));
+  ASSERT_OK(txn->Delete("k2"));
+  ASSERT_OK(txn->Delete("k1"));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+
+  txn = txn_db->BeginTransaction(wopts, txn_opts3, txn);
+  ASSERT_OK(txn->SetName("xid5"));
+  ASSERT_OK(txn->Put("k5", "v5"));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+  delete txn;
+
+  // Test with multiple column families
+  std::vector<std::string> cfs = {"pk", "sk"};
+  CreateColumnFamilies(cfs, options);
+  TransactionOptions txn_opts_cf;
+
+  txn_opts_cf.large_txn_commit_optimize_byte_threshold = threshold;
+
+  // Below threshold
+  auto txn_cf = txn_db->BeginTransaction(wopts, txn_opts_cf, nullptr);
+  ASSERT_OK(txn_cf->SetName("xid_cf_above"));
+  ASSERT_OK(txn_cf->Put(handles_[0], "k1", rnd.RandomString(threshold / 2)));
+  ASSERT_OK(txn_cf->Put(handles_[1], "k2", rnd.RandomString(threshold / 2)));
+  ASSERT_TRUE(txn_cf->GetWriteBatch()->GetDataSize() >= threshold);
+  ASSERT_OK(txn_cf->Prepare());
+  ASSERT_OK(txn_cf->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+
+  txn_cf = txn_db->BeginTransaction(wopts, txn_opts_cf, txn_cf);
+  ASSERT_OK(txn_cf->SetName("xid_cf_below"));
+  ASSERT_OK(txn_cf->Put(handles_[0], "k1", rnd.RandomString(10)));
+  ASSERT_OK(txn_cf->Put(handles_[1], "k2", rnd.RandomString(10)));
+  ASSERT_TRUE(txn_cf->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn_cf->Prepare());
+  ASSERT_OK(txn_cf->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+
+  delete txn_cf;
+}
+
+TEST_P(CommitBypassMemtableTest, WBWIOpCountMismatchWBCount) {
+  // Tests that large txn optimization checks op count in WBWI vs WB. When an
+  // update is written directly to a transaction's underlying write batch, the
+  // optimization should not apply.
+  SetUpTransactionDB();
+  bool commit_bypass_memtable = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "WriteCommittedTxn::CommitInternal:bypass_memtable",
+      [&](void* arg) { commit_bypass_memtable = *(static_cast<bool*>(arg)); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  {
+    WriteOptions wopts;
+    TransactionOptions txn_opts;
+    txn_opts.large_txn_commit_optimize_byte_threshold = 100;
+    auto txn = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+    ASSERT_OK(txn->SetName("xid0"));
+    ASSERT_OK(txn->Put("k1", rnd.RandomString(1000)));
+    // This update is written directly to the underlying write batch, so the
+    // optimization should not apply.
+    ASSERT_OK(txn->GetWriteBatch()->GetWriteBatch()->Put("meta", "1"));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    ASSERT_FALSE(commit_bypass_memtable);
+
+    ASSERT_EQ(Get("meta"), "1");
+    delete txn;
+  }
+
+  {
+    WriteOptions wopts;
+    TransactionOptions txn_opts;
+    txn_opts.large_txn_commit_optimize_threshold = 10;
+    auto txn = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+    ASSERT_OK(txn->SetName("xid0"));
+    for (int i = 0; i < 10; ++i) {
+      ASSERT_OK(txn->Put(Key(i), rnd.RandomString(10)));
+    }
+    // This update is written directly to the underlying write batch, so the
+    // optimization should not apply.
+    ASSERT_OK(txn->GetWriteBatch()->GetWriteBatch()->Put("meta", "2"));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    ASSERT_FALSE(commit_bypass_memtable);
+
+    ASSERT_EQ(Get("meta"), "2");
+    delete txn;
+  }
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h
index 72f7e7036bf4..464c9e6883f1 100644
--- a/utilities/transactions/transaction_test.h
+++ b/utilities/transactions/transaction_test.h
@@ -49,14 +49,18 @@ class TransactionTestBase : public ::testing::Test {
 
   TransactionDBOptions txn_db_options;
   bool use_stackable_db_;
+  int64_t deadlock_timeout_us_;
 
   TransactionTestBase(bool use_stackable_db, bool two_write_queue,
                       TxnDBWritePolicy write_policy,
-                      WriteOrdering write_ordering)
+                      WriteOrdering write_ordering,
+                      bool use_per_key_point_lock_mgr,
+                      int64_t deadlock_timeout_us)
       : db(nullptr),
         special_env(Env::Default()),
         env(nullptr),
-        use_stackable_db_(use_stackable_db) {
+        use_stackable_db_(use_stackable_db),
+        deadlock_timeout_us_(deadlock_timeout_us) {
     options.create_if_missing = true;
     options.max_write_buffer_number = 2;
     options.write_buffer_size = 4 * 1024;
@@ -77,6 +81,7 @@ class TransactionTestBase : public ::testing::Test {
     txn_db_options.default_lock_timeout = 0;
     txn_db_options.write_policy = write_policy;
     txn_db_options.rollback_merge_operands = true;
+    txn_db_options.use_per_key_point_lock_mgr = use_per_key_point_lock_mgr;
     // This will stress write unprepared, by forcing write batch flush on every
     // write.
     txn_db_options.default_write_batch_flush_threshold = 1;
@@ -481,30 +486,35 @@ class TransactionTestBase : public ::testing::Test {
 
 class TransactionTest
     : public TransactionTestBase,
-      virtual public ::testing::WithParamInterface<
-          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+      virtual public ::testing::WithParamInterface<std::tuple<
+          bool, bool, TxnDBWritePolicy, WriteOrdering, bool, int64_t>> {
  public:
   TransactionTest()
       : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                            std::get<2>(GetParam()), std::get<3>(GetParam())){};
+                            std::get<2>(GetParam()), std::get<3>(GetParam()),
+                            std::get<4>(GetParam()), std::get<5>(GetParam())) {}
 };
 
-class TransactionDBTest : public TransactionTestBase {
+class TransactionDBTest
+    : public TransactionTestBase,
+      virtual public ::testing::WithParamInterface<std::tuple<bool, int64_t>> {
  public:
   TransactionDBTest()
-      : TransactionTestBase(false, false, WRITE_COMMITTED, kOrderedWrite) {}
+      : TransactionTestBase(false, false, WRITE_COMMITTED, kOrderedWrite,
+                            std::get<0>(GetParam()), std::get<1>(GetParam())) {}
 };
 
 class TransactionStressTest : public TransactionTest {};
 
 class MySQLStyleTransactionTest
     : public TransactionTestBase,
-      virtual public ::testing::WithParamInterface<
-          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering, bool>> {
+      virtual public ::testing::WithParamInterface<std::tuple<
+          bool, bool, TxnDBWritePolicy, WriteOrdering, bool, bool, int64_t>> {
  public:
   MySQLStyleTransactionTest()
       : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                            std::get<2>(GetParam()), std::get<3>(GetParam())),
+                            std::get<2>(GetParam()), std::get<3>(GetParam()),
+                            std::get<5>(GetParam()), std::get<6>(GetParam())),
         with_slow_threads_(std::get<4>(GetParam())) {
     if (with_slow_threads_ &&
         (txn_db_options.write_policy == WRITE_PREPARED ||
@@ -527,11 +537,13 @@ class MySQLStyleTransactionTest
 
 class WriteCommittedTxnWithTsTest
     : public TransactionTestBase,
-      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
+      public ::testing::WithParamInterface<
+          std::tuple<bool, bool, bool, bool, int64_t>> {
  public:
   WriteCommittedTxnWithTsTest()
       : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                            WRITE_COMMITTED, kOrderedWrite) {}
+                            WRITE_COMMITTED, kOrderedWrite,
+                            std::get<3>(GetParam()), std::get<4>(GetParam())) {}
   ~WriteCommittedTxnWithTsTest() override {
     for (auto* h : handles_) {
       delete h;
@@ -567,12 +579,13 @@ class WriteCommittedTxnWithTsTest
 
 class TimestampedSnapshotWithTsSanityCheck
     : public TransactionTestBase,
-      public ::testing::WithParamInterface<
-          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+      public ::testing::WithParamInterface<std::tuple<
+          bool, bool, TxnDBWritePolicy, WriteOrdering, bool, int64_t>> {
  public:
   explicit TimestampedSnapshotWithTsSanityCheck()
       : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                            std::get<2>(GetParam()), std::get<3>(GetParam())) {}
+                            std::get<2>(GetParam()), std::get<3>(GetParam()),
+                            std::get<4>(GetParam()), std::get<5>(GetParam())) {}
   ~TimestampedSnapshotWithTsSanityCheck() override {
     for (auto* h : handles_) {
       delete h;
@@ -583,4 +596,68 @@ class TimestampedSnapshotWithTsSanityCheck
   std::vector<ColumnFamilyHandle*> handles_{};
 };
 
+// The following templates causes a bug in GCC 14, ignore the error for now
+#if defined(__GNUC__) && __GNUC__ == 14
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
+
+// Wrap existing params with per-key point lock manager parameters
+template <typename TargetParamType, typename SourceParamType, std::size_t... Is>
+std::vector<TargetParamType> WrapParamWithPerKeyPointLockManagerParamsImpl(
+    SourceParamType&& source_param, std::index_sequence<Is...>) {
+  std::vector<TargetParamType> wrapped_params;
+  // Use original PointLockManager
+  wrapped_params.push_back(TargetParamType(
+      std::get<Is>(std::forward<SourceParamType>(source_param))..., false,
+      INT64_C(0)));
+  // Use PerKeyPointLockManager with deadlock timeout 0
+  wrapped_params.push_back(TargetParamType(
+      std::get<Is>(std::forward<SourceParamType>(source_param))..., true,
+      INT64_C(0)));
+  // Use PerKeyPointLockManager with deadlock timeout 1000
+  wrapped_params.push_back(TargetParamType(
+      std::get<Is>(std::forward<SourceParamType>(source_param))..., true,
+      INT64_C(1000)));
+
+  return wrapped_params;
+}
+
+template <typename TargetParamType, typename SourceParamType>
+std::vector<TargetParamType> WrapParamWithPerKeyPointLockManagerParams(
+    SourceParamType&& source_param) {
+  // Get the size of the source param
+  constexpr std::size_t N = std::tuple_size_v<std::decay_t<SourceParamType>>;
+  // Create an index sequence from 0 to N-1
+  return WrapParamWithPerKeyPointLockManagerParamsImpl<TargetParamType>(
+      std::forward<SourceParamType>(source_param),
+      std::make_index_sequence<N>{});
+}
+
+template <typename TargetParamType, typename SourceParamType, size_t M>
+std::vector<TargetParamType> WrapParamsWithPerKeyPointLockManagerParams(
+    std::array<SourceParamType, M> source_param) {
+  std::vector<TargetParamType> wrapped_params;
+  for (auto& param : source_param) {
+    // Create an index sequence from 0 to N-1
+    auto new_params =
+        WrapParamWithPerKeyPointLockManagerParams<TargetParamType>(
+            std::forward<SourceParamType>(param));
+    wrapped_params.insert(wrapped_params.end(), new_params.begin(),
+                          new_params.end());
+  }
+  return wrapped_params;
+}
+
+#if defined(__GNUC__) && __GNUC__ == 14
+#pragma GCC diagnostic pop
+#endif
+
+#define WRAP_PARAM(...) __VA_ARGS__
+
+#define WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(SOURCE_PARAM_TYPES, \
+                                                          PARAMS)             \
+  WrapParamsWithPerKeyPointLockManagerParams<                                 \
+      std::tuple<SOURCE_PARAM_TYPES, bool, int64_t>>(PARAMS)
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/write_committed_transaction_ts_test.cc b/utilities/transactions/write_committed_transaction_ts_test.cc
index d73371f80f40..36c958c33d08 100644
--- a/utilities/transactions/write_committed_transaction_ts_test.cc
+++ b/utilities/transactions/write_committed_transaction_ts_test.cc
@@ -14,26 +14,12 @@
 namespace ROCKSDB_NAMESPACE {
 
 INSTANTIATE_TEST_CASE_P(
-    DBAsBaseDB, WriteCommittedTxnWithTsTest,
-    ::testing::Values(std::make_tuple(false, /*two_write_queue=*/false,
-                                      /*enable_indexing=*/false),
-                      std::make_tuple(false, /*two_write_queue=*/true,
-                                      /*enable_indexing=*/false),
-                      std::make_tuple(false, /*two_write_queue=*/false,
-                                      /*enable_indexing=*/true),
-                      std::make_tuple(false, /*two_write_queue=*/true,
-                                      /*enable_indexing=*/true)));
-
-INSTANTIATE_TEST_CASE_P(
-    DBAsStackableDB, WriteCommittedTxnWithTsTest,
-    ::testing::Values(std::make_tuple(true, /*two_write_queue=*/false,
-                                      /*enable_indexing=*/false),
-                      std::make_tuple(true, /*two_write_queue=*/true,
-                                      /*enable_indexing=*/false),
-                      std::make_tuple(true, /*two_write_queue=*/false,
-                                      /*enable_indexing=*/true),
-                      std::make_tuple(true, /*two_write_queue=*/true,
-                                      /*enable_indexing=*/true)));
+    DBAsBaseDBAndStackableDB, WriteCommittedTxnWithTsTest,
+    ::testing::Combine(/*use_stackable_db=*/::testing::Bool(),
+                       /*two_write_queue=*/::testing::Bool(),
+                       /*enable_indexing=*/::testing::Bool(),
+                       /*use_per_key_point_lock_mgr=*/::testing::Bool(),
+                       /*deadlock_timeout_us=*/::testing::Values(0, 1000)));
 
 TEST_P(WriteCommittedTxnWithTsTest, SanityChecks) {
   ASSERT_OK(ReOpenNoDelete());
diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc
index 28443c525baf..956c8e66a685 100644
--- a/utilities/transactions/write_prepared_transaction_test.cc
+++ b/utilities/transactions/write_prepared_transaction_test.cc
@@ -196,7 +196,7 @@ TEST(PreparedHeap, Concurrent) {
 TEST(WriteBatchWithIndex, SubBatchCnt) {
   ColumnFamilyOptions cf_options;
   std::string cf_name = "two";
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   const std::string dbname = test::PerThreadDBPath("transaction_testdb");
@@ -285,7 +285,6 @@ TEST(WriteBatchWithIndex, SubBatchCnt) {
   }
 
   delete cf_handle;
-  delete db;
 }
 
 TEST(CommitEntry64b, BasicTest) {
@@ -354,9 +353,12 @@ class WritePreparedTransactionTestBase : public TransactionTestBase {
  public:
   WritePreparedTransactionTestBase(bool use_stackable_db, bool two_write_queue,
                                    TxnDBWritePolicy write_policy,
-                                   WriteOrdering write_ordering)
+                                   WriteOrdering write_ordering,
+                                   bool user_per_key_point_lock_mgr,
+                                   int64_t deadlock_timeout_us)
       : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
-                            write_ordering){};
+                            write_ordering, user_per_key_point_lock_mgr,
+                            deadlock_timeout_us) {}
 
  protected:
   void UpdateTransactionDBOptions(size_t snapshot_cache_bits,
@@ -528,27 +530,30 @@ class WritePreparedTransactionTestBase : public TransactionTestBase {
 
 class WritePreparedTransactionTest
     : public WritePreparedTransactionTestBase,
-      virtual public ::testing::WithParamInterface<
-          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+      virtual public ::testing::WithParamInterface<std::tuple<
+          bool, bool, TxnDBWritePolicy, WriteOrdering, bool, int64_t>> {
  public:
   WritePreparedTransactionTest()
       : WritePreparedTransactionTestBase(
             std::get<0>(GetParam()), std::get<1>(GetParam()),
-            std::get<2>(GetParam()), std::get<3>(GetParam())){};
+            std::get<2>(GetParam()), std::get<3>(GetParam()),
+            std::get<4>(GetParam()), std::get<5>(GetParam())) {}
 };
 
 #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
 class SnapshotConcurrentAccessTest
     : public WritePreparedTransactionTestBase,
-      virtual public ::testing::WithParamInterface<std::tuple<
-          bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t>> {
+      virtual public ::testing::WithParamInterface<
+          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering, size_t,
+                     size_t, bool, int64_t>> {
  public:
   SnapshotConcurrentAccessTest()
       : WritePreparedTransactionTestBase(
             std::get<0>(GetParam()), std::get<1>(GetParam()),
-            std::get<2>(GetParam()), std::get<3>(GetParam())),
+            std::get<2>(GetParam()), std::get<3>(GetParam()),
+            std::get<6>(GetParam()), std::get<7>(GetParam())),
         split_id_(std::get<4>(GetParam())),
-        split_cnt_(std::get<5>(GetParam())){};
+        split_cnt_(std::get<5>(GetParam())) {}
 
  protected:
   // A test is split into split_cnt_ tests, each identified with split_id_ where
@@ -560,13 +565,15 @@ class SnapshotConcurrentAccessTest
 
 class SeqAdvanceConcurrentTest
     : public WritePreparedTransactionTestBase,
-      virtual public ::testing::WithParamInterface<std::tuple<
-          bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t>> {
+      virtual public ::testing::WithParamInterface<
+          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering, size_t,
+                     size_t, bool, int64_t>> {
  public:
   SeqAdvanceConcurrentTest()
       : WritePreparedTransactionTestBase(
             std::get<0>(GetParam()), std::get<1>(GetParam()),
-            std::get<2>(GetParam()), std::get<3>(GetParam())),
+            std::get<2>(GetParam()), std::get<3>(GetParam()),
+            std::get<6>(GetParam()), std::get<7>(GetParam())),
         split_id_(std::get<4>(GetParam())),
         split_cnt_(std::get<5>(GetParam())) {
     special_env.skip_fsync_ = true;
@@ -579,120 +586,143 @@ class SeqAdvanceConcurrentTest
   size_t split_cnt_;
 };
 
+constexpr std::array WritePreparedTransactionTest_Params = {
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite)};
+
 INSTANTIATE_TEST_CASE_P(
     WritePreparedTransaction, WritePreparedTransactionTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering),
+        WritePreparedTransactionTest_Params)));
 
 #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+constexpr std::array TwoWriteQueue_SnapshotConcurrentAccessTest_Params = {
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 10, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 11, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 12, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 13, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 14, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 15, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 16, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 17, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 18, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 19, 20),
+
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 10, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 11, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 12, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 13, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 14, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 15, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 16, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 17, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 18, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 19, 20)};
+
 INSTANTIATE_TEST_CASE_P(
-    TwoWriteQueues, SnapshotConcurrentAccessTest,
-    ::testing::Values(
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 10, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 11, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 12, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 13, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 14, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 15, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 16, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 17, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 18, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 19, 20),
-
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 10, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 11, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 12, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 13, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 14, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 15, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 16, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 17, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 18, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 19, 20)));
+    TwoWriteQueuesPointLockManager, SnapshotConcurrentAccessTest,
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t),
+        TwoWriteQueue_SnapshotConcurrentAccessTest_Params)));
+
+constexpr std::array OneWriteQueue_SnapshotConcurrentAccessTest_Params = {
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 10, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 11, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 12, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 13, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 14, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 15, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 17, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20),
+};
 
 INSTANTIATE_TEST_CASE_P(
     OneWriteQueue, SnapshotConcurrentAccessTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 10, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 11, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 12, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 13, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 14, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 15, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 17, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t),
+        OneWriteQueue_SnapshotConcurrentAccessTest_Params)));
+
+constexpr std::array TwoWriteQueues_SeqAdvanceConcurrentTest_Params = {
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 10)};
 
 INSTANTIATE_TEST_CASE_P(
     TwoWriteQueues, SeqAdvanceConcurrentTest,
-    ::testing::Values(
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 10)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t),
+        TwoWriteQueues_SeqAdvanceConcurrentTest_Params)));
+
+constexpr std::array OneWriteQueue_SeqAdvanceConcurrentTest_Params = {
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10)};
 
 INSTANTIATE_TEST_CASE_P(
     OneWriteQueue, SeqAdvanceConcurrentTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t),
+        OneWriteQueue_SeqAdvanceConcurrentTest_Params)));
+
 #endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
 
 TEST_P(WritePreparedTransactionTest, CommitMap) {
@@ -791,7 +821,8 @@ TEST_P(WritePreparedTransactionTest, CheckKeySkipOldMemtable) {
   const int kAttemptImmMemTable = 1;
   for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
        attempt++) {
-    options.max_write_buffer_number_to_maintain = 3;
+    options.max_write_buffer_size_to_maintain =
+        3 * static_cast<int>(options.write_buffer_size);
     ASSERT_OK(ReOpen());
 
     WriteOptions write_options;
diff --git a/utilities/transactions/write_prepared_transaction_test_seqno.cc b/utilities/transactions/write_prepared_transaction_test_seqno.cc
new file mode 100644
index 000000000000..0148ab9cc32c
--- /dev/null
+++ b/utilities/transactions/write_prepared_transaction_test_seqno.cc
@@ -0,0 +1,425 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// Test to verify that sequence numbers remain consistent during error recovery
+// with WritePrepared TransactionDB and two_write_queues=true.
+//
+// The fix: SyncLastSequenceWithAllocated() is called during ResumeImpl to
+// ensure that allocated-but-not-published sequence numbers are accounted for
+// before creating new memtables/WALs, preventing "sequence number going
+// backwards" corruption on subsequent recovery.
+
+#include <atomic>
+#include <memory>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritePreparedTransactionSeqnoTest : public ::testing::Test {
+ public:
+  WritePreparedTransactionSeqnoTest()
+      : db_(nullptr),
+        special_env_(Env::Default()),
+        fault_fs_(new FaultInjectionTestFS(FileSystem::Default())),
+        env_(new CompositeEnvWrapper(&special_env_, fault_fs_)) {
+    options_.create_if_missing = true;
+    options_.max_write_buffer_number = 2;
+    options_.write_buffer_size = 4 * 1024;
+    options_.level0_file_num_compaction_trigger = 2;
+    options_.env = env_.get();
+    // Use two_write_queues which is typical for WritePrepared
+    options_.two_write_queues = true;
+    // Enable auto recovery from retryable errors
+    options_.max_bgerror_resume_count = 2;
+    options_.bgerror_resume_retry_interval = 100000;  // 100ms
+
+    dbname_ = test::PerThreadDBPath("write_prepared_seqno_test");
+    EXPECT_OK(DestroyDB(dbname_, options_));
+
+    txn_db_options_.transaction_lock_timeout = 0;
+    txn_db_options_.default_lock_timeout = 0;
+    txn_db_options_.write_policy = TxnDBWritePolicy::WRITE_PREPARED;
+  }
+
+  ~WritePreparedTransactionSeqnoTest() {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    if (db_) {
+      for (auto h : handles_) {
+        if (h) {
+          EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
+        }
+      }
+      handles_.clear();
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  Status Open() {
+    return TransactionDB::Open(options_, txn_db_options_, dbname_, &db_);
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      if (h) {
+        EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
+      }
+    }
+    handles_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  DBImpl* dbimpl() { return static_cast_with_check<DBImpl>(db_->GetRootDB()); }
+
+ protected:
+  TransactionDB* db_;
+  SpecialEnv special_env_;
+  std::shared_ptr<FaultInjectionTestFS> fault_fs_;
+  std::unique_ptr<Env> env_;
+  std::string dbname_;
+  Options options_;
+  TransactionDBOptions txn_db_options_;
+  std::vector<ColumnFamilyHandle*> handles_;
+};
+
+// Regression test: verify that after error recovery with two_write_queues,
+// the DB can be closed and reopened without sequence number corruption.
+TEST_F(WritePreparedTransactionSeqnoTest,
+       SeqnoGoesBackwardsDuringErrorRecovery) {
+  ASSERT_OK(Open());
+
+  // Write some initial data and flush to establish baseline
+  WriteOptions write_opts;
+  TransactionOptions txn_opts;
+  for (int i = 0; i < 10; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // Write more data - these will allocate sequence numbers
+  for (int i = 10; i < 20; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+
+  // Set up sync point dependency chain for deterministic recovery
+  // synchronization, following the pattern from
+  // ManifestWriteRetryableErrorAutoRecover in error_handler_fs_test.cc.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"RecoverFromRetryableBGIOError:BeforeStart",
+        "SeqnoGoesBackwardsDuringErrorRecovery:0"},
+       {"SeqnoGoesBackwardsDuringErrorRecovery:1",
+        "RecoverFromRetryableBGIOError:BeforeWait1"},
+       {"RecoverFromRetryableBGIOError:RecoverSuccess",
+        "SeqnoGoesBackwardsDuringErrorRecovery:2"}});
+
+  // Inject a retryable MANIFEST write error on the next flush
+  IOStatus error_to_inject = IOStatus::IOError("Injected MANIFEST error");
+  error_to_inject.SetRetryable(true);
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_to_inject); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger a flush that will fail due to MANIFEST write error
+  Status s = db_->Flush(FlushOptions());
+  ASSERT_NOK(s);
+
+  // Wait for recovery to start, then re-enable filesystem and let it proceed
+  TEST_SYNC_POINT("SeqnoGoesBackwardsDuringErrorRecovery:0");
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearCallBack(
+      "VersionSet::LogAndApply:WriteManifest");
+  TEST_SYNC_POINT("SeqnoGoesBackwardsDuringErrorRecovery:1");
+
+  // Wait for recovery to complete
+  TEST_SYNC_POINT("SeqnoGoesBackwardsDuringErrorRecovery:2");
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Write some more data after recovery
+  for (int i = 20; i < 30; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("txn_after_" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+
+  // Close and reopen - this would fail with "sequence number going backwards"
+  // before the fix.
+  Close();
+
+  Status reopen_s = Open();
+  ASSERT_OK(reopen_s);
+
+  // Verify data integrity
+  ReadOptions read_opts;
+  for (int i = 0; i < 20; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(read_opts, "key" + std::to_string(i), &value));
+    ASSERT_EQ(value, "value" + std::to_string(i));
+  }
+
+  Close();
+}
+
+// Test that verifies the sequence number discrepancy is resolved by checking
+// that LastSequence >= LastAllocatedSequence after recovery completes.
+TEST_F(WritePreparedTransactionSeqnoTest, SeqnoDiscrepancyDuringErrorRecovery) {
+  ASSERT_OK(Open());
+
+  WriteOptions write_opts;
+  TransactionOptions txn_opts;
+
+  // Write initial data and flush
+  for (int i = 0; i < 5; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("init_txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // Write more transactions with two_write_queues to potentially create a gap
+  // between allocated and published sequence numbers. These must be written
+  // before installing the error injection callback, since the small write
+  // buffer (4KB) could trigger an automatic flush during these writes.
+  for (int i = 5; i < 10; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+
+  // Track sequence numbers at key points
+  std::atomic<uint64_t> last_seq_after_recovery{0};
+  std::atomic<uint64_t> last_allocated_seq_after_recovery{0};
+  std::atomic<bool> captured_seqs_after{false};
+
+  IOStatus error_to_inject = IOStatus::IOError("Injected error");
+  error_to_inject.SetRetryable(true);
+
+  // Set up sync point dependency chain for deterministic synchronization
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"RecoverFromRetryableBGIOError:BeforeStart",
+        "SeqnoDiscrepancyDuringErrorRecovery:0"},
+       {"SeqnoDiscrepancyDuringErrorRecovery:1",
+        "RecoverFromRetryableBGIOError:BeforeWait1"},
+       {"RecoverFromRetryableBGIOError:RecoverSuccess",
+        "SeqnoDiscrepancyDuringErrorRecovery:2"}});
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_to_inject); });
+
+  // Capture sequence numbers after recovery completes to verify the fix
+  SyncPoint::GetInstance()->SetCallBack(
+      "RecoverFromRetryableBGIOError:RecoverSuccess", [&](void*) {
+        DBImpl* db_impl = dbimpl();
+        if (db_impl) {
+          VersionSet* vs = db_impl->GetVersionSet();
+          if (vs) {
+            last_seq_after_recovery.store(vs->LastSequence());
+            last_allocated_seq_after_recovery.store(
+                vs->LastAllocatedSequence());
+            captured_seqs_after.store(true);
+          }
+        }
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger a flush that will fail
+  Status flush_s = db_->Flush(FlushOptions());
+  ASSERT_NOK(flush_s);
+
+  // Wait for recovery to start, re-enable filesystem, let it proceed
+  TEST_SYNC_POINT("SeqnoDiscrepancyDuringErrorRecovery:0");
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearCallBack(
+      "VersionSet::LogAndApply:WriteManifest");
+  TEST_SYNC_POINT("SeqnoDiscrepancyDuringErrorRecovery:1");
+
+  // Wait for recovery to complete
+  TEST_SYNC_POINT("SeqnoDiscrepancyDuringErrorRecovery:2");
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Verify that sequences were captured and are in sync after recovery
+  ASSERT_TRUE(captured_seqs_after.load());
+  ASSERT_GE(last_seq_after_recovery.load(),
+            last_allocated_seq_after_recovery.load())
+      << "LastSequence should be >= LastAllocatedSequence after recovery";
+
+  // Close and reopen should succeed without corruption
+  Close();
+  ASSERT_OK(Open());
+
+  // Verify data integrity
+  ReadOptions read_opts;
+  for (int i = 0; i < 10; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(read_opts, "key" + std::to_string(i), &value));
+    ASSERT_EQ(value, "value" + std::to_string(i));
+  }
+
+  Close();
+}
+
+// Test that verifies SyncLastSequenceWithAllocated is called during ResumeImpl
+// by checking sequence numbers before and after the sync point.
+TEST_F(WritePreparedTransactionSeqnoTest, ConcurrentWritesDuringErrorRecovery) {
+  ASSERT_OK(Open());
+
+  WriteOptions write_opts;
+  TransactionOptions txn_opts;
+
+  // Write initial data and flush
+  for (int i = 0; i < 5; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("init_txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // Write more transactions. These must be written before installing the error
+  // injection callback, since the small write buffer (4KB) could trigger an
+  // automatic flush during these writes.
+  for (int i = 5; i < 10; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+
+  // Track sequence numbers at key points during recovery
+  std::atomic<uint64_t> seq_before_resume{0};
+  std::atomic<uint64_t> alloc_seq_before_resume{0};
+  std::atomic<uint64_t> seq_after_resume{0};
+  std::atomic<uint64_t> alloc_seq_after_resume{0};
+
+  IOStatus error_to_inject = IOStatus::IOError("Injected error");
+  error_to_inject.SetRetryable(true);
+
+  // Set up sync point dependency chain for deterministic synchronization
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"RecoverFromRetryableBGIOError:BeforeStart",
+        "ConcurrentWritesDuringErrorRecovery:0"},
+       {"ConcurrentWritesDuringErrorRecovery:1",
+        "RecoverFromRetryableBGIOError:BeforeWait1"},
+       {"RecoverFromRetryableBGIOError:RecoverSuccess",
+        "ConcurrentWritesDuringErrorRecovery:2"}});
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_to_inject); });
+
+  // Capture sequences right before ResumeImpl runs the sync
+  SyncPoint::GetInstance()->SetCallBack("DBImpl::ResumeImpl:Start", [&](void*) {
+    DBImpl* db_impl = dbimpl();
+    if (db_impl) {
+      VersionSet* vs = db_impl->GetVersionSet();
+      if (vs) {
+        seq_before_resume.store(vs->LastSequence());
+        alloc_seq_before_resume.store(vs->LastAllocatedSequence());
+      }
+    }
+  });
+
+  // Capture sequences right after ResumeImpl syncs them
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::ResumeImpl:AfterSyncSeq", [&](void*) {
+        DBImpl* db_impl = dbimpl();
+        if (db_impl) {
+          VersionSet* vs = db_impl->GetVersionSet();
+          if (vs) {
+            seq_after_resume.store(vs->LastSequence());
+            alloc_seq_after_resume.store(vs->LastAllocatedSequence());
+          }
+        }
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger a flush that will fail
+  Status flush_s = db_->Flush(FlushOptions());
+  ASSERT_NOK(flush_s);
+
+  // Wait for recovery to start, re-enable filesystem, let it proceed
+  TEST_SYNC_POINT("ConcurrentWritesDuringErrorRecovery:0");
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearCallBack(
+      "VersionSet::LogAndApply:WriteManifest");
+  TEST_SYNC_POINT("ConcurrentWritesDuringErrorRecovery:1");
+
+  // Wait for recovery to complete
+  TEST_SYNC_POINT("ConcurrentWritesDuringErrorRecovery:2");
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Verify that the AfterSyncSeq callback fired and sequences are in sync
+  ASSERT_GT(seq_after_resume.load(), 0u)
+      << "DBImpl::ResumeImpl:AfterSyncSeq callback should have fired";
+  ASSERT_EQ(seq_after_resume.load(), alloc_seq_after_resume.load())
+      << "Fix should have synced sequences";
+
+  // Close and reopen
+  Close();
+  ASSERT_OK(Open());
+
+  // Verify data integrity
+  ReadOptions read_opts;
+  for (int i = 0; i < 10; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(read_opts, "key" + std::to_string(i), &value));
+    ASSERT_EQ(value, "value" + std::to_string(i));
+  }
+
+  Close();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/utilities/transactions/write_prepared_txn.h b/utilities/transactions/write_prepared_txn.h
index aca6a19ea08d..3237b8011328 100644
--- a/utilities/transactions/write_prepared_txn.h
+++ b/utilities/transactions/write_prepared_txn.h
@@ -36,6 +36,97 @@ class WritePreparedTxnDB;
 // committed data from uncommitted data. Uncommitted data could be after the
 // Prepare phase in 2PC (WritePreparedTxn) or before that
 // (WriteUnpreparedTxnImpl).
+//
+// == Concrete example: WritePrepared 2PC transaction ==
+//
+// User code:
+//
+//   Transaction* txn = db->BeginTransaction(write_opts, txn_opts);
+//   txn->SetName("txn1");
+//   txn->Put("key1", "value1");   // buffered in WriteBatch, nothing written
+//   yet txn->Prepare();               // Phase 1 txn->Commit(); // Phase 2
+//
+// -- Phase 1: Prepare (PrepareInternal) --
+//
+// The Prepare call (write_prepared_txn.cc PrepareInternal) calls:
+//
+//   db_impl_->WriteImpl(write_options, GetWriteBatch(),
+//                       ..., !DISABLE_MEMTABLE, ...);
+//
+// !DISABLE_MEMTABLE is false — memtable is enabled. This is the defining
+// characteristic of "WritePrepared": the actual data (Put("key1", "value1"))
+// is written to the memtable at Prepare time.
+//
+// Because disable_memtable == false, the routing check at
+// db_impl_write.cc:502 is not taken. The write goes through the main write
+// queue (write_thread_), which handles both WAL and memtable:
+//
+//   Destination | What gets written                          | Sequence
+//   ------------|--------------------------------------------|-----------
+//   WAL         | Put(key1, value1) + EndPrepare(txn1)       | prepare_seq
+//   Memtable    | Put(key1, value1)                          | prepare_seq
+//
+// The data is now durable (WAL) and in the memtable, but not yet visible
+// to readers. Readers use GetLastPublishedSequence() which consults a
+// commit map — since prepare_seq is in the PreparedHeap but not yet in the
+// CommitCache, readers know this data is uncommitted and skip it.
+//
+// -- Phase 2: Commit (CommitInternal) --
+//
+// The Commit call (write_prepared_txn.cc CommitInternal) calls:
+//
+//   db_impl_->WriteImpl(write_options_, working_batch,
+//                       ..., disable_memtable, ...);
+//
+// In the typical case (do_one_write == true, i.e., the commit-time batch
+// is empty or has no data), disable_memtable is true. Now the routing
+// check at db_impl_write.cc:502 is taken:
+//
+//   if (two_write_queues_ && disable_memtable) {
+//       return WriteImplWALOnly(&nonmem_write_thread_, ...);
+//   }
+//
+// The commit goes through the second write queue (nonmem_write_thread_),
+// WAL only:
+//
+//   Destination | What gets written   | Sequence
+//   ------------|---------------------|-----------
+//   WAL         | Commit(txn1) marker | commit_seq
+//   Memtable    | Nothing             | —
+//
+// The PreReleaseCallback (WritePreparedCommitEntryPreReleaseCallback)
+// updates the CommitCache to record that prepare_seq was committed at
+// commit_seq. After this, readers consulting the commit map will see that
+// the data at prepare_seq is committed and therefore visible.
+//
+// -- Why two queues help --
+//
+// The Commit phase doesn't touch the memtable — it only writes a small
+// marker to WAL and updates an in-memory commit map. By routing this
+// through a separate queue, Commit writes don't have to wait behind other
+// transactions' Prepare writes (which do the expensive memtable insertion
+// on the main queue). This is the optimization mentioned in the options
+// comment about MySQL 2PC where commits are serial.
+//
+// -- Sequence number flow --
+//
+//                            last_sequence_ | last_allocated_seq |
+//                            last_published_seq
+//                            ---------------|--------------------|-------------------
+//   Before Prepare:                  9      |         9          |        9
+//
+//   Prepare (main queue):
+//     FetchAdd alloc seq             9      |        10          |        9
+//     Write WAL + memtable
+//     SetLastSequence               10      |        10          |        9
+//     (published_seq not advanced yet — data is uncommitted)
+//
+//   Commit (2nd queue):
+//     FetchAdd alloc seq            10      |        11          |        9
+//     Write WAL only
+//     Update CommitCache
+//     SetLastPublishedSeq           10      |        11          |       11
+//
 class WritePreparedTxn : public PessimisticTransaction {
  public:
   WritePreparedTxn(WritePreparedTxnDB* db, const WriteOptions& write_options,
diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc
index 26b413bf8b20..54cc2511cc78 100644
--- a/utilities/transactions/write_prepared_txn_db.cc
+++ b/utilities/transactions/write_prepared_txn_db.cc
@@ -107,7 +107,7 @@ Status WritePreparedTxnDB::VerifyCFOptions(
   if (!cf_options.memtable_factory->CanHandleDuplicatedKey()) {
     return Status::InvalidArgument(
         "memtable_factory->CanHandleDuplicatedKey() cannot be false with "
-        "WritePrpeared transactions");
+        "WritePrepared transactions");
   }
   return Status::OK();
 }
@@ -196,14 +196,14 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig,
   const uint64_t no_log_ref = 0;
   uint64_t seq_used = kMaxSequenceNumber;
   const size_t ZERO_PREPARES = 0;
-  const bool kSeperatePrepareCommitBatches = true;
+  const bool kSeparatePrepareCommitBatches = true;
   // Since this is not 2pc, there is no need for AddPrepared but having it in
   // the PreReleaseCallback enables an optimization. Refer to
   // SmallestUnCommittedSeq for more details.
   AddPreparedCallback add_prepared_callback(
       this, db_impl_, batch_cnt,
       db_impl_->immutable_db_options().two_write_queues,
-      !kSeperatePrepareCommitBatches);
+      !kSeparatePrepareCommitBatches);
   WritePreparedCommitEntryPreReleaseCallback update_commit_map(
       this, db_impl_, kMaxSequenceNumber, ZERO_PREPARES, batch_cnt);
   PreReleaseCallback* pre_release_callback;
@@ -484,7 +484,7 @@ Status WritePreparedTxnDB::NewIterators(
 }
 
 void WritePreparedTxnDB::Init(const TransactionDBOptions& txn_db_opts) {
-  // Adcance max_evicted_seq_ no more than 100 times before the cache wraps
+  // Advance max_evicted_seq_ no more than 100 times before the cache wraps
   // around.
   INC_STEP_FOR_MAX_EVICTED =
       std::max(COMMIT_CACHE_SIZE / 100, static_cast<size_t>(1));
@@ -731,7 +731,7 @@ void WritePreparedTxnDB::AdvanceMaxEvictedSeq(const SequenceNumber& prev_max,
   bool update_snapshots = false;
   if (new_snapshots_version > snapshots_version_) {
     // This is to avoid updating the snapshots_ if it already updated
-    // with a more recent vesion by a concrrent thread
+    // with a more recent version by a concurrent thread
     update_snapshots = true;
     // We only care about snapshots lower then max
     snapshots = GetSnapshotListFromDB(new_max);
@@ -807,7 +807,7 @@ SnapshotImpl* WritePreparedTxnDB::GetSnapshotInternal(
       throw std::runtime_error(
           "Snapshot seq " + std::to_string(snap_impl->GetSequenceNumber()) +
           " after " + std::to_string(retry) +
-          " retries is still less than futre_max_evicted_seq_" +
+          " retries is still less than future_max_evicted_seq_" +
           std::to_string(max));
     }
   }
@@ -930,9 +930,9 @@ void WritePreparedTxnDB::UpdateSnapshots(
   // both new and old lists, it will appear upper in the new list. So if
   // we simply insert the new snapshots in order, if an overwritten item
   // is still valid in the new list is either written to the same place in
-  // the array or it is written in a higher palce before it gets
-  // overwritten by another item. This guarantess a reader that reads the
-  // list bottom-up will eventaully see a snapshot that repeats in the
+  // the array or it is written in a higher place before it gets
+  // overwritten by another item. This guarantee a reader that reads the
+  // list bottom-up will eventually see a snapshot that repeats in the
   // update, either before it gets overwritten by the writer or
   // afterwards.
   size_t i = 0;
@@ -981,7 +981,7 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) {
   // reader should be able to read all the snapshots that are still valid
   // after the update. Since the survived snapshots are written in a higher
   // place before gets overwritten the reader that reads bottom-up will
-  // eventully see it.
+  // eventually see it.
   const bool next_is_larger = true;
   // We will set to true if the border line snapshot suggests that.
   bool search_larger_list = false;
@@ -1003,7 +1003,7 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) {
     }
   }
 #ifndef NDEBUG
-  // Release the remaining sync points before accquiring the lock
+  // Release the remaining sync points before acquiring the lock
   for (++sync_i; sync_i <= 10; ++sync_i) {
     TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:", sync_i);
     TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i);
@@ -1020,7 +1020,7 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) {
                    evicted.prep_seq, evicted.commit_seq, cnt);
     ReadLock rl(&snapshots_mutex_);
     // Items could have moved from the snapshots_ to snapshot_cache_ before
-    // accquiring the lock. To make sure that we do not miss a valid snapshot,
+    // acquiring the lock. To make sure that we do not miss a valid snapshot,
     // read snapshot_cache_ again while holding the lock.
     for (size_t i = 0; i < SNAPSHOT_CACHE_SIZE; i++) {
       SequenceNumber snapshot_seq =
diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc
index e655eb42a04e..587f12ea2d0b 100644
--- a/utilities/transactions/write_unprepared_transaction_test.cc
+++ b/utilities/transactions/write_unprepared_transaction_test.cc
@@ -13,37 +13,43 @@ class WriteUnpreparedTransactionTestBase : public TransactionTestBase {
  public:
   WriteUnpreparedTransactionTestBase(bool use_stackable_db,
                                      bool two_write_queue,
-                                     TxnDBWritePolicy write_policy)
+                                     TxnDBWritePolicy write_policy,
+                                     bool use_per_key_point_lock_mgr,
+                                     int64_t deadlock_timeout_us)
       : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
-                            kOrderedWrite) {}
+                            kOrderedWrite, use_per_key_point_lock_mgr,
+                            deadlock_timeout_us) {}
 };
 
 class WriteUnpreparedTransactionTest
     : public WriteUnpreparedTransactionTestBase,
       virtual public ::testing::WithParamInterface<
-          std::tuple<bool, bool, TxnDBWritePolicy>> {
+          std::tuple<bool, bool, TxnDBWritePolicy, bool, int64_t>> {
  public:
   WriteUnpreparedTransactionTest()
-      : WriteUnpreparedTransactionTestBase(std::get<0>(GetParam()),
-                                           std::get<1>(GetParam()),
-                                           std::get<2>(GetParam())) {}
+      : WriteUnpreparedTransactionTestBase(
+            std::get<0>(GetParam()), std::get<1>(GetParam()),
+            std::get<2>(GetParam()), std::get<3>(GetParam()),
+            std::get<4>(GetParam())) {}
 };
 
 INSTANTIATE_TEST_CASE_P(
     WriteUnpreparedTransactionTest, WriteUnpreparedTransactionTest,
-    ::testing::Values(std::make_tuple(false, false, WRITE_UNPREPARED),
-                      std::make_tuple(false, true, WRITE_UNPREPARED)));
+    ::testing::Combine(::testing::Values(false), ::testing::Bool(),
+                       ::testing::Values(WRITE_UNPREPARED), ::testing::Bool(),
+                       ::testing::Values(0, 1000)));
 
 enum SnapshotAction { NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT };
 enum VerificationOperation { VERIFY_GET, VERIFY_NEXT, VERIFY_PREV };
 class WriteUnpreparedSnapshotTest
     : public WriteUnpreparedTransactionTestBase,
-      virtual public ::testing::WithParamInterface<
-          std::tuple<bool, SnapshotAction, VerificationOperation>> {
+      virtual public ::testing::WithParamInterface<std::tuple<
+          bool, SnapshotAction, VerificationOperation, bool, int64_t>> {
  public:
   WriteUnpreparedSnapshotTest()
-      : WriteUnpreparedTransactionTestBase(false, std::get<0>(GetParam()),
-                                           WRITE_UNPREPARED),
+      : WriteUnpreparedTransactionTestBase(
+            false, std::get<0>(GetParam()), WRITE_UNPREPARED,
+            std::get<3>(GetParam()), std::get<4>(GetParam())),
         action_(std::get<1>(GetParam())),
         verify_op_(std::get<2>(GetParam())) {}
   SnapshotAction action_;
@@ -56,10 +62,11 @@ class WriteUnpreparedSnapshotTest
 // verification operation
 INSTANTIATE_TEST_CASE_P(
     WriteUnpreparedSnapshotTest, WriteUnpreparedSnapshotTest,
-    ::testing::Combine(
-        ::testing::Bool(),
-        ::testing::Values(NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT),
-        ::testing::Values(VERIFY_GET, VERIFY_NEXT, VERIFY_PREV)));
+    ::testing::Combine(::testing::Bool(),
+                       ::testing::Values(NO_SNAPSHOT, RO_SNAPSHOT,
+                                         REFRESH_SNAPSHOT),
+                       ::testing::Values(VERIFY_GET, VERIFY_NEXT, VERIFY_PREV),
+                       ::testing::Bool(), ::testing::Values(0, 1000)));
 
 TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) {
   // The following tests checks whether reading your own write for
diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc
index 8e9647b8c477..444c1c9b6350 100644
--- a/utilities/transactions/write_unprepared_txn.cc
+++ b/utilities/transactions/write_unprepared_txn.cc
@@ -374,7 +374,7 @@ Status WriteUnpreparedTxn::FlushWriteBatchToDBInternal(bool prepared) {
   uint64_t seq_used = kMaxSequenceNumber;
   // log_number_ should refer to the oldest log containing uncommitted data
   // from the current transaction. This means that if log_number_ is set,
-  // WriteImpl should not overwrite that value, so set log_used to nullptr if
+  // WriteImpl should not overwrite that value, so set wal_used to nullptr if
   // log_number_ is already set.
   s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
                           /*callback*/ nullptr, /*user_write_cb=*/nullptr,
diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc
index 55354c6cbce3..84a2dbce9bc6 100644
--- a/utilities/ttl/db_ttl_impl.cc
+++ b/utilities/ttl/db_ttl_impl.cc
@@ -305,7 +305,8 @@ int RegisterTtlObjects(ObjectLibrary& library, const std::string& /*arg*/) {
   return static_cast<int>(library.GetFactoryCount(&num_types));
 }
 // Open the db inside DBWithTTLImpl because options needs pointer to its ttl
-DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db), closed_(false) {}
+DBWithTTLImpl::DBWithTTLImpl(std::unique_ptr<DB>&& db)
+    : DBWithTTL(std::move(db)), closed_(false) {}
 
 DBWithTTLImpl::~DBWithTTLImpl() {
   if (!closed_) {
@@ -372,7 +373,7 @@ Status DBWithTTL::Open(
     DBWithTTLImpl::SanitizeOptions(
         ttls[i], &column_families_sanitized[i].options, clock);
   }
-  DB* db;
+  std::unique_ptr<DB> db;
 
   Status st;
   if (read_only) {
@@ -382,7 +383,7 @@ Status DBWithTTL::Open(
     st = DB::Open(db_options, dbname, column_families_sanitized, handles, &db);
   }
   if (st.ok()) {
-    *dbptr = new DBWithTTLImpl(db);
+    *dbptr = new DBWithTTLImpl(std::move(db));
   } else {
     *dbptr = nullptr;
   }
@@ -635,4 +636,22 @@ void DBWithTTLImpl::SetTtl(ColumnFamilyHandle* h, int32_t ttl) {
   filter->SetTtl(ttl);
 }
 
+Status DBWithTTLImpl::GetTtl(ColumnFamilyHandle* h, int32_t* ttl) {
+  if (h == nullptr || ttl == nullptr) {
+    return Status::InvalidArgument(
+        "column family handle or ttl cannot be null");
+  }
+  std::shared_ptr<TtlCompactionFilterFactory> filter;
+  Options opts;
+  opts = GetOptions(h);
+  filter = std::static_pointer_cast<TtlCompactionFilterFactory>(
+      opts.compaction_filter_factory);
+  if (!filter) {
+    return Status::InvalidArgument(
+        "TTLCompactionFilterFactory is not set for TTLDB");
+  }
+  *ttl = filter->GetTtl();
+  return Status::OK();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h
index 731cd3955fe1..b8b3866233f0 100644
--- a/utilities/ttl/db_ttl_impl.h
+++ b/utilities/ttl/db_ttl_impl.h
@@ -32,7 +32,7 @@ class DBWithTTLImpl : public DBWithTTL {
                               SystemClock* clock);
 
   static void RegisterTtlClasses();
-  explicit DBWithTTLImpl(DB* db);
+  explicit DBWithTTLImpl(std::unique_ptr<DB>&& db);
 
   virtual ~DBWithTTLImpl();
 
@@ -100,6 +100,8 @@ class DBWithTTLImpl : public DBWithTTL {
 
   void SetTtl(ColumnFamilyHandle* h, int32_t ttl) override;
 
+  Status GetTtl(ColumnFamilyHandle* h, int32_t* ttl) override;
+
  private:
   // remember whether the Close completes or not
   bool closed_;
@@ -184,6 +186,7 @@ class TtlCompactionFilterFactory : public CompactionFilterFactory {
   std::unique_ptr<CompactionFilter> CreateCompactionFilter(
       const CompactionFilter::Context& context) override;
   void SetTtl(int32_t ttl) { ttl_ = ttl; }
+  int32_t GetTtl() { return ttl_; }
 
   const char* Name() const override { return kClassName(); }
   static const char* kClassName() { return "TtlCompactionFilterFactory"; }
diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc
index 4bbf11505d49..26454c6ee08c 100644
--- a/utilities/ttl/ttl_test.cc
+++ b/utilities/ttl/ttl_test.cc
@@ -617,7 +617,6 @@ TEST_F(TtlTest, UnregisteredMergeOperator) {
    public:
     const char* Name() const override { return "UnregisteredMergeOperator"; }
   };
-  options_.fail_if_options_file_error = true;
   options_.merge_operator = std::make_shared<UnregisteredMergeOperator>();
   OpenTtl();
   CloseTtl();
@@ -659,7 +658,7 @@ TEST_F(TtlTest, TtlFiftenYears) {
 }
 
 TEST_F(TtlTest, ColumnFamiliesTest) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   options.env = env_.get();
@@ -670,7 +669,7 @@ TEST_F(TtlTest, ColumnFamiliesTest) {
                                    "ttl_column_family", &handle));
 
   delete handle;
-  delete db;
+  db.reset();
 
   std::vector<ColumnFamilyDescriptor> column_families;
   column_families.emplace_back(kDefaultColumnFamilyName,
@@ -721,6 +720,9 @@ TEST_F(TtlTest, ChangeTtlOnOpenDb) {
 
   OpenTtl(1);  // T=0:Open the db with ttl = 2
   SetTtl(3);
+  int32_t ttl = 0;
+  ASSERT_OK(db_ttl_->GetTtl(db_ttl_->DefaultColumnFamily(), &ttl));
+  ASSERT_EQ(ttl, 3);
   PutValues(0, kSampleSize_);  // T=0:Insert Set1. Delete at t=2
   SleepCompactCheck(2, 0, kSampleSize_, true);  // T=2:Set1 should be there
   CloseTtl();
diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc
index 2970ce6e5028..3171c0bf71f2 100644
--- a/utilities/write_batch_with_index/write_batch_with_index.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -33,7 +33,7 @@ struct WriteBatchWithIndex::Rep {
         last_sub_batch_offset(0),
         sub_batch_cnt(1),
         overwrite_key(_overwrite_key),
-        track_cf_stat(false) {}
+        op_count(0) {}
   ReadableWriteBatch write_batch;
   WriteBatchEntryComparator comparator;
   Arena arena;
@@ -45,11 +45,12 @@ struct WriteBatchWithIndex::Rep {
   // Total number of sub-batches in the write batch. Default is 1.
   size_t sub_batch_cnt;
 
-  bool overwrite_key;
-  bool track_cf_stat;
+  const bool overwrite_key;
   // Tracks ids of CFs that have updates in this WBWI, number of updates and
-  // number of overwritten single deletions per cf.
-  std::unordered_map<uint32_t, CFStat> cf_id_to_stat;
+  // number of overwritten single deletions per cf. Useful for WBWIMemTable
+  // when this WBWI is ingested into a DB.
+  std::unordered_map<uint32_t, WriteBatchWithIndex::CFStat> cf_id_to_stat;
+  size_t op_count;
 
   // In overwrite mode, find the existing entry for the same key and update it
   // to point to the current entry if this is not a Merge operation.
@@ -126,15 +127,13 @@ bool WriteBatchWithIndex::Rep::UpdateExistingEntryWithCfId(
     last_sub_batch_offset = last_entry_offset;
     sub_batch_cnt++;
   }
-  if (track_cf_stat) {
-    if (most_recent_entry->has_single_del &&
-        !most_recent_entry->has_overwritten_single_del) {
-      cf_id_to_stat[column_family_id].overwritten_sd_count++;
-      most_recent_entry->has_overwritten_single_del = true;
-    }
-    if (type == kSingleDeleteRecord) {
-      most_recent_entry->has_single_del = true;
-    }
+  if (most_recent_entry->has_single_del &&
+      !most_recent_entry->has_overwritten_single_del) {
+    cf_id_to_stat[column_family_id].overwritten_sd_count++;
+    most_recent_entry->has_overwritten_single_del = true;
+  }
+  if (type == kSingleDeleteRecord) {
+    most_recent_entry->has_single_del = true;
   }
   // Some sanity check for using Merge and SD on the same key.
   if (iter.Entry().type == kSingleDeleteRecord) {
@@ -157,6 +156,7 @@ bool WriteBatchWithIndex::Rep::UpdateExistingEntryWithCfId(
 void WriteBatchWithIndex::Rep::AddOrUpdateIndexWithCfId(
     uint32_t cf_id, const Slice& key, WriteType type, size_t last_entry_offset,
     const Comparator* cf_cmp) {
+  op_count++;
   uint32_t update_count = 0;
   if (!UpdateExistingEntryWithCfId(cf_id, key, type, last_entry_offset,
                                    &update_count)) {
@@ -196,17 +196,14 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id,
       key.size(), update_count);
   skip_list.Insert(index_entry);
 
-  if (track_cf_stat) {
-    if (type == kSingleDeleteRecord) {
-      index_entry->has_single_del = true;
-    }
-    cf_id_to_stat[column_family_id].entry_count++;
+  if (type == kSingleDeleteRecord) {
+    index_entry->has_single_del = true;
   }
+  cf_id_to_stat[column_family_id].entry_count++;
 }
 
 void WriteBatchWithIndex::Rep::Clear() {
   write_batch.Clear();
-  cf_id_to_stat.clear();
   ClearIndex();
 }
 
@@ -217,6 +214,8 @@ void WriteBatchWithIndex::Rep::ClearIndex() {
   new (&skip_list) WriteBatchEntrySkipList(comparator, &arena);
   last_sub_batch_offset = 0;
   sub_batch_cnt = 1;
+  cf_id_to_stat.clear();
+  op_count = 0;
 }
 
 Status WriteBatchWithIndex::Rep::ReBuildIndex() {
@@ -366,10 +365,19 @@ Iterator* WriteBatchWithIndex::NewIteratorWithBase(
                                read_options);
 }
 
-Iterator* WriteBatchWithIndex::NewIteratorWithBase(Iterator* base_iterator) {
+Iterator* WriteBatchWithIndex::NewIteratorWithBase(
+    Iterator* base_iterator, const ReadOptions* read_options) {
+  WBWIIteratorImpl* wbwiii;
   // default column family's comparator
-  auto wbwiii = new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch,
-                                     &rep->comparator);
+  if (read_options != nullptr) {
+    wbwiii = new WBWIIteratorImpl(
+        0, &(rep->skip_list), &rep->write_batch, &rep->comparator,
+        read_options->iterate_lower_bound, read_options->iterate_upper_bound);
+  } else {
+    wbwiii = new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch,
+                                  &rep->comparator);
+  }
+
   return new BaseDeltaIterator(nullptr, base_iterator, wbwiii,
                                rep->comparator.default_comparator(),
                                /* read_options */ nullptr);
@@ -1164,17 +1172,12 @@ const Comparator* WriteBatchWithIndexInternal::GetUserComparator(
   return ucmps.GetComparator(cf_id);
 }
 
-void WriteBatchWithIndex::SetTrackPerCFStat(bool track) {
-  // Should be set when the wbwi contains no update.
-  assert(GetWriteBatch()->Count() == 0);
-  rep->track_cf_stat = track;
-}
-
 const std::unordered_map<uint32_t, WriteBatchWithIndex::CFStat>&
 WriteBatchWithIndex::GetCFStats() const {
-  assert(rep->track_cf_stat);
   return rep->cf_id_to_stat;
 }
 
+size_t WriteBatchWithIndex::GetWBWIOpCount() const { return rep->op_count; }
+
 bool WriteBatchWithIndex::GetOverwriteKey() const { return rep->overwrite_key; }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.h b/utilities/write_batch_with_index/write_batch_with_index_internal.h
index 79134217e200..293d5289cb35 100644
--- a/utilities/write_batch_with_index/write_batch_with_index_internal.h
+++ b/utilities/write_batch_with_index/write_batch_with_index_internal.h
@@ -165,9 +165,9 @@ struct WriteBatchIndexEntry {
   uint32_t column_family;  // column family of the entry.
   // The following three fields are only maintained when the WBWI is created
   // with overwrite_key = true.
-  uint32_t update_count;   // The number of updates (1-based) for this key up to
-                           // this entry.
-  bool has_single_del;     // whether single del was issued for this key
+  uint32_t update_count;  // The number of updates (1-based) for this key up to
+                          // this entry.
+  bool has_single_del;    // whether single del was issued for this key
   bool has_overwritten_single_del;  // whether a single del for this key was
                                     // overwritten by another key
   // The following two fields are used when search_key is null.
@@ -406,6 +406,11 @@ class WBWIIteratorImpl final : public WBWIIterator {
   bool out_of_bound_ = false;
 
   bool TestOutOfBound() const {
+    if (!iterate_lower_bound_ && !iterate_upper_bound_) {
+      // The Entry() call below is non-trivial, tests the common and cheaper
+      // no bound case first.
+      return false;
+    }
     const Slice& curKey = Entry().key;
     return AtOrAfterUpperBound(&curKey) || BeforeLowerBound(&curKey);
   }
diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc
index 9e26d734baf7..caa1881e89b2 100644
--- a/utilities/write_batch_with_index/write_batch_with_index_test.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc
@@ -342,6 +342,10 @@ void AssertIterEqual(WBWIIteratorImpl* wbwii,
   }
   ASSERT_FALSE(wbwii->Valid());
 }
+
+void AssertWBWICountEQWBCount(WriteBatchWithIndex& wbwi) {
+  ASSERT_EQ(wbwi.GetWBWIOpCount(), wbwi.GetWriteBatch()->Count());
+}
 }  // namespace
 
 class WBWIBaseTest : public testing::Test {
@@ -356,9 +360,11 @@ class WBWIBaseTest : public testing::Test {
   }
 
   virtual ~WBWIBaseTest() {
+    AssertWBWICountEQWBCount(*batch_);
+
     if (db_ != nullptr) {
       ReleaseSnapshot();
-      delete db_;
+      db_.reset();
       EXPECT_OK(DestroyDB(dbname_, options_));
     }
   }
@@ -429,7 +435,7 @@ class WBWIBaseTest : public testing::Test {
   }
 
  public:
-  DB* db_;
+  std::unique_ptr<DB> db_;
   std::string dbname_;
   Options options_;
   WriteOptions write_opts_;
@@ -715,6 +721,7 @@ TEST_P(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) {
   batch_.reset(new WriteBatchWithIndex(nullptr, 20, GetParam()));
 
   TestValueAsSecondaryIndexHelper(entries_list, batch_.get(), GetParam());
+  AssertWBWICountEQWBCount(*batch_);
 
   // Clear batch and re-run test with new values
   batch_->Clear();
@@ -729,6 +736,7 @@ TEST_P(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) {
   entries_list = std::vector<Entry>(new_entries, new_entries + 8);
 
   TestValueAsSecondaryIndexHelper(entries_list, batch_.get(), GetParam());
+  AssertWBWICountEQWBCount(*batch_);
 }
 
 TEST_P(WriteBatchWithIndexTest, WBWIIteratorImpl) {
@@ -1586,21 +1594,21 @@ TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDB) {
   ASSERT_OK(batch_->Put("a", "batch_->a"));
   ASSERT_OK(batch_->Delete("b"));
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value));
   ASSERT_EQ("batch_->a", value);
 
-  Status s = batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value);
+  Status s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "b", &value);
   ASSERT_TRUE(s.IsNotFound());
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "c", &value));
   ASSERT_EQ("c", value);
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "x", &value);
   ASSERT_TRUE(s.IsNotFound());
 
   ASSERT_OK(db_->Delete(write_opts_, "x"));
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "x", &value);
   ASSERT_TRUE(s.IsNotFound());
 }
 
@@ -1622,24 +1630,24 @@ TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge) {
   ASSERT_OK(batch_->Merge("d", "d1"));
   ASSERT_OK(batch_->Merge("e", "e0"));
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value));
   ASSERT_EQ("a0,a1,a2", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "b", &value));
   ASSERT_EQ("b0,b1,b2", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "c", &value));
   ASSERT_EQ("c0", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "d", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "d", &value));
   ASSERT_EQ("d0,d1", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "e", &value));
   ASSERT_EQ("e0", value);
 
   ASSERT_OK(db_->Delete(write_opts_, "x"));
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "x", &value);
   ASSERT_TRUE(s.IsNotFound());
 
   const Snapshot* snapshot = db_->GetSnapshot();
@@ -1648,42 +1656,44 @@ TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge) {
 
   ASSERT_OK(db_->Delete(write_opts_, "a"));
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value));
   ASSERT_EQ("a1,a2", value);
 
-  ASSERT_OK(
-      s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "a", &value));
+  ASSERT_OK(s = batch_->GetFromBatchAndDB(db_.get(), snapshot_read_options, "a",
+                                          &value));
   ASSERT_EQ("a0,a1,a2", value);
 
   ASSERT_OK(batch_->Delete("a"));
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value);
   ASSERT_TRUE(s.IsNotFound());
 
-  s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "a", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), snapshot_read_options, "a", &value);
   ASSERT_TRUE(s.IsNotFound());
 
   ASSERT_OK(s = db_->Merge(write_opts_, "c", "c1"));
 
-  ASSERT_OK(s = batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+  ASSERT_OK(s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "c", &value));
   ASSERT_EQ("c0,c1", value);
 
-  ASSERT_OK(
-      s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "c", &value));
+  ASSERT_OK(s = batch_->GetFromBatchAndDB(db_.get(), snapshot_read_options, "c",
+                                          &value));
   ASSERT_EQ("c0", value);
 
   ASSERT_OK(db_->Put(write_opts_, "e", "e1"));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "e", &value));
   ASSERT_EQ("e1,e0", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, snapshot_read_options, "e", &value));
+  ASSERT_OK(
+      batch_->GetFromBatchAndDB(db_.get(), snapshot_read_options, "e", &value));
   ASSERT_EQ("e0", value);
 
   ASSERT_OK(s = db_->Delete(write_opts_, "e"));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "e", &value));
   ASSERT_EQ("e0", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, snapshot_read_options, "e", &value));
+  ASSERT_OK(
+      batch_->GetFromBatchAndDB(db_.get(), snapshot_read_options, "e", &value));
   ASSERT_EQ("e0", value);
 
   db_->ReleaseSnapshot(snapshot);
@@ -1695,24 +1705,24 @@ TEST_F(WBWIOverwriteTest, TestGetFromBatchAndDBMerge2) {
 
   std::string value;
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value);
   ASSERT_TRUE(s.IsNotFound());
 
   ASSERT_OK(batch_->Merge("A", "xxx"));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value));
   ASSERT_EQ(value, "xxx");
 
   ASSERT_OK(batch_->Merge("A", "yyy"));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value));
   ASSERT_EQ(value, "xxx,yyy");
 
   ASSERT_OK(db_->Put(write_opts_, "A", "a0"));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value));
   ASSERT_EQ(value, "a0,xxx,yyy");
 
   ASSERT_OK(batch_->Delete("A"));
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value);
   ASSERT_TRUE(s.IsNotFound());
 }
 
@@ -1727,7 +1737,7 @@ TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge3) {
   ASSERT_OK(db_->Flush(flush_options, db_->DefaultColumnFamily()));
   ASSERT_OK(batch_->Merge("A", "2"));
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value));
   ASSERT_EQ(value, "1,2");
 }
 
@@ -1753,23 +1763,23 @@ TEST_P(WriteBatchWithIndexTest, TestPinnedGetFromBatchAndDB) {
       // Do it again with a flushed DB...
       ASSERT_OK(db_->Flush(FlushOptions(), db_->DefaultColumnFamily()));
     }
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value));
     ASSERT_EQ("a0,a1,a2", value.ToString());
 
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "b", &value));
     ASSERT_EQ("b0,b1,b2", value.ToString());
 
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "c", &value));
     ASSERT_EQ("c0", value.ToString());
 
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "d", &value));
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "d", &value));
     ASSERT_EQ("d0,d1", value.ToString());
 
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "e", &value));
     ASSERT_EQ("e0", value.ToString());
     ASSERT_OK(db_->Delete(write_opts_, "x"));
 
-    s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+    s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "x", &value);
     ASSERT_TRUE(s.IsNotFound());
   }
 }
@@ -2587,7 +2597,7 @@ TEST_P(WriteBatchWithIndexTest, MultiGetTest) {
   std::vector<PinnableSlice> values(keys.size());
   std::vector<Status> statuses(keys.size());
 
-  batch_->MultiGetFromBatchAndDB(db_, read_opts_, cf0, key_slices.size(),
+  batch_->MultiGetFromBatchAndDB(db_.get(), read_opts_, cf0, key_slices.size(),
                                  key_slices.data(), values.data(),
                                  statuses.data(), false);
   for (size_t i = 0; i < keys.size(); ++i) {
@@ -2666,7 +2676,7 @@ TEST_P(WriteBatchWithIndexTest, MultiGetTest2) {
       int random = rnd.Uniform(num_keys);
       key_slices.emplace_back(keys[random]);
     }
-    batch_->MultiGetFromBatchAndDB(db_, read_opts_, cf0, keys_per_pass,
+    batch_->MultiGetFromBatchAndDB(db_.get(), read_opts_, cf0, keys_per_pass,
                                    key_slices.data(), values.data(),
                                    statuses.data(), false);
     for (size_t i = 0; i < keys_per_pass; i++) {
@@ -2819,9 +2829,9 @@ TEST_P(WriteBatchWithIndexTest, GetFromBatchAndDBAfterMerge) {
   ASSERT_OK(db_->Put(write_opts_, "o", "aa"));
   ASSERT_OK(batch_->Merge("o", "bb"));  // Merging bb under key "o"
   ASSERT_OK(batch_->Merge("m", "cc"));  // Merging bc under key "m"
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "o", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "o", &value));
   ASSERT_EQ(value, "aa,bb");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "m", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "m", &value));
   ASSERT_EQ(value, "cc");
 }
 
@@ -2835,19 +2845,19 @@ TEST_P(WriteBatchWithIndexTest, GetAfterPut) {
   ASSERT_OK(batch_->Put("key", "aa"));  // Writing aa under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "aa");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "aa");
 
   ASSERT_OK(batch_->Merge("key", "bb"));  // Merging bb under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "aa,bb");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "aa,bb");
 
   ASSERT_OK(batch_->Merge("key", "cc"));  // Merging cc under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "aa,bb,cc");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "aa,bb,cc");
 }
 
@@ -2860,25 +2870,25 @@ TEST_P(WriteBatchWithIndexTest, GetAfterMergePut) {
   ASSERT_OK(batch_->Merge("key", "aa"));  // Merging aa under key
   Status s = batch_->GetFromBatch(cf0, options_, "key", &value);
   ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "orig,aa");
 
   ASSERT_OK(batch_->Merge("key", "bb"));  // Merging bb under key
   s = batch_->GetFromBatch(cf0, options_, "key", &value);
   ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "orig,aa,bb");
 
   ASSERT_OK(batch_->Put("key", "cc"));  // Writing cc under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "cc");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "cc");
 
   ASSERT_OK(batch_->Merge("key", "dd"));  // Merging dd under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "cc,dd");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "cc,dd");
 }
 
@@ -2890,30 +2900,30 @@ TEST_P(WriteBatchWithIndexTest, GetAfterMergeDelete) {
   ASSERT_OK(batch_->Merge("key", "aa"));  // Merging aa under key
   Status s = batch_->GetFromBatch(cf0, options_, "key", &value);
   ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "aa");
 
   ASSERT_OK(batch_->Merge("key", "bb"));  // Merging bb under key
   s = batch_->GetFromBatch(cf0, options_, "key", &value);
   ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "aa,bb");
 
   ASSERT_OK(batch_->Delete("key"));  // Delete key from batch
   s = batch_->GetFromBatch(cf0, options_, "key", &value);
   ASSERT_TRUE(s.IsNotFound());
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value);
   ASSERT_TRUE(s.IsNotFound());
 
   ASSERT_OK(batch_->Merge("key", "cc"));  // Merging cc under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "cc");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "cc");
   ASSERT_OK(batch_->Merge("key", "dd"));  // Merging dd under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "cc,dd");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "cc,dd");
 }
 
@@ -2939,9 +2949,9 @@ TEST_P(WriteBatchWithIndexTest, TestBadMergeOperator) {
   ASSERT_OK(batch_->Put("b", "b0"));
 
   ASSERT_OK(batch_->Merge("a", "a1"));
-  ASSERT_NOK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_NOK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value));
   ASSERT_NOK(batch_->GetFromBatch(column_family, options_, "a", &value));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "b", &value));
   ASSERT_OK(batch_->GetFromBatch(column_family, options_, "b", &value));
 }
 
@@ -2964,7 +2974,7 @@ TEST_P(WriteBatchWithIndexTest, ColumnFamilyWithTimestamp) {
   {
     std::string value;
     ASSERT_TRUE(
-        batch_->GetFromBatchAndDB(db_, ReadOptions(), &cf2, "key", &value)
+        batch_->GetFromBatchAndDB(db_.get(), ReadOptions(), &cf2, "key", &value)
             .IsInvalidArgument());
   }
   {
@@ -2974,7 +2984,7 @@ TEST_P(WriteBatchWithIndexTest, ColumnFamilyWithTimestamp) {
         {PinnableSlice(), PinnableSlice()}};
     std::array<Status, num_keys> statuses{{Status(), Status()}};
     constexpr bool sorted_input = false;
-    batch_->MultiGetFromBatchAndDB(db_, ReadOptions(), &cf2, num_keys,
+    batch_->MultiGetFromBatchAndDB(db_.get(), ReadOptions(), &cf2, num_keys,
                                    keys.data(), pinnable_vals.data(),
                                    statuses.data(), sorted_input);
     for (const auto& s : statuses) {
@@ -3135,13 +3145,15 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchOnly) {
   // GetFromBatchAndDB
   {
     PinnableSlice value;
-    ASSERT_TRUE(batch_->GetFromBatchAndDB(db_, read_opts_, delete_key, &value)
-                    .IsNotFound());
+    ASSERT_TRUE(
+        batch_->GetFromBatchAndDB(db_.get(), read_opts_, delete_key, &value)
+            .IsNotFound());
   }
 
   for (size_t i = 1; i < num_keys; ++i) {
     PinnableSlice value;
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, keys[i], &value));
+    ASSERT_OK(
+        batch_->GetFromBatchAndDB(db_.get(), read_opts_, keys[i], &value));
     ASSERT_EQ(value, expected[i].front().value());
   }
 
@@ -3151,9 +3163,9 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchOnly) {
     std::array<Status, num_keys> statuses;
     constexpr bool sorted_input = false;
 
-    batch_->MultiGetFromBatchAndDB(db_, read_opts_, db_->DefaultColumnFamily(),
-                                   num_keys, keys.data(), values.data(),
-                                   statuses.data(), sorted_input);
+    batch_->MultiGetFromBatchAndDB(
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), num_keys,
+        keys.data(), values.data(), statuses.data(), sorted_input);
 
     ASSERT_TRUE(statuses[0].IsNotFound());
 
@@ -3167,7 +3179,7 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchOnly) {
   {
     PinnableWideColumns columns;
     ASSERT_TRUE(batch_
-                    ->GetEntityFromBatchAndDB(db_, read_opts_,
+                    ->GetEntityFromBatchAndDB(db_.get(), read_opts_,
                                               db_->DefaultColumnFamily(),
                                               delete_key, &columns)
                     .IsNotFound());
@@ -3176,7 +3188,7 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchOnly) {
   for (size_t i = 1; i < num_keys; ++i) {
     PinnableWideColumns columns;
     ASSERT_OK(batch_->GetEntityFromBatchAndDB(
-        db_, read_opts_, db_->DefaultColumnFamily(), keys[i], &columns));
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), keys[i], &columns));
     ASSERT_EQ(columns.columns(), expected[i]);
   }
 
@@ -3187,8 +3199,8 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchOnly) {
     constexpr bool sorted_input = false;
 
     batch_->MultiGetEntityFromBatchAndDB(
-        db_, read_opts_, db_->DefaultColumnFamily(), num_keys, keys.data(),
-        results.data(), statuses.data(), sorted_input);
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), num_keys,
+        keys.data(), results.data(), statuses.data(), sorted_input);
 
     ASSERT_TRUE(statuses[0].IsNotFound());
 
@@ -3285,14 +3297,15 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchAndDB) {
   // GetFromBatchAndDB
   for (size_t i = 0; i < num_keys - 1; ++i) {
     PinnableSlice value;
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, keys[i], &value));
+    ASSERT_OK(
+        batch_->GetFromBatchAndDB(db_.get(), read_opts_, keys[i], &value));
     ASSERT_EQ(value, expected[i].front().value());
   }
 
   {
     PinnableSlice value;
     ASSERT_TRUE(
-        batch_->GetFromBatchAndDB(db_, read_opts_, no_merge_c_key, &value)
+        batch_->GetFromBatchAndDB(db_.get(), read_opts_, no_merge_c_key, &value)
             .IsNotFound());
   }
 
@@ -3302,9 +3315,9 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchAndDB) {
     std::array<Status, num_keys> statuses;
     constexpr bool sorted_input = false;
 
-    batch_->MultiGetFromBatchAndDB(db_, read_opts_, db_->DefaultColumnFamily(),
-                                   num_keys, keys.data(), values.data(),
-                                   statuses.data(), sorted_input);
+    batch_->MultiGetFromBatchAndDB(
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), num_keys,
+        keys.data(), values.data(), statuses.data(), sorted_input);
 
     for (size_t i = 0; i < num_keys - 1; ++i) {
       ASSERT_OK(statuses[i]);
@@ -3318,14 +3331,14 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchAndDB) {
   for (size_t i = 0; i < num_keys - 1; ++i) {
     PinnableWideColumns columns;
     ASSERT_OK(batch_->GetEntityFromBatchAndDB(
-        db_, read_opts_, db_->DefaultColumnFamily(), keys[i], &columns));
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), keys[i], &columns));
     ASSERT_EQ(columns.columns(), expected[i]);
   }
 
   {
     PinnableWideColumns columns;
     ASSERT_TRUE(batch_
-                    ->GetEntityFromBatchAndDB(db_, read_opts_,
+                    ->GetEntityFromBatchAndDB(db_.get(), read_opts_,
                                               db_->DefaultColumnFamily(),
                                               no_merge_c_key, &columns)
                     .IsNotFound());
@@ -3338,8 +3351,8 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchAndDB) {
     constexpr bool sorted_input = false;
 
     batch_->MultiGetEntityFromBatchAndDB(
-        db_, read_opts_, db_->DefaultColumnFamily(), num_keys, keys.data(),
-        results.data(), statuses.data(), sorted_input);
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), num_keys,
+        keys.data(), results.data(), statuses.data(), sorted_input);
 
     for (size_t i = 0; i < num_keys - 1; ++i) {
       ASSERT_OK(statuses[i]);
@@ -3543,15 +3556,15 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
     constexpr ColumnFamilyHandle* column_family = nullptr;
     PinnableWideColumns columns;
     ASSERT_TRUE(batch_
-                    ->GetEntityFromBatchAndDB(db_, ReadOptions(), column_family,
-                                              foo, &columns)
+                    ->GetEntityFromBatchAndDB(db_.get(), ReadOptions(),
+                                              column_family, foo, &columns)
                     .IsInvalidArgument());
   }
 
   {
     constexpr PinnableWideColumns* columns = nullptr;
     ASSERT_TRUE(batch_
-                    ->GetEntityFromBatchAndDB(db_, ReadOptions(),
+                    ->GetEntityFromBatchAndDB(db_.get(), ReadOptions(),
                                               db_->DefaultColumnFamily(), foo,
                                               columns)
                     .IsInvalidArgument());
@@ -3563,7 +3576,7 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
 
     PinnableWideColumns columns;
     ASSERT_TRUE(batch_
-                    ->GetEntityFromBatchAndDB(db_, read_options,
+                    ->GetEntityFromBatchAndDB(db_.get(), read_options,
                                               db_->DefaultColumnFamily(), foo,
                                               &columns)
                     .IsInvalidArgument());
@@ -3591,9 +3604,9 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
     std::array<Status, num_keys> statuses;
     constexpr bool sorted_input = false;
 
-    batch_->MultiGetEntityFromBatchAndDB(db_, ReadOptions(), column_family,
-                                         num_keys, keys.data(), results.data(),
-                                         statuses.data(), sorted_input);
+    batch_->MultiGetEntityFromBatchAndDB(
+        db_.get(), ReadOptions(), column_family, num_keys, keys.data(),
+        results.data(), statuses.data(), sorted_input);
 
     ASSERT_TRUE(statuses[0].IsInvalidArgument());
     ASSERT_TRUE(statuses[1].IsInvalidArgument());
@@ -3606,7 +3619,7 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
     constexpr bool sorted_input = false;
 
     batch_->MultiGetEntityFromBatchAndDB(
-        db_, ReadOptions(), db_->DefaultColumnFamily(), num_keys, keys,
+        db_.get(), ReadOptions(), db_->DefaultColumnFamily(), num_keys, keys,
         results.data(), statuses.data(), sorted_input);
 
     ASSERT_TRUE(statuses[0].IsInvalidArgument());
@@ -3620,8 +3633,8 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
     constexpr bool sorted_input = false;
 
     batch_->MultiGetEntityFromBatchAndDB(
-        db_, ReadOptions(), db_->DefaultColumnFamily(), num_keys, keys.data(),
-        results, statuses.data(), sorted_input);
+        db_.get(), ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+        keys.data(), results, statuses.data(), sorted_input);
 
     ASSERT_TRUE(statuses[0].IsInvalidArgument());
     ASSERT_TRUE(statuses[1].IsInvalidArgument());
@@ -3637,8 +3650,8 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
     constexpr bool sorted_input = false;
 
     batch_->MultiGetEntityFromBatchAndDB(
-        db_, read_options, db_->DefaultColumnFamily(), num_keys, keys.data(),
-        results.data(), statuses.data(), sorted_input);
+        db_.get(), read_options, db_->DefaultColumnFamily(), num_keys,
+        keys.data(), results.data(), statuses.data(), sorted_input);
     ASSERT_TRUE(statuses[0].IsInvalidArgument());
     ASSERT_TRUE(statuses[1].IsInvalidArgument());
   }
@@ -3646,7 +3659,6 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
 
 TEST_P(WriteBatchWithIndexTest, TrackAndClearCFStats) {
   std::string value;
-  batch_->SetTrackPerCFStat(true);
   ASSERT_OK(batch_->Put("A", "val"));
   ASSERT_OK(batch_->SingleDelete("B"));
 
@@ -3735,7 +3747,6 @@ TEST_F(WBWIMemTableTest, ReadFromWBWIMemtable) {
   Random& rnd = *Random::GetTLSInstance();
   auto wbwi = std::make_shared<WriteBatchWithIndex>(
       cmp, 0, /*overwrite_key=*/true, 0, 0);
-  wbwi->SetTrackPerCFStat(true);
   std::vector<std::pair<std::string, std::string>> expected;
   const int kNumUpdate = 10000;
   expected.resize(kNumUpdate);
@@ -3818,6 +3829,7 @@ TEST_F(WBWIMemTableTest, ReadFromWBWIMemtable) {
     // See comment for WBWIMemTable for sequence number assignment method.
     expected_seqno[idx]++;
   }
+  AssertWBWICountEQWBCount(*wbwi);
   // Get a non-existing key
   found_final_value = false;
   ASSERT_EQ("NOT_FOUND", Get("foo", wbwi_mem, visible_seq, &found_final_value));
@@ -3999,7 +4011,6 @@ TEST_F(WBWIMemTableTest, IterEmitSingleDelete) {
 
   auto wbwi = std::make_shared<WriteBatchWithIndex>(
       cmp, 0, /*overwrite_key=*/true, 0, 0);
-  wbwi->SetTrackPerCFStat(true);
 
   ASSERT_OK(wbwi->Put(DBTestBase::Key(0), "val0"));
   ASSERT_OK(wbwi->SingleDelete(DBTestBase::Key(0)));
@@ -4153,7 +4164,6 @@ TEST_F(WBWIMemTableTest, WBWIMemTableWithMerge) {
 
   auto wbwi = std::make_shared<WriteBatchWithIndex>(
       cmp, 0, /*overwrite_key=*/true, 0, 0);
-  wbwi->SetTrackPerCFStat(true);
   std::unique_ptr<WBWIMemTable> wbwi_mem{
       new WBWIMemTable(wbwi, cmp,
                        /*cf_id=*/0, &immutable_opts, &mutable_cf_options,
diff --git a/wiki/fifo_compaction.md b/wiki/fifo_compaction.md
new file mode 100644
index 000000000000..52c25fb920a5
--- /dev/null
+++ b/wiki/fifo_compaction.md
@@ -0,0 +1,672 @@
+# FIFO Compaction Strategy
+
+This document describes the FIFO compaction style in RocksDB, covering the
+file dropping strategies and both the old and new intra-L0 compaction
+picking strategies.
+
+## Overview
+
+FIFO compaction is designed for time-series and log-like workloads where data
+has a natural expiration. All data lives at L0. When total data exceeds a
+configured size limit, the oldest SST files are dropped — no merge, no rewrite,
+just deletion. This gives near-zero write amplification for the compaction layer.
+
+```
+L0 (all data lives here):
+  Newest                                                           Oldest
+    |                                                                |
+    v                                                                v
+  [SST_N] [SST_N-1] ... [SST_3] [SST_2] [SST_1]
+    ^                                       ^
+    |                                       |
+  new flushes added here          oldest files dropped here
+                                  (when over size limit)
+```
+
+Without intra-L0 compaction, every memtable flush creates a new small SST file.
+Over time, the number of L0 files grows, increasing read amplification (each
+point lookup must check every L0 file). Intra-L0 compaction addresses this by
+merging small files into fewer larger files.
+
+## Compaction Picking Priority Chain
+
+When compaction is triggered (score >= 1.0), the picker tries these strategies
+in order, returning the first non-null result:
+
+```
+PickCompaction():
+    |
+    |-- 1. PickTTLCompaction()               [File Dropping]
+    |       Drop files older than TTL.
+    |
+    |-- 2. PickSizeCompaction()              [File Dropping]
+    |       Drop oldest files when over size limit.
+    |
+    |-- 3. PickIntraL0Compaction()           [Intra-L0]
+    |       Dispatcher: merges small L0 files to reduce file count.
+    |       Requires allow_compaction=true. Dispatches to:
+    |         - PickRatioBasedIntraL0Compaction (use_kv_ratio_compaction=true)
+    |         - PickCostBasedIntraL0Compaction  (use_kv_ratio_compaction=false)
+    |
+    |-- 4. PickTemperatureChangeCompaction() [Temperature Migration]
+            Rewrite one file to change its temperature tier.
+            Lowest priority — runs only if nothing else needs to be done.
+```
+
+Steps 1 and 2 are **file dropping** — they delete old files to enforce size
+or TTL limits. Step 3 is **intra-L0 compaction** — it merges small files into
+fewer larger ones. `PickIntraL0Compaction` is the dispatcher that selects
+between the two strategies based on `use_kv_ratio_compaction`.
+
+Step 4 is **temperature migration** — it rewrites a single file to change its
+storage temperature (e.g., moving cold data to cheaper storage). It picks one
+file at a time, checking if the file's age exceeds a configured threshold but
+its current temperature doesn't match the target. It runs last because it's
+the lowest priority: disk space management (dropping) and read amplification
+(intra-L0) are more important than storage tiering. Since FIFO only allows
+one compaction at a time, running temperature change last ensures it never
+blocks more critical operations.
+
+Note: Intra-L0 compaction runs after size-based dropping. If `PickSizeCompaction`
+dropped files (returned non-null), `PickIntraL0Compaction` is skipped. This
+means intra-L0 only runs when the DB is under the size limit or when
+size-based compaction is already in progress.
+
+## Score Computation
+
+The compaction score determines when compaction should be triggered. For FIFO:
+
+```
+score = effective_total_size / effective_max_size
+```
+
+Where:
+- `effective_total_size` = total SST size (or SST + blob when
+  `max_data_files_size > 0`)
+- `effective_max_size` = `max_table_files_size` (or `max_data_files_size`
+  when set)
+
+Additional score contributions:
+- When `allow_compaction` is true (enables intra-L0 compaction):
+  `score = max(score, num_sorted_runs / level0_file_num_compaction_trigger)`
+- When `ttl > 0`: score is boosted by expired file count
+- When temperature thresholds are set: score is boosted if files need
+  temperature change
+
+---
+
+# Part 1: File Dropping Strategies
+
+These strategies delete old files to enforce data size or TTL limits.
+No data is rewritten — files are simply removed.
+
+## TTL-Based Dropping (`PickTTLCompaction`)
+
+**When**: `ttl > 0`
+
+Drops L0 files whose data is older than the TTL threshold. Iterates from
+oldest to newest, checking `newest_key_time` or `creation_time` from table
+properties against `current_time - ttl`.
+
+```
+Before TTL compaction (ttl = 3600s, files older than 1 hour):
+
+  L0: [F6:10m] [F5:20m] [F4:40m] [F3:50m] [F2:70m] [F1:80m]
+                                              ^^^^     ^^^^
+                                            older than TTL --> DROP
+
+After:
+  L0: [F6:10m] [F5:20m] [F4:40m] [F3:50m]
+```
+
+Returns `nullptr` if deleting expired files would still leave the total size
+above the size limit — in that case, size-based dropping handles it instead.
+
+**Config**: `MutableCFOptions::ttl` (in seconds)
+
+## Size-Based Dropping (`PickSizeCompaction`)
+
+**When**: Total size exceeds the configured limit.
+
+### SST-Only Mode (default)
+
+Compares sum of SST file sizes against `max_table_files_size`:
+
+```
+Before (total 1.2GB > max_table_files_size 1GB):
+
+  L0: [F8:200MB] [F7:200MB] [F6:200MB] [F5:200MB] [F4:200MB] [F3:200MB]
+                                                       total = 1.2GB
+
+  Drop oldest files until under limit:
+  Drop F3 (200MB) --> remaining = 1.0GB <= 1GB limit --> STOP
+
+After:
+  L0: [F8:200MB] [F7:200MB] [F6:200MB] [F5:200MB] [F4:200MB]
+                                                       total = 1.0GB
+```
+
+### Blob-Aware Mode (`max_data_files_size > 0`)
+
+When BlobDB is enabled, SST files are small (keys + blob references) and blob
+files hold the actual values. The total disk usage is dominated by blob files,
+so `max_table_files_size` (SST-only) cannot control total disk usage.
+
+`max_data_files_size` accounts for both SST and blob files:
+
+```
+effective_size = total_sst + total_blob
+
+Example: total_sst = 10MB, total_blob = 9.99GB
+  max_table_files_size = 1GB  --> sees 10MB, no dropping (WRONG!)
+  max_data_files_size = 10GB  --> sees 10GB, drops when exceeded (CORRECT)
+```
+
+When dropping files, proportional estimation is used to account for blob
+data freed per SST file:
+
+```
+data_per_file = effective_size / num_files
+```
+
+Blob files are automatically cleaned up when their linked SSTs are deleted
+(via `BlobFileMetaData::GetLinkedSsts()` reference counting).
+
+**Config**:
+- `CompactionOptionsFIFO::max_table_files_size` (default: 1GB)
+- `CompactionOptionsFIFO::max_data_files_size` (default: 0, disabled)
+
+## Temperature Migration (`PickTemperatureChangeCompaction`)
+
+**When**: `file_temperature_age_thresholds` is non-empty
+
+This is NOT file dropping — it **rewrites** a single SST file to assign it a
+new storage temperature (e.g., kWarm, kCold). This allows tiered storage
+systems to move aging data to cheaper/slower media. The file content is
+unchanged; only the temperature metadata is updated.
+
+Picks one file at a time, scanning from oldest to newest. For each file,
+checks if its age exceeds a configured threshold AND its current temperature
+doesn't match the target. Only one file is migrated per compaction to minimize
+impact on other operations. Only works with single-level FIFO
+(`num_levels == 1`).
+
+This runs as the **lowest priority** in the picking chain (step 4) because
+storage tiering is less urgent than disk space management (dropping) or read
+amplification (intra-L0 compaction). Since FIFO allows only one compaction at
+a time, this ensures temperature migration never blocks critical operations.
+
+```
+Config: file_temperature_age_thresholds = [{kWarm, 3600}, {kCold, 86400}]
+
+  [F6:5m,kUnk] [F5:30m,kUnk] [F4:2h,kUnk] [F3:5h,kUnk] [F2:2d,kUnk]
+                                                            ^^^^^^^^
+                                                            age > 86400s
+                                                            --> compact to kCold
+
+After:
+  [F6:5m,kUnk] [F5:30m,kUnk] [F4:2h,kUnk] [F3:5h,kUnk] [F2:2d,kCold]
+```
+
+**Config**: `CompactionOptionsFIFO::file_temperature_age_thresholds`
+
+---
+
+# Part 2: Intra-L0 Compaction
+
+Intra-L0 compaction merges multiple small L0 files into fewer larger files
+to reduce file count and read amplification. Unlike file dropping, this
+rewrites data — but only SST data (blob files are never rewritten).
+
+`allow_compaction = true` is the **master switch** for intra-L0 compaction.
+When enabled, `use_kv_ratio_compaction` selects which picking strategy to use:
+
+```
+  allow_compaction = true          (master switch for intra-L0)
+          |
+          +-- use_kv_ratio_compaction = false   (default)
+          |     Old Strategy: PickCostBasedIntraL0Compaction
+          |     Guard: 1.1 * write_buffer_size
+          |     Works when SST ~= write_buffer_size (non-BlobDB)
+          |
+          +-- use_kv_ratio_compaction = true
+                New Strategy: PickRatioBasedIntraL0Compaction
+                Guard: capacity-derived target from SST/blob ratio
+                Works when SST << write_buffer_size (BlobDB)
+                Requires: max_data_files_size > 0
+```
+
+## Old Strategy: `PickCostBasedIntraL0Compaction`
+
+**When**: `allow_compaction = true` AND `use_kv_ratio_compaction = false`.
+Called from `PickIntraL0Compaction` (which only runs when `PickSizeCompaction`
+returned nullptr, meaning the DB is under the size limit).
+
+This is the original intra-L0 compaction, implemented in
+`PickCostBasedIntraL0Compaction()`. It uses a greedy algorithm to pick files,
+with a `write_buffer_size`-based guard to prevent re-compacting large files.
+
+### Algorithm
+
+```
+1. Start from the newest L0 file (index 0)
+2. Greedily add older files while compact_bytes_per_del_file decreases
+3. Stop when:
+   - A file is being_compacted
+   - compact_bytes_per_del_file starts increasing (diminishing returns)
+   - Total exceeds max_compaction_bytes
+4. Check: enough files (>= trigger) AND per_del < 1.1 * write_buffer_size
+5. Output: always a single file
+```
+
+### Understanding `compact_bytes_per_del_file`
+
+`compact_bytes_per_del_file` measures the **cost per file eliminated**. When
+we compact N files into 1 output, we eliminate (N-1) files but must read and
+rewrite all N files' data. The metric is:
+
+```
+compact_bytes_per_del_file = total_input_bytes / (num_files - 1)
+```
+
+The algorithm greedily adds files as long as this ratio keeps **decreasing**
+(meaning each additional file is "cheap" to include). When adding a file
+causes the ratio to **increase**, we stop — it signals diminishing returns.
+
+```
+Example: scanning files from newest (left) to oldest (right)
+
+  Files:    [F5:32KB] [F4:64KB] [F3:48KB] [F2:96KB] [F1:128KB]
+
+  Step 1: Start with F5 (32KB). compact_bytes = 32KB.
+  Step 2: Add F4.  compact_bytes = 96KB.  per_del = 96/1 = 96KB.
+  Step 3: Add F3.  compact_bytes = 144KB. per_del = 144/2 = 72KB. (72 < 96, improving)
+  Step 4: Add F2.  compact_bytes = 240KB. per_del = 240/3 = 80KB. (80 > 72, WORSE!)
+          --> STOP. Adding F2 makes the ratio increase.
+
+  Result: pick [F5, F4, F3] (3 files), per_del = 72KB.
+```
+
+The ratio increases when a file is significantly larger than the average of
+files already selected. This naturally prevents including already-compacted
+files (which are larger than flush files) — IF the size gap is significant.
+
+### Example (uniform flush files)
+
+```
+Before (4 flush files of 64KB each, trigger=4):
+
+  L0: [F4:64KB] [F3:64KB] [F2:64KB] [F1:64KB]
+       newest                          oldest
+
+  PickCostBasedIntraL0Compaction:
+    Add F4: compact_bytes = 64KB
+    Add F3: compact_bytes = 128KB, per_del = 128/1 = 128KB
+    Add F2: compact_bytes = 192KB, per_del = 192/2 = 96KB  (96 < 128, better)
+    Add F1: compact_bytes = 256KB, per_del = 256/3 = 85KB  (85 < 96, better)
+    No more files. Check: 4 >= trigger(4) and 85KB < 70MB. OK.
+
+After:
+  L0: [C1:256KB]    (single compacted output)
+```
+
+### Example (flush + compacted, ratio detects size gap)
+
+```
+  L0: [F8:64KB] [F7:64KB] [F6:64KB] [F5:64KB] [C1:256KB]
+       newest                                     oldest (compacted)
+
+  PickCostBasedIntraL0Compaction:
+    Add F8: compact_bytes = 64KB
+    Add F7: compact_bytes = 128KB, per_del = 128/1 = 128KB
+    Add F6: compact_bytes = 192KB, per_del = 192/2 = 96KB  (improving)
+    Add F5: compact_bytes = 256KB, per_del = 256/3 = 85KB  (improving)
+    Add C1: compact_bytes = 512KB, per_del = 512/4 = 128KB (128 > 85, WORSE!)
+    --> STOP before C1.
+
+  Result: pick [F8, F7, F6, F5] — compacted file C1 is excluded.
+  This works because C1 (256KB) is 4x larger than flush files (64KB).
+```
+
+### Anti-Re-Compaction Guard
+
+The guard `compact_bytes_per_del_file < 1.1 * write_buffer_size` prevents
+picking files that are already near memtable size. The idea: compacted files
+should be ~write_buffer_size, so they'd push `per_del` above the guard.
+
+```
+Guard works when SST ~= write_buffer_size:
+
+  Files: [64MB, 64MB, 64MB, 64MB]   (SST ~= WBS = 64MB)
+  per_del = 256MB/3 = 85MB > 70MB   --> guard rejects --> no re-compaction
+```
+
+### Known Limitation with BlobDB
+
+With BlobDB, SST files are ~1000x smaller than `write_buffer_size`. The guard
+threshold (e.g., 70MB) is never reached by any L0 file. ALL files pass the
+guard, including previously compacted files:
+
+```
+Guard FAILS when SST << write_buffer_size (BlobDB):
+
+  write_buffer_size = 64MB, SST files ~64KB (1000x smaller)
+  Guard threshold: 1.1 * 64MB = 70.4MB
+
+  10 compacted files of 256KB each:
+    per_del = 2560KB/9 = 284KB << 70.4MB --> guard passes!
+    ALL 10 files re-compacted into 1 file of 2.56MB
+
+  Result: cascading re-compaction creates "monster files"
+
+  Round 1: [64KB, 64KB, 64KB, 64KB] --> compact --> [256KB]
+  Round 2: [64KB, 64KB, 64KB, 256KB] --> compact ALL --> [448KB]
+  Round 3: [64KB, 64KB, 64KB, 448KB] --> compact ALL --> [640KB]
+  ... files grow unboundedly
+```
+
+Use the KV-ratio strategy instead for BlobDB workloads.
+
+### Config
+
+- `CompactionOptionsFIFO::allow_compaction` (default: false)
+- Anti-re-compaction guard: `1.1 * write_buffer_size`
+- Min files: `level0_file_num_compaction_trigger`
+
+## New Strategy: `use_kv_ratio_compaction` (`PickRatioBasedIntraL0Compaction`)
+
+**When**: `allow_compaction = true` AND `use_kv_ratio_compaction = true`
+AND `max_data_files_size > 0`
+
+This strategy replaces the `write_buffer_size`-based guard with a
+**capacity-derived target** and uses **tiered size-based merging** to achieve
+logarithmic write amplification. It observes the actual SST/blob size ratio,
+computes a target graduated file size, and merges files incrementally through
+size tiers rather than directly to target.
+
+### Why a New Strategy?
+
+```
+Without BlobDB:  SST ~= write_buffer_size     --> old guard works
+With BlobDB:     SST ~= write_buffer_size/1000 --> old guard is useless
+```
+
+The new strategy derives the target from the **data capacity** and
+**observed key/value ratio**, not from `write_buffer_size`.
+
+### Algorithm
+
+**Step 1: Target Computation**
+
+The target graduated file size can be determined in two ways:
+
+```
+If max_compaction_bytes > 0 (explicitly set by user):
+  target = max_compaction_bytes      // user override
+
+If max_compaction_bytes == 0 (default, auto-calculate):
+  sst_ratio = total_l0_sst / (total_l0_sst + total_blob)
+  total_sst_at_cap = max_data_files_size * sst_ratio
+  target = total_sst_at_cap / level0_file_num_compaction_trigger
+```
+
+```
+Example (auto-calculated):
+  max_data_files_size = 10GB, sst_ratio = 0.001 (64KB SST / 64MB total)
+  total_sst_at_cap = 10GB * 0.001 = 10MB
+  trigger = 10
+  target = 10MB / 10 = 1MB
+```
+
+The `sst_ratio` is **recomputed on every `PickCompaction` call**. The
+computation is trivial (sum file sizes + arithmetic) and `PickCompaction`
+is only called once per flush or compaction completion, so no caching is
+needed. This also means the ratio naturally adapts when `SetOptions()`
+changes configuration.
+
+**Step 2: Tier Boundaries**
+
+Tier boundaries form a geometric sequence descending from the target,
+using `trigger` as the growth factor:
+
+```
+..., target/trigger^2, target/trigger, target
+```
+
+Example with target=1MB, trigger=10:
+  boundaries = [10KB, 100KB, 1MB]
+
+Boundaries below 10KB are not generated (SST files of most workloads
+are larger than this). If target itself is below 10KB, it is used as
+the sole boundary.
+
+Files >= target are "graduated" and never compacted again. They sit
+in L0 until FIFO drops them.
+
+**Step 3: Tiered File Selection**
+
+For each tier boundary (smallest first), scan L0 from oldest to newest:
+
+```
+For each boundary B (from smallest to largest):
+  1. Skip files >= B (they belong to higher tiers) and being_compacted files
+  2. Collect contiguous files < B
+  3. Stop when accumulated >= B (cap at 2*B to prevent tier-skipping)
+  4. If >= 2 files and accumulated >= B: merge them
+  5. Output (~B bytes) lands at the next tier
+```
+
+Processing boundaries smallest-first ensures bottom-up build: flush outputs
+are merged first, and higher-tier merges happen naturally as lower-tier
+outputs accumulate.
+
+```
+Example (target=1MB, trigger=10, flush~10KB):
+
+  Tier boundaries: [10KB, 100KB, 1MB]
+
+  L0: [1MB_grad] [1MB_grad] [100KB] [100KB] [10KB] [10KB] [F] [F] [F] [F]
+
+  Scan at boundary=10KB:
+    F,F,F,F (all < 10KB) --> accumulated >= 10KB? If yes, merge → ~10KB output
+
+  Scan at boundary=100KB:
+    10KB,10KB,... (all < 100KB) --> accumulated >= 100KB? merge → ~100KB
+
+  Scan at boundary=1MB:
+    100KB,100KB,... (all < 1MB) --> accumulated >= 1MB? merge → graduated!
+```
+
+### Trade-Off: Write Amp vs L0 File Count
+
+The tiered approach trades higher L0 file count for logarithmic write amp:
+
+```
+Write amp per byte:
+  k + 1 = ceil(log(target/flush) / log(trigger)) + 1
+  Each byte is rewritten once per tier crossing.
+
+L0 file count at steady state:
+  trigger + k * (trigger - 1)
+  More than the original trigger target, but bounded logarithmically.
+
+Example (target=1MB, flush=1KB, trigger=10):
+  k = 3 tiers, write amp = 4, file count ≈ 37
+  vs flat merging: write amp ≈ 57
+```
+
+### Anti-Re-Compaction Guard
+
+The guard is implicit in the tier boundaries:
+
+```
+Graduated files (>= target) are skipped at EVERY tier boundary.
+  1MB >= 1MB   --> skipped at 1MB boundary
+  1MB >= 100KB --> skipped at 100KB boundary
+  1MB >= 10KB  --> skipped at 10KB boundary
+
+Intermediate tier files are only merged at HIGHER tier boundaries.
+  A 100KB file (output of tier-0 merge) is:
+    >= 100KB --> skipped at 100KB boundary (won't be re-merged at same tier)
+    < 1MB    --> eligible at 1MB boundary (merges into graduated file)
+```
+
+Compare with the old strategy's guard:
+
+```
+Old: guard = 1.1 * write_buffer_size (breaks when SST << WBS)
+New: graduated files >= target always excluded; intermediate files
+     progress through tiers without cascading re-compaction
+```
+
+### Steady State
+
+```
+Steady state L0 (target=64MB, trigger=4, flush~1MB):
+
+  [64MB_grad, 64MB_grad, 64MB_grad, 64MB_grad,
+   16MB, 16MB, 16MB,
+   4MB, 4MB,
+   1MB, 1MB, 1MB]
+
+  - 4 graduated files at target size (frozen until FIFO drops them)
+  - Intermediate files at tier sizes (accumulating for next merge)
+  - Flush outputs (accumulating for first tier merge)
+
+When FIFO drops the oldest graduated file, it removes exactly
+1/trigger of the total SST data (predictable).
+```
+
+### Write Amplification
+
+```
+With BlobDB (SST ~1KB, blob ~1MB per flush, target=1MB, trigger=10):
+  - k = 3 tiers (1KB → 10KB → 100KB → 1MB)
+  - SST write amp: k+1 = 4x (flush + 3 tier crossings)
+  - Blob write amp: ~1x (never rewritten)
+  - Total write amp: ~1 + 1KB*4/(1KB+1MB) ≈ 1.004x
+
+Without BlobDB (SST ~64MB per flush):
+  - target = large, ratio = 1, k = 1 typically
+  - SST write amp: ~2x
+```
+
+### File Uniformity
+
+At steady state, all graduated files are close to the target size.
+Output is in [boundary, 2*boundary) at each tier. Variable flush sizes
+are handled naturally — the size-based merge rule produces consistent
+output regardless of individual file sizes.
+
+### Config
+
+- `CompactionOptionsFIFO::allow_compaction` (required: true)
+- `CompactionOptionsFIFO::use_kv_ratio_compaction` (default: false)
+- `CompactionOptionsFIFO::max_data_files_size` (required, > 0)
+- `level0_file_num_compaction_trigger` (target max L0 file count)
+- `max_compaction_bytes` (default: 0 = auto-calculate target from capacity;
+  when > 0, overrides auto-calculated target with this value)
+
+## Choosing Between Old and New Intra-L0 Strategies
+
+Both strategies require `allow_compaction = true`. The choice of strategy
+depends on whether BlobDB is used:
+
+```
+Decision tree:
+
+  Want intra-L0 compaction?
+    |
+    +-- NO:  allow_compaction = false (default)
+    |        No file merging, only dropping.
+    |
+    +-- YES: allow_compaction = true
+             |
+             +-- Using BlobDB (SST << write_buffer_size)?
+             |     |
+             |     +-- YES: use_kv_ratio_compaction = true
+             |     |        (also requires max_data_files_size > 0)
+             |     |
+             |     +-- NO:  use_kv_ratio_compaction = false (default)
+             |              Old strategy works fine.
+```
+
+| Criteria | Old (default) | New (`use_kv_ratio_compaction`) |
+|----------|------------------------|-------------------------------|
+| Guard mechanism | `1.1 * write_buffer_size` | capacity-derived target |
+| Works with BlobDB? | No (guard broken) | Yes (designed for it) |
+| File uniformity | Poor with BlobDB | Good (+/-25%) |
+| Re-compaction risk | High with BlobDB | None (tiered boundaries prevent it) |
+| Write amp (BlobDB) | Unpredictable | Logarithmic: (k+1)x SST, ~1x total |
+| Requires | `allow_compaction=true` | `allow_compaction=true` + `use_kv_ratio_compaction=true` + `max_data_files_size>0` |
+
+---
+
+# Configuration Examples
+
+## Basic FIFO (no intra-L0 compaction)
+
+```cpp
+options.compaction_style = kCompactionStyleFIFO;
+options.compaction_options_fifo.max_table_files_size = 1ULL * 1024 * 1024 * 1024;  // 1GB
+```
+
+## FIFO with old intra-L0 (non-BlobDB)
+
+```cpp
+options.compaction_style = kCompactionStyleFIFO;
+options.compaction_options_fifo.max_table_files_size = 1ULL * 1024 * 1024 * 1024;
+options.compaction_options_fifo.allow_compaction = true;
+options.level0_file_num_compaction_trigger = 4;
+```
+
+## FIFO with BlobDB and KV-ratio compaction
+
+```cpp
+options.compaction_style = kCompactionStyleFIFO;
+options.compaction_options_fifo.max_data_files_size = 10ULL * 1024 * 1024 * 1024;  // 10GB
+options.compaction_options_fifo.allow_compaction = true;   // master switch
+options.compaction_options_fifo.use_kv_ratio_compaction = true;  // select new strategy
+options.level0_file_num_compaction_trigger = 10;
+options.enable_blob_files = true;
+options.min_blob_size = 1024;
+```
+
+## FIFO with TTL + BlobDB
+
+```cpp
+options.compaction_style = kCompactionStyleFIFO;
+options.compaction_options_fifo.max_data_files_size = 10ULL * 1024 * 1024 * 1024;
+options.compaction_options_fifo.allow_compaction = true;
+options.compaction_options_fifo.use_kv_ratio_compaction = true;
+options.level0_file_num_compaction_trigger = 10;
+options.ttl = 86400;  // 24 hours
+options.enable_blob_files = true;
+options.min_blob_size = 1024;
+```
+
+---
+
+# Configuration Reference
+
+## CompactionOptionsFIFO
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `max_table_files_size` | uint64_t | 1GB | SST-only size limit for FIFO dropping |
+| `max_data_files_size` | uint64_t | 0 | Combined SST+blob size limit (0=disabled) |
+| `allow_compaction` | bool | false | Master switch for intra-L0 compaction (required for both old and new strategies) |
+| `use_kv_ratio_compaction` | bool | false | Select capacity-derived intra-L0 strategy (requires allow_compaction=true AND max_data_files_size>0) |
+| `age_for_warm` | uint64_t | 0 | DEPRECATED |
+| `file_temperature_age_thresholds` | vector | empty | Age-based temperature migration |
+| `allow_trivial_copy_when_change_temperature` | bool | false | Allow trivial copy for temp change |
+| `trivial_copy_buffer_size` | uint64_t | 4096 | Buffer size for trivial copy |
+
+## Related CF Options
+
+| Option | Relevance to FIFO |
+|--------|-------------------|
+| `level0_file_num_compaction_trigger` | Target max L0 file count for KV-ratio; min files for old intra-L0 |
+| `ttl` | TTL-based file expiration (seconds) |
+| `write_buffer_size` | Guard threshold for old-style intra-L0 (1.1x) |
+| `max_compaction_bytes` | For KV-ratio: 0 = auto-calculate target from capacity; > 0 = use as target directly. For old intra-L0: cap on total input size. Default sanitized to target_file_size_base * 25 (except when use_kv_ratio_compaction=true, where 0 is preserved) |